def custom_plasmids(sample_id, paths):
    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_custom_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen_custom_db.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--min-identity", 0.996,
                "--plasmid-db-dir", os.path.join(
                    paths['mash_custom_plasmid_db'],
                    "mash",
                ),
                "--output_file", os.path.join(
                    paths['custom_plasmid_output'],
                    'mash_screen.tsv',
                )
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_results = result_parsers.parse_mash_screen_result(
        os.path.join(
            paths['custom_plasmid_output'],
            'mash_screen.tsv',
        )
    )

    custom_plasmid_db_data = {}
    for dat_file in glob.glob(os.path.join(paths['mash_custom_plasmid_db'], "data", "*.dat")):
        [dat] = parsers.custom_plasmid_db_dat_parser(dat_file)
        custom_plasmid_db_data[dat['accession']] = dat

    for mash_screen_result in mash_screen_results:
        accession = re.sub(r'\.fna$', '', mash_screen_result['query_id'])
        mash_screen_result['accession'] = accession
        mash_screen_result['allele'] = custom_plasmid_db_data[accession]['allele']
        mash_screen_result['circularity'] = custom_plasmid_db_data[accession]['circularity']
        mash_screen_result['plasmid_length'] = custom_plasmid_db_data[accession]['plasmid_length']
        mash_screen_result['incompatibility_group'] = custom_plasmid_db_data[accession]['incompatibility_group']

    mash_screen_results.sort(key=operator.itemgetter('accession'))
    mash_screen_results.sort(key=operator.itemgetter('plasmid_length'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('identity'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('circularity'))
    mash_screen_results.sort(key=operator.itemgetter('incompatibility_group'))

    candidates_keys = [
        'identity',
        'accession',
        'circularity',
        'plasmid_length',
        'allele',
        'incompatibility_group',
    ]
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['custom_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'custom'

    for candidate in candidates:
        candidate_fasta_db_path = os.path.join(
            paths['mash_custom_plasmid_db'],
            candidate['accession'] + ".fna"
        )
        shutil.copyfile(candidate_fasta_db_path, candidate['fasta_path'])
        logger.info(
            "file_copied",
            timestamp=str(now()),
            accession=candidate['accession'],
            sample_id=sample_id,
        )
    return candidates
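# Illustrative sketch (not produced by this code; all values are hypothetical):
# each candidate dict returned by custom_plasmids() carries the candidates_keys
# columns read back from candidates.tsv, plus the 'fasta_path' and 'database'
# keys added above, e.g.:
#
#     {
#         'identity': '0.998',
#         'accession': 'CP012345',
#         'circularity': 'circular',
#         'plasmid_length': '71842',
#         'allele': 'KPC-2',
#         'incompatibility_group': 'IncN',
#         'fasta_path': '<outdir>/<sample_id>/plasmids/custom_plasmids/candidates/CP012345.fna',
#         'database': 'custom',
#     }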
def refseq_plasmids(sample_id, paths):
    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_refseq_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--queries", paths['mash_refseq_plasmid_db'],
                "--min-identity", 0.975,
                "--output_file", os.path.join(
                    paths['refseq_plasmid_output'],
                    'mash_screen.tsv',
                ),
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_result_path = os.path.join(
        paths['refseq_plasmid_output'],
        'mash_screen.tsv',
    )
    mash_screen_results = result_parsers.parse_mash_screen_result(
        mash_screen_result_path
    )
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mash_screen_result_path),
    )

    for result in mash_screen_results:
        result['accession'] = re.search(r'ref\|(.*)\|', result['query_id']).group(1)

    candidates_keys = [
        'identity',
        'accession',
    ]
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['refseq_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'refseq'

    def download_retry(url, candidate):
        """
        NCBI rate-limits refseq downloads to 3 per second from each IP.
        When multiple files are being analyzed simultaneously this limit
        may be exceeded. Retry.
        """
        try:
            urllib.request.urlretrieve(url, candidate['fasta_path'])
            logger.info(
                "file_downloaded",
                timestamp=str(now()),
                url=url,
                accession=candidate['accession'],
                sample_id=sample_id,
            )
        except HTTPError as e:
            if int(e.code) == 429:
                time.sleep(5)
                logger.info(
                    "retried_download",
                    timestamp=str(now()),
                    url=url,
                    accession=candidate['accession'],
                    sample_id=sample_id,
                )
                download_retry(url, candidate)
            else:
                logger.error(
                    "download_failed",
                    timestamp=str(now()),
                    url=url,
                    sample_id=sample_id,
                )

    # NCBI rate-limits downloads to 3 per second.
    for candidate in candidates:
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + \
            "&".join([
                "db=nucleotide",
                "id=" + candidate['accession'],
                "rettype=fasta",
            ])
        download_retry(url, candidate)
        time.sleep(2)
    return candidates
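# Illustrative sketch (hypothetical accession): for a refseq candidate with
# accession "NZ_CP012345.1", the efetch URL assembled above would be:
#
#     https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NZ_CP012345.1&rettype=fasta
#
# and the response is written to
# <outdir>/<sample_id>/plasmids/refseq_plasmids/candidates/NZ_CP012345.1.fna.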
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    analysis_id = uuid.uuid4()
    logger.new(
        analysis_id=str(analysis_id),
        sample_id=args.sample_id,
        pipeline_version=cpo_pipeline.__version__,
    )
    logger.info(
        "analysis_started",
        timestamp=str(now()),
    )

    cpo_pipeline.plasmids.pipeline.main(args)
    cpo_pipeline.assembly.pipeline.main(args)
    cpo_pipeline.typing.pipeline.main(args)
    cpo_pipeline.resistance.pipeline.main(args)

    final_outputs = collect_final_outputs(args.outdir, args.sample_id)
    logger.info(
        "collected_final_outputs",
        final_outputs=final_outputs,
    )

    final_output_path = "/".join([args.outdir, args.sample_id, 'final_output.tsv'])
    final_outputs_headers = [
        'sample_id',
        'bp',
        'est_genome_size',
        'Coverage',
        'MASH_BEST_HIT',
        'MLST_SCHEME',
        'MLST',
        'MLST_ALLELE_1',
        'MLST_ALLELE_2',
        'MLST_ALLELE_3',
        'MLST_ALLELE_4',
        'MLST_ALLELE_5',
        'MLST_ALLELE_6',
        'MLST_ALLELE_7',
    ]

    with open(final_output_path, 'w+') as f:
        writer = csv.DictWriter(f, fieldnames=final_outputs_headers, delimiter='\t')
        writer.writeheader()
        writer.writerow(final_outputs)

    logger.info(
        "analysis_completed",
        timestamp=str(now()),
    )
def collect_final_outputs(outdir, sample_id):
    final_outputs = {}
    final_outputs['sample_id'] = sample_id

    total_bp_path = os.path.join(outdir, sample_id, 'pre-assembly_qc', 'totalbp')
    try:
        total_bp = cpo_pipeline.assembly.parsers.result_parsers.parse_total_bp(
            total_bp_path)
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(total_bp_path),
        )
    except FileNotFoundError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
        )
        total_bp = None

    estimated_genome_coverage_stats_path = os.path.join(
        outdir, sample_id, 'pre-assembly_qc', 'estimated_coverage_stats.tsv')
    try:
        estimated_coverage_stats = cpo_pipeline.assembly.parsers.result_parsers.parse_estimated_coverage_stats(
            estimated_genome_coverage_stats_path)
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(estimated_genome_coverage_stats_path),
        )
    except FileNotFoundError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(estimated_genome_coverage_stats_path),
        )
        estimated_coverage_stats = {
            'sample_id': sample_id,
            'total_bp': '-',
            'estimated_genome_size': '-',
            'estimated_depth_of_coverage': '-',
        }

    reference_genome_assembly_stats_glob = os.path.join(
        outdir, sample_id, 'reference', "*_assembly_stats.txt")
    try:
        [reference_genome_assembly_stats_path] = glob.glob(
            reference_genome_assembly_stats_glob)
    except ValueError:
        logger.error(
            "result_parsing_failed",
            timestamp=str(now()),
            filename=str(reference_genome_assembly_stats_glob),
        )
    try:
        reference_genome_assembly_stats = cpo_pipeline.assembly.parsers.result_parsers.parse_reference_genome_assembly_stats(
            reference_genome_assembly_stats_path)
    except FileNotFoundError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(reference_genome_assembly_stats_path),
        )
        reference_genome_assembly_stats = {
            'organism_name': 'Unknown (parsing failed)',
            'infraspecific_name': 'Unknown (parsing failed)',
            'refseq_assembly_accession': 'Unknown (parsing failed)',
            'taxid': 'Unknown (parsing failed)',
            'total_length': 0,
            'contig_count': 0,
            'contig_N50': 0,
        }

    mlst_result_path = os.path.join(outdir, sample_id, 'typing', 'mlst', 'mlst.tsv')
    try:
        [mlst_result] = cpo_pipeline.typing.parsers.result_parsers.parse_mlst_result(
            mlst_result_path)
    except ValueError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
        )
        mlst_result = {
            'contig_file': os.path.join(outdir, sample_id, 'assembly', 'contigs.fa'),
            'scheme_id': '-',
            'sequence_type': '-',
            'multi_locus_alleles': {
                'adk': '-',
                'fumc': '-',
                'gyrB': '-',
                'icd': '-',
                'mdh': '-',
                'purA': '-',
                'recA': '-'
            }
        }

    final_outputs['bp'] = total_bp
    final_outputs['est_genome_size'] = estimated_coverage_stats['estimated_genome_size']
    final_outputs['Coverage'] = round(
        estimated_coverage_stats['estimated_depth_of_coverage'], 2)
    final_outputs['MASH_BEST_HIT'] = " ".join([
        reference_genome_assembly_stats['organism_name'],
        reference_genome_assembly_stats['infraspecific_name'],
    ])
    final_outputs['MLST_SCHEME'] = mlst_result['scheme_id']
    final_outputs['MLST'] = mlst_result['sequence_type']

    allele_number = 1
    for key, value in mlst_result['multi_locus_alleles'].items():
        final_outputs['MLST_ALLELE_' + str(allele_number)] = key + "(" + value + ")"
        allele_number += 1

    return final_outputs
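# Illustrative sketch (hypothetical allele calls): given
# mlst_result['multi_locus_alleles'] == {'adk': '3', 'fumc': '4', ...},
# collect_final_outputs() emits the alleles positionally as
# MLST_ALLELE_1 = "adk(3)", MLST_ALLELE_2 = "fumc(4)", and so on, matching the
# MLST_ALLELE_1..7 columns written to final_output.tsv above.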
def main(args):
    """
    main entrypoint
    Args:
        args():
    Returns:
        (void)
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    try:
        mash_genome_db = args.mash_genome_db
    except AttributeError:
        try:
            mash_genome_db = config['databases']['mash_genome_db']
            if not os.path.exists(mash_genome_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_genome_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                configuration_value=mash_genome_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    reads1_fastq = args.reads1_fastq
    reads2_fastq = args.reads2_fastq
    output_dir = args.outdir

    prepare_output_directories(output_dir, sample_id)

    # dictionary to store QC PASS/FAIL flags
    qc_verdicts = {
        "multiple_species_contamination": None,
        "fastq_contains_plasmids": None,
        "acceptable_coverage": None,
        "acceptable_fastqc_forward": None,
        "acceptable_fastqc_reverse": None,
        "acceptable_quast_assembly_metrics": None,
        "acceptable_busco_assembly_metrics": None
    }

    qc_thresholds = {
        # genome mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_genome_score_cutoff": 300,
        # plasmid mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_plasmid_score_cutoff": 100,
        # sequencing coverage greater than ($thisvalue) will pass the QC
        "coverage_cutoff": 30,
        # QUAST QC: assembly length within +-($thisvalue) percent
        # in reference to reference length will pass the QC
        "quast_assembly_length_cutoff": 0.10,
        # BUSCO QC: complete single genes greater than ($thisvalue) percent will pass the QC
        "busco_complete_single_cutoff": 0.90,
        # BUSCO QC: complete duplicate genes less than ($thisvalue) percent will pass the QC
        "busco_complete_duplicate_cutoff": 0.10
    }

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        "mash_genome_path": os.path.join(output_dir, sample_id,
                                         "pre-assembly_qc", "mash_dist.genome.tsv"),
        "fastqc_output_path": os.path.join(output_dir, sample_id,
                                           "pre-assembly_qc", "fastqc"),
        "totalbp_path": os.path.join(output_dir, sample_id,
                                     "pre-assembly_qc", "totalbp"),
        "estimated_coverage_stats_path": os.path.join(output_dir, sample_id,
                                                      "pre-assembly_qc",
                                                      "estimated_coverage_stats.tsv"),
        "reference_genome_path": os.path.join(output_dir, sample_id, "reference"),
        "assembly_output": os.path.join(output_dir, sample_id, "assembly"),
        "quast_path": os.path.join(output_dir, sample_id, "post-assembly_qc", "quast"),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    estimated_genome_sizes_path = resource_filename(
        'data', 'estimated_genome_sizes.tsv')
    estimated_genome_sizes = input_parsers.parse_estimated_genome_sizes(
        estimated_genome_sizes_path)

    pre_assembly_qc_jobs = [
        {
            'job_name': "_".join(['mash_dist_sort_head', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'mash_dist_sort_head.sh'),
            'args': [
                "--R1", reads1_fastq,
                "--R2", reads2_fastq,
                "--queries", mash_genome_db,
                "--output_file", paths['mash_genome_path']
            ],
        },
        {
            'job_name': "_".join(['fastqc', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'fastqc.sh'),
            'args': [
                "--R1", reads1_fastq,
                "--R2", reads2_fastq,
                "--output_dir", paths['fastqc_output_path']
            ],
        },
        {
            'job_name': "_".join(['seqtk_totalbp', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'seqtk_totalbp.sh'),
            'args': [
                "--R1", reads1_fastq,
                "--R2", reads2_fastq,
                "--output_file", paths['totalbp_path']
            ],
        },
    ]
    run_jobs(pre_assembly_qc_jobs)

    # parse genome mash results
    mash_dist_results = []
    try:
        mash_dist_results = result_parsers.parse_mash_dist_result(
            paths["mash_genome_path"])
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            closest_match_reference_id=mash_dist_results[0]['reference_id'],
        )
    except Exception as e:
        logger.error(
            "result_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            error_message=str(e),
        )

    # parse fastqc
    fastqc_results = {}
    for read in ["R1", "R2"]:
        try:
            [fastqc_result_summary_path] = glob.glob(
                os.path.join(paths['fastqc_output_path'],
                             "*_" + read + "_*" + "fastqc", 'summary.txt'))
            fastqc_results[read] = result_parsers.parse_fastqc_result(
                fastqc_result_summary_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(fastqc_result_summary_path),
                summary=fastqc_results[read],
            )
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=fastqc_result_summary_path,
            )
            fastqc_results["R1"] = {
                "basic_statistics": "FAILED_TO_PARSE",
                "per_base_sequence_quality": "FAILED_TO_PARSE",
                "per_tile_sequence_quality": "FAILED_TO_PARSE",
                "per_sequence_quality_scores": "FAILED_TO_PARSE",
                "per_base_sequence_content": "FAILED_TO_PARSE",
                "per_sequence_gc_content": "FAILED_TO_PARSE",
                "per_base_n_content": "FAILED_TO_PARSE",
                "sequence_length_distribution": "FAILED_TO_PARSE",
                "sequence_duplication_levels": "FAILED_TO_PARSE",
                "overrepresented_sequences": "FAILED_TO_PARSE",
                "adapter_content": "FAILED_TO_PARSE",
            }
            fastqc_results["R2"] = {
                "basic_statistics": "FAILED_TO_PARSE",
                "per_base_sequence_quality": "FAILED_TO_PARSE",
                "per_tile_sequence_quality": "FAILED_TO_PARSE",
                "per_sequence_quality_scores": "FAILED_TO_PARSE",
                "per_base_sequence_content": "FAILED_TO_PARSE",
                "per_sequence_gc_content": "FAILED_TO_PARSE",
                "per_base_n_content": "FAILED_TO_PARSE",
                "sequence_length_distribution": "FAILED_TO_PARSE",
                "sequence_duplication_levels": "FAILED_TO_PARSE",
                "overrepresented_sequences": "FAILED_TO_PARSE",
                "adapter_content": "FAILED_TO_PARSE",
            }

    # look at fastqc results
    qc_verdicts["acceptable_fastqc_forward"] = qc.fastqc_qc_check(fastqc_results["R1"])
    qc_verdicts["acceptable_fastqc_reverse"] = qc.fastqc_qc_check(fastqc_results["R2"])

    try:
        reference_genome = mash_dist_results[0]['reference_id']
    except Exception as e:
        logger.error(
            "failed_quality_control_check",
            timestamp=str(now()),
            qc_check_failed="determine_reference_sequence",
            error_message=str(e),
        )

    # build the save paths
    try:
        os.makedirs(paths['reference_genome_path'])
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

    download_refseq_reference(reference_genome, paths['reference_genome_path'])

    # If the user passes an expected organism NCBI taxonomy ID, then
    # use that to estimate the genome size. Otherwise, use the downloaded reference.
    estimated_genome_size = DEFAULT_ESTIMATED_GENOME_SIZE
    if args.expected_organism_ncbi_taxid:
        estimated_genome_size = get_estimated_genome_size(
            estimated_genome_sizes, args.expected_organism_ncbi_taxid)
    else:
        try:
            [reference_genome_assembly_stats_path] = glob.glob(
                paths["reference_genome_path"] + "/*_assembly_stats.txt")
        except ValueError:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=str(os.path.abspath(paths["reference_genome_path"])) +
                "/*_assembly_stats.txt",
            )
        try:
            reference_genome_assembly_stats = result_parsers.parse_reference_genome_assembly_stats(
                reference_genome_assembly_stats_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                total_length=reference_genome_assembly_stats['total_length'],
                contig_count=reference_genome_assembly_stats['contig_count'],
                contig_N50=reference_genome_assembly_stats['contig_N50'],
                organism_name=reference_genome_assembly_stats['organism_name'],
                infraspecific_name=reference_genome_assembly_stats['infraspecific_name'],
                ncbi_taxonomy_id=reference_genome_assembly_stats['taxid'],
                refseq_assembly_accession=reference_genome_assembly_stats['refseq_assembly_accession'],
            )
            estimated_genome_size = reference_genome_assembly_stats['total_length']
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                error_message=str(e),
            )

    total_bp = result_parsers.parse_total_bp(paths["totalbp_path"])
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(paths["totalbp_path"]),
        total_bp=total_bp,
    )

    estimated_depth_of_coverage = total_bp / estimated_genome_size
    if estimated_depth_of_coverage >= int(qc_thresholds["coverage_cutoff"]):
        qc_verdicts["acceptable_coverage"] = True

    estimated_coverage_stats_headers = [
        'sample_id',
        'total_bp',
        'estimated_genome_size',
        'estimated_depth_of_coverage',
    ]
    with open(paths['estimated_coverage_stats_path'], 'w+') as f:
        writer = csv.DictWriter(f,
                                fieldnames=estimated_coverage_stats_headers,
                                delimiter='\t')
        writer.writeheader()
        writer.writerow({
            'sample_id': sample_id,
            'total_bp': int(total_bp),
            'estimated_genome_size': int(estimated_genome_size),
            'estimated_depth_of_coverage': round(estimated_depth_of_coverage, 4),
        })

    assembly_jobs = [
        {
            'job_name': "_".join(['shovill', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 16 -l h_vmem=4G',
            'remote_command': os.path.join(job_script_path, 'shovill.sh'),
            'args': [
                "--R1", reads1_fastq,
                "--R2", reads2_fastq,
                "--mincov", "3",
                "--minlen", "500",
                "--output_dir", paths['assembly_output']
            ],
        },
    ]
    run_jobs(assembly_jobs)

    post_assembly_qc_jobs = [
        {
            'job_name': "_".join(['quast', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'quast.sh'),
            'args': [
                "--input", os.path.join(paths['assembly_output'], "contigs.fa"),
                "--outdir", paths['quast_path']
            ]
        },
    ]
    run_jobs(post_assembly_qc_jobs)

    busco_short_summary_contigs_path = os.path.abspath(
        paths["quast_path"] + "/busco_stats/short_summary_contigs.txt")
    busco_results = result_parsers.parse_busco_result(
        busco_short_summary_contigs_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(busco_short_summary_contigs_path),
        busco_results=busco_results,
    )

    quast_report_path = os.path.abspath(paths["quast_path"] + "/report.txt")
    quast_results = result_parsers.parse_quast_result(quast_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(quast_report_path),
        num_contigs=quast_results["num_contigs"],
        N50=quast_results["N50"],
    )

    qc_verdicts["acceptable_busco_assembly_metrics"] = qc.busco_qc_check(
        busco_results, qc_thresholds)
    qc_verdicts["acceptable_quast_assembly_metrics"] = qc.quast_qc_check(
        quast_results, estimated_genome_size)
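# Illustrative worked example (hypothetical numbers): with total_bp = 250_000_000
# and estimated_genome_size = 5_000_000, estimated_depth_of_coverage is 50.0,
# which meets qc_thresholds["coverage_cutoff"] (30), so
# qc_verdicts["acceptable_coverage"] is set to True.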
def download_refseq_reference(reference_id, download_path):
    """
    Given a mash reference ID, download the reference genome sequence and its
    assembly stats from the NCBI FTP servers.
    Will fail if the download_path doesn't exist.
    Args:
        reference_id(str): Mash reference ID (column 1 of mash dist report)
        download_path(str):
    Returns:
        (void)
    """

    def mash_reference_id_to_ncbi_ftp_path(reference_id):
        """
        Args:
            reference_id (str): Mash reference ID (column 1 of mash dist report)
        Returns:
            str: Directory path used to locate the reference genome under
            ftp://ftp.ncbi.nlm.nih.gov/genomes/all/
            For example: "GCF/001/022/155"
        """
        prefix = reference_id.split('_')[0]
        digits = reference_id.split('_')[1].split('.')[0]
        path_list = [prefix] + [digits[i:i + 3] for i in range(0, len(digits), 3)]
        return "/".join(path_list)

    ncbi_ftp_path = mash_reference_id_to_ncbi_ftp_path(reference_id)
    assembly = reference_id[:reference_id.find("_genomic.fna.gz")]

    ncbi_ftp_server_base = "ftp://ftp.ncbi.nlm.nih.gov"
    fasta_url = "/".join([
        ncbi_ftp_server_base, "genomes", "all",
        ncbi_ftp_path, assembly, reference_id
    ])
    assembly_stat_url = "/".join([
        ncbi_ftp_server_base, "genomes", "all",
        ncbi_ftp_path, assembly, assembly + "_assembly_stats.txt"
    ])

    # fetch the files
    try:
        urllib.request.urlretrieve(fasta_url,
                                   "/".join([download_path, reference_id]))
        logger.info(
            "file_downloaded",
            timestamp=str(now()),
            url=fasta_url,
        )
    except Exception as e:
        logger.error(
            "download_failed",
            timestamp=str(now()),
            url=fasta_url,
        )
    try:
        urllib.request.urlretrieve(
            assembly_stat_url,
            "/".join([download_path, assembly + "_assembly_stats.txt"]))
        logger.info(
            "file_downloaded",
            timestamp=str(now()),
            url=assembly_stat_url,
        )
    except Exception as e:
        logger.error(
            "download_failed",
            timestamp=str(now()),
            url=assembly_stat_url,
        )
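# Illustrative worked example (the assembly name "ASM102215v1" is hypothetical):
# for a mash reference_id of "GCF_001022155.1_ASM102215v1_genomic.fna.gz",
# mash_reference_id_to_ncbi_ftp_path() returns "GCF/001/022/155", and the
# FASTA is fetched from:
#
#     ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/022/155/GCF_001022155.1_ASM102215v1/GCF_001022155.1_ASM102215v1_genomic.fna.gz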
args = parser.parse_args()

logging.basicConfig(
    format="%(message)s",
    stream=sys.stdout,
    level=logging.DEBUG,
)

structlog.configure_once(
    processors=[
        structlog.stdlib.add_log_level,
        structlog.processors.JSONRenderer()
    ],
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    context_class=structlog.threadlocal.wrap_dict(dict),
)

logger = structlog.get_logger(
    analysis_id=str(uuid.uuid4()),
    sample_id=args.sample_id,
    pipeline_version=cpo_pipeline.__version__,
)

logger.info(
    "analysis_started",
    timestamp=str(now()),
)

main(args)
def run_jobs(jobs):
    """
    Submit a list of job dicts to the cluster via DRMAA, block until all jobs
    complete, and log resource usage for each completed job.
    """
    with drmaa.Session() as session:
        running_jobs = []
        for job in jobs:
            prepared_job = prepare_job(job, session)
            job_id = session.runJob(prepared_job)
            job_name = prepared_job.jobName
            logger.info(
                "job_submitted",
                timestamp=str(now()),
                job_name=job_name,
                job_id=job_id,
            )
            running_jobs.append({"id": job_id, "name": job_name})
        session.synchronize([x['id'] for x in running_jobs],
                            drmaa.Session.TIMEOUT_WAIT_FOREVER, False)
        for job in running_jobs:
            job_info = session.wait(job["id"], drmaa.Session.TIMEOUT_WAIT_FOREVER)
            resource_usage = job_info.resourceUsage
            float_fields = [
                "io",
                "iow",
                "mem",
                "cpu",
                "vmem",
                "maxvmem",
                "priority",
                "ru_wallclock",
                "ru_utime",
                "ru_stime",
                "ru_maxrss",
                "ru_ixrss",
                "ru_ismrss",
                "ru_idrss",
                "ru_isrss",
                "ru_minflt",
                "ru_majflt",
                "ru_nswap",
                "ru_inblock",
                "ru_oublock",
                "ru_msgsnd",
                "ru_msgrcv",
                "ru_nsignals",
                "ru_nvcsw",
                "ru_nivcsw",
                "acct_cpu",
                "acct_mem",
                "acct_io",
                "acct_iow",
                "acct_maxvmem",
            ]
            for float_field in float_fields:
                resource_usage[float_field] = float(resource_usage[float_field])
            int_fields = ["exit_status"]
            for int_field in int_fields:
                resource_usage[int_field] = int(float(resource_usage[int_field]))
            # Convert unix epoch timestamps to ISO8601 (YYYY-MM-DDTHH:mm:ss+tz)
            time_fields = [
                "submission_time",
                "start_time",
                "end_time",
            ]
            for time_field in time_fields:
                unix_timestamp = resource_usage[time_field]
                iso8601_timestamp = str(
                    datetime.datetime.fromtimestamp(
                        int(float(unix_timestamp)),
                        datetime.timezone.utc).isoformat())
                resource_usage[time_field] = iso8601_timestamp
            logger.info(
                "job_completed",
                timestamp=str(now()),
                job_id=job["id"],
                job_name=job["name"],
                resource_usage=resource_usage,
                exit_status=job_info.exitStatus,
            )
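# Illustrative sketch (paths and values are hypothetical): the job dicts consumed
# by run_jobs() follow the shape used throughout the pipeline; 'native_specification'
# is passed to the scheduler verbatim and 'args' is forwarded to the job script.
#
#     {
#         'job_name': 'fastqc_SAMPLE-01',
#         'output_path': '/path/to/output/SAMPLE-01/logs',
#         'error_path': '/path/to/output/SAMPLE-01/logs',
#         'native_specification': '-pe smp 8',
#         'remote_command': '/path/to/job_scripts/fastqc.sh',
#         'args': ["--R1", 'reads_R1.fastq.gz', "--R2", 'reads_R2.fastq.gz',
#                  "--output_dir", '/path/to/output/SAMPLE-01/pre-assembly_qc/fastqc'],
#     }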
def main(args):
    """
    main entrypoint
    Args:
        args():
    Returns:
        (void)
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly', 'contigs.fa')

    try:
        mlst_scheme_map_file = args.mlst_scheme_map_file
    except AttributeError:
        mlst_scheme_map_file = resource_filename('data', 'scheme_species_map.tab')
    if not mlst_scheme_map_file:
        mlst_scheme_map_file = resource_filename('data', 'scheme_species_map.tab')

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'mlst_path': os.path.join(output_dir, sample_id, 'typing', 'mlst', 'mlst.tsv'),
        'mob_recon_path': os.path.join(output_dir, sample_id, 'typing', 'mob_recon'),
        'abricate_plasmidfinder_path': os.path.join(output_dir, sample_id, 'typing',
                                                    'abricate',
                                                    'abricate_plasmidfinder.tsv'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    typing_jobs = [
        {
            'job_name': "_".join(['mlst', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'mlst.sh'),
            'args': [
                "--input", assembly,
                "--label", sample_id,
                "--output_file", paths['mlst_path']
            ]
        },
        {
            'job_name': "_".join(['abricate', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'abricate.sh'),
            'args': [
                "--input", assembly,
                "--database", "plasmidfinder",
                "--output_file", paths['abricate_plasmidfinder_path']
            ]
        },
        {
            'job_name': "_".join(['mob_recon', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'mob_recon.sh'),
            'args': [
                "--input", assembly,
                "--output_dir", paths['mob_recon_path']
            ]
        },
    ]
    run_jobs(typing_jobs)

    mlst_report = os.path.join(output_dir, sample_id, "typing", "mlst", "mlst.tsv")
    mlst_hits = result_parsers.parse_mlst_result(mlst_report)
    # TODO: Check that there is only one MLST result in the report, and handle
    # cases where the report is malformed.
    [mlst_hit] = mlst_hits
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mlst_report),
        scheme_id=mlst_hit["scheme_id"],
        sequence_type=mlst_hit["sequence_type"],
    )

    mlst_scheme_map = input_parsers.parse_scheme_species_map(mlst_scheme_map_file)
    mlst_species = "Undefined"
    for scheme in mlst_scheme_map:
        if 'species' in scheme and scheme['scheme_id'] == mlst_hit['scheme_id']:
            mlst_species = scheme['species']

    mob_recon_contig_report_path = os.path.join(output_dir, sample_id, "typing",
                                                "mob_recon", "contig_report.txt")
    mob_recon_contig_report = result_parsers.parse_mob_recon_contig_report(
        mob_recon_contig_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_contig_report_path),
        num_records=len(mob_recon_contig_report),
    )

    mob_recon_aggregate_report_path = os.path.join(
        output_dir, sample_id, "typing", "mob_recon", "mobtyper_aggregate_report.txt")
    mob_recon_aggregate_report = result_parsers.parse_mob_recon_mobtyper_aggregate_report(
        mob_recon_aggregate_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_aggregate_report_path),
        num_records=len(mob_recon_aggregate_report),
    )

    def extract_contig_num(contig_id):
        """
        Given a contig_id from a mob_recon contig_report.txt file, return only
        the contig number.
        Args:
            contig_id (str): contig_id field from mob_recon contig_report.txt
            For example: "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname=NODE_54_length_2672_cov_424.949312_pilon_sw=shovill-spades/1.0.1_date=20181024"
        Returns:
            str: contig number.
            For example: "00054"
        """
        prefix = '|contig'
        suffix = '_len='
        prefix_index = contig_id.find(prefix) + len(prefix)
        suffix_index = contig_id.find(suffix)
        contig_num = contig_id[prefix_index:suffix_index]
        return contig_num

    def get_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt
        file, return a list of plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid contigs
            For example: ['00021', '00022', '00032', ...]
        """
        plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in plasmid_contigs and contig_report_record['rep_type']:
                plasmid_contigs.append(contig_num)
        return plasmid_contigs

    def get_likely_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt
        file, return a list of likely plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: likely plasmid contigs
            For example: ['00054', '00039', '00061', ...]
        """
        likely_plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in likely_plasmid_contigs and not contig_report_record['rep_type']:
                likely_plasmid_contigs.append(contig_num)
        return likely_plasmid_contigs

    def get_plasmid_origins(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt
        file, return a list of plasmid origins.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid origins
            For example: ['rep_cluster_1254', 'IncL/M', 'IncN', ...]
        """
        origins = []
        for contig_report_record in mob_recon_contig_report:
            if contig_report_record['rep_type']:
                if contig_report_record['rep_type'] not in origins:
                    origins.append(contig_report_record['rep_type'])
        return origins

    plasmid_contigs = get_plasmid_contigs(mob_recon_contig_report)
    likely_plasmid_contigs = get_likely_plasmid_contigs(mob_recon_contig_report)
    origins = get_plasmid_origins(mob_recon_contig_report)
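# Illustrative worked example (contig_id taken from the docstring above):
#
#     extract_contig_num(
#         "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname="
#         "NODE_54_length_2672_cov_424.949312_pilon_sw=shovill-spades/1.0.1_date=20181024"
#     )
#     # -> "00054"
#
# i.e. the substring between '|contig' and '_len=' is returned as a string.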
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly', 'contigs.fa')

    try:
        card_path = args.card_json
    except AttributeError:
        try:
            card_path = config['databases']['card_json']
            if not os.path.exists(card_path):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        card_path)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                configuration_value=card_path,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                error_message=str(e),
            )

    try:
        abricate_datadir = args.abricate_datadir
    except AttributeError:
        try:
            abricate_datadir = config['databases']['abricate_datadir']
            if not os.path.exists(abricate_datadir):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_datadir)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                configuration_value=abricate_datadir,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                error_message=str(e),
            )

    try:
        abricate_cpo_plasmid_db = args.abricate_cpo_plasmid_db
    except AttributeError:
        try:
            abricate_cpo_plasmid_db = config['databases']['abricate_cpo_plasmid_db']
            if not os.path.exists(os.path.join(abricate_datadir, abricate_cpo_plasmid_db)):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_cpo_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                configuration_value=abricate_cpo_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                error_message=str(e),
            )

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'abricate_path': os.path.join(output_dir, sample_id, 'resistance',
                                      'abricate', 'abricate.tsv'),
        'rgi_path': os.path.join(output_dir, sample_id, 'resistance', 'rgi'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    resistance_jobs = [
        {
            'job_name': "_".join(['abricate', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'abricate.sh'),
            'args': [
                "--input", assembly,
                "--datadir", abricate_datadir,
                "--database", abricate_cpo_plasmid_db,
                "--output_file", paths['abricate_path']
            ]
        },
        {
            'job_name': "_".join(['rgi', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'rgi.sh'),
            'args': [
                "--input", assembly,
                "--card_json", card_path,
                "--output_dir", paths['rgi_path']
            ]
        },
    ]
    run_jobs(resistance_jobs)

    abricate_report_path = os.path.join(output_dir, sample_id, "resistance",
                                        "abricate", "abricate.tsv")
    abricate_report = result_parsers.parse_abricate_result(abricate_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc).isoformat()),
        filename=os.path.abspath(abricate_report_path),
        resistance_genes=[
            {
                key: record[key]
                for key in [
                    "gene",
                    "accession",
                    "database",
                    "percent_coverage",
                    "percent_identity",
                ]
            }
            for record in abricate_report
        ],
    )

    rgi_report_path = os.path.join(output_dir, sample_id, "resistance", "rgi", "rgi.txt")
    rgi_report = result_parsers.parse_rgi_result_txt(rgi_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc).isoformat()),
        filename=os.path.abspath(rgi_report_path),
        resistance_genes=[
            {
                key: record[key]
                for key in [
                    "best_hit_aro",
                    "aro",
                ]
            }
            for record in rgi_report
        ],
    )

    def get_abricate_carbapenemases(abricate_report):
        """
        Given a list of dicts generated by parsing an abricate report file,
        return a list of carbapenemase genes.
        Args:
            abricate_report (list of dict):
        Returns:
            list: carbapenemase genes
            For example: ['NDM-1', '', '', ...]
        """
        abricate_carbapenemases = []
        for abricate_report_record in abricate_report:
            abricate_carbapenemases.append(abricate_report_record['gene'])
        return abricate_carbapenemases

    def get_rgi_carbapenemases(rgi_report):
        """
        Given a list of dicts generated by parsing an rgi report file,
        return a list of carbapenemases.
        Args:
            rgi_report (list of dict):
        Returns:
            list: carbapenemases
            For example: ['', '', '', ...]
        """
        rgi_carbapenemases = []
        for rgi_report_record in rgi_report:
            if re.search("carbapenem", rgi_report_record['drug_class']):
                rgi_carbapenemases.append(rgi_report_record['best_hit_aro'])
        return rgi_carbapenemases
def main(args):
    """
    main entrypoint
    Args:
        args():
    Returns:
        (void)
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    try:
        mash_refseq_plasmid_db = args.mash_refseq_plasmid_db
    except AttributeError:
        try:
            mash_refseq_plasmid_db = config['databases']['mash_refseq_plasmid_db']
            if not os.path.exists(mash_refseq_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_refseq_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                configuration_value=mash_refseq_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                error_message=str(e),
            )

    try:
        mash_custom_plasmid_db = args.mash_custom_plasmid_db
    except AttributeError:
        try:
            mash_custom_plasmid_db = config['databases']['mash_custom_plasmid_db']
            if not os.path.exists(mash_custom_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_custom_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                configuration_value=mash_custom_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    output_dir = args.outdir

    paths = {
        'job_scripts': resource_filename('data', 'job_scripts'),
        'reads1_fastq': args.reads1_fastq,
        'reads2_fastq': args.reads2_fastq,
        'mash_custom_plasmid_db': mash_custom_plasmid_db,
        'mash_refseq_plasmid_db': mash_refseq_plasmid_db,
        'output_dir': output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'plasmid_output': os.path.join(
            output_dir,
            sample_id,
            "plasmids",
        ),
        "refseq_plasmid_output": os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "refseq_plasmids",
        ),
        "custom_plasmid_output": os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "custom_plasmids",
        ),
    }

    os.makedirs(paths['logs'], exist_ok=True)
    os.makedirs(os.path.join(
        paths['custom_plasmid_output'],
        'candidates',
    ), exist_ok=True)
    os.makedirs(os.path.join(
        paths['refseq_plasmid_output'],
        'candidates',
    ), exist_ok=True)

    refseq_candidates = strategies.refseq_plasmids(sample_id, paths)
    custom_candidates = strategies.custom_plasmids(sample_id, paths)
    candidates = refseq_candidates + custom_candidates

    samtools_faidx_jobs = []
    bwa_index_jobs = []
    for candidate in candidates:
        samtools_faidx_job = {
            'job_name': "_".join(['samtools_faidx', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 2',
            'remote_command': os.path.join(paths['job_scripts'], 'samtools_faidx.sh'),
            'args': [
                "--fasta", candidate['fasta_path'],
            ]
        }
        bwa_index_job = {
            'job_name': "_".join(['bwa_index', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 2',
            'remote_command': os.path.join(paths['job_scripts'], 'bwa_index.sh'),
            'args': [
                "--fasta", candidate['fasta_path'],
            ]
        }
        samtools_faidx_jobs.append(samtools_faidx_job)
        bwa_index_jobs.append(bwa_index_job)
    run_jobs(samtools_faidx_jobs + bwa_index_jobs)

    bwa_mem_jobs = []
    for candidate in candidates:
        bwa_mem_job = {
            'job_name': "_".join(['bwa_mem', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'bwa_mem.sh'),
            'args': [
                "--reference", candidate['fasta_path'],
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--output", re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
            ]
        }
        bwa_mem_jobs.append(bwa_mem_job)
    run_jobs(bwa_mem_jobs)

    samtools_filter_fixmate_sort_jobs = []
    for candidate in candidates:
        alignment = re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
        samtools_filter_fixmate_sort_job = {
            'job_name': "_".join([
                'samtools_filter_fixmate_sort', sample_id, candidate['accession']
            ]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(paths['job_scripts'],
                                           'samtools_filter_fixmate_sort.sh'),
            'args': [
                "--input", alignment,
                "--flags", 1540,
                "--output", re.sub(r'\.sam$', '.bam', alignment),
            ]
        }
        samtools_filter_fixmate_sort_jobs.append(samtools_filter_fixmate_sort_job)
    run_jobs(samtools_filter_fixmate_sort_jobs)

    for candidate in candidates:
        sam_alignment = re.sub(r'\.fna$', '.sam', candidate['fasta_path'])
        os.remove(sam_alignment)

    samtools_index_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_index_job = {
            'job_name': "_".join(['samtools_index', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(paths['job_scripts'], 'samtools_index.sh'),
            'args': [
                "--input", alignment,
            ]
        }
        samtools_index_jobs.append(samtools_index_job)
    run_jobs(samtools_index_jobs)

    samtools_depth_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_depth_job = {
            'job_name': "_".join(['samtools_depth', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 1',
            'remote_command': os.path.join(paths['job_scripts'], 'samtools_depth.sh'),
            'args': [
                "--input", alignment,
                "--output", re.sub(r'\.bam$', '.depth', alignment),
            ]
        }
        samtools_depth_jobs.append(samtools_depth_job)
    run_jobs(samtools_depth_jobs)

    MINIMUM_DEPTH = 10
    MINIMUM_COVERAGE_PERCENT = 95.0
    for candidate in candidates:
        depth_path = re.sub(r'\.fna$', '.depth', candidate['fasta_path'])
        positions_above_minimum_depth = 0
        total_length = 0
        with open(depth_path) as depth_file:
            for line in depth_file:
                [_, position, depth] = line.split()
                total_length += 1
                if int(depth) >= MINIMUM_DEPTH:
                    positions_above_minimum_depth += 1
        candidate['bases_above_minimum_depth'] = positions_above_minimum_depth
        try:
            candidate['percent_above_minimum_depth'] = \
                positions_above_minimum_depth / total_length
        except ZeroDivisionError:
            candidate['percent_above_minimum_depth'] = 0.0

    freebayes_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        reference = candidate['fasta_path']
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        freebayes_job = {
            'job_name': "_".join(['freebayes', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(paths['job_scripts'], 'freebayes.sh'),
            'args': [
                "--input", alignment,
                "--reference", reference,
                "--output", vcf,
            ]
        }
        freebayes_jobs.append(freebayes_job)
    run_jobs(freebayes_jobs)

    bcftools_view_jobs = []
    for candidate in candidates:
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        bcftools_view_job = {
            'job_name': "_".join(['bcftools_view', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 2 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'bcftools_view.sh'),
            'args': [
                "--input", vcf,
                "--output", re.sub(r'\.vcf$', '.snps.vcf', vcf),
            ]
        }
        bcftools_view_jobs.append(bcftools_view_job)
    run_jobs(bcftools_view_jobs)

    for candidate in candidates:
        snps_vcf = re.sub(r'\.fna$', '.snps.vcf', candidate['fasta_path'])
        snps = 0
        with open(snps_vcf, 'r') as f:
            for line in f:
                if not line.startswith('#'):
                    snps += 1
        candidate['snps'] = snps

    plasmid_output_summary = os.path.join(paths['plasmid_output'], 'custom_plasmid.txt')
    plasmid_output_final = os.path.join(output_dir, sample_id, 'final_plasmid.tsv')

    custom_candidates = [c for c in candidates if c['database'] == 'custom']
    custom_candidates.sort(key=operator.itemgetter('snps'))
    custom_candidates.sort(key=operator.itemgetter('plasmid_length'), reverse=True)
    custom_candidates.sort(key=operator.itemgetter('percent_above_minimum_depth'),
                           reverse=True)
    custom_best_candidate = next(iter(custom_candidates), None)

    fieldnames = [
        'sample_id',
        'accession',
        'circularity',
        'plasmid_length',
        'bases_above_minimum_depth',
        'percent_above_minimum_depth',
        'snps',
        'allele',
        'incompatibility_group'
    ]

    with open(plasmid_output_final, 'w+') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        if custom_best_candidate:
            # Truncate floats to 4 digits and include the sample_id column
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in custom_best_candidate.items()
            }
            row['sample_id'] = sample_id
            writer.writerow(row)

    with open(plasmid_output_summary, 'w+') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        for candidate in custom_candidates:
            # Truncate floats to 4 digits and include the sample_id column
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in candidate.items()
            }
            row['sample_id'] = sample_id
            writer.writerow(row)