def testMapHeaders(self):
    """
    Check that map_headers yields every mapped header for each item

    Runs summary.map_headers on the control sample's
    '<control>_gatk.variant_function' file and verifies the number of
    items produced and that each item's data contains all of the
    names given in header_ids.
    """
    # Format only the file name component; formatting the joined path
    # would misinterpret any '{' or '}' present in the directory name.
    fname = path.join(
        summary_testfiles, '{}_gatk.variant_function'.format(control))
    header_ids = {
        0: 'var_type_1',
        1: 'gene',
        7: 'zygosity',
        12: 'rsid_1',
        8: 'GATK_Score',
    }
    variant_idx = [2, 3, 4, 5, 6]

    out = list(summary.map_headers(fname, header_ids, variant_idx))
    self.assertEqual(len(out), 1713)

    header_keys = set(header_ids.values())
    # confirm that all keys in header_ids are contained in each row of
    # the output; report the missing ones on failure
    for pos, data in out:
        missing = header_keys - set(data.keys())
        self.assertFalse(
            missing,
            'missing headers for %r: %r' % (pos, sorted(missing)))
def action(args):
    """
    Aggregate per-variant data from several input files into one
    tab-delimited table written to args.outfile.

    args.infiles must contain exactly one element, the collection of
    input file names. The type of each file is everything after the
    first '.' in its basename; it is looked up in ``file_types`` to
    obtain the header mapping and variant-key columns handed to
    ``map_headers``. Data from all files is merged per variant key and
    written sorted by the 'chr', 'start' and 'stop' values of each row.

    A file whose type is not in ``file_types`` (including a name with
    no '.' at all) is skipped with a warning, or terminates the program
    via sys.exit(1) when args.strict is set.
    """
    (infiles, ) = args.infiles

    headers = ['Position'] + variant_headers[3:5] + [
        'Gene',
        'dbSNP_ID',
        'Variant_Type',
        'Transcripts',
        'Clinically_Flagged',
        'NCI60',
        'Cosmic',
        'Segdup',
        'Polyphen',
        'Sift',
        'Mutation_Taster',
        'Gerp',
        '1000g_ALL',
        'EVS_esp6500_ALL',
        '1000g_AMR',
        'EVS_esp6500_AA',
        '1000g_EUR',
        'EVS_esp6500_EU',
        '1000g_ASN',
        '1000g_AFR']

    # columns that are written as -1 when the value is missing or falsy
    frequency_columns = (
        '1000g_ALL',
        '1000g_AMR',
        '1000g_ASN',
        '1000g_AFR',
        '1000g_EUR',
        'EVS_esp6500_ALL',
        'EVS_esp6500_AA',
        'EVS_esp6500_EU',
    )

    # accumulate data from all input files for each variant
    output = defaultdict(dict)
    for fname in infiles:
        # partition() never raises: a basename without '.' yields an
        # empty file type, which takes the "no match" path below
        # instead of failing with an unpacking ValueError.
        _, _, file_type = path.basename(fname).partition('.')
        try:
            header_ids, var_key_ids = file_types[file_type]
        except KeyError:
            log.warning('no match: %s', fname)
            if args.strict:
                sys.exit(1)
            continue

        for var_key, data in map_headers(fname, header_ids, var_key_ids):
            output[var_key].update(data)

    writer = csv.DictWriter(
        args.outfile,
        fieldnames=headers,
        quoting=csv.QUOTE_MINIMAL,
        extrasaction='ignore',  # rows carry more keys than are written
        delimiter='\t')
    writer.writeheader()

    def sort_key(row):
        # NOTE(review): values are compared as stored; if 'start' and
        # 'stop' are strings this ordering is lexicographic, not
        # numeric -- confirm that is intended.
        return (row['chr'], row['start'], row['stop'])

    # write each row (with all data aggregated), modifying fields as
    # necessary
    for data in sorted(output.values(), key=sort_key):
        # prefer var_type_2 unless it is missing, None or blank
        var_type_2 = data.get('var_type_2') or ''
        if var_type_2.strip():
            data['Variant_Type'] = var_type_2
        else:
            data['Variant_Type'] = data.get('var_type_1')

        data['Gene'], data['Transcripts'] = munge_gene_and_Transcripts(data)
        data['dbSNP_ID'] = data.get('rsid_1') or data.get('rsid_2')

        for column in frequency_columns:
            data[column] = data.get(column) or -1

        writer.writerow(data)