def testMungePFX6(self):
    """munge_pfx parses an HP-suffixed prefix as the hotspot-heme assay."""
    real_info = {
        'sample_id': '6037',
        'pfx': '6037',
        'assay': 'hotspot-heme',
        'mini-pfx': '6037'
    }
    test_info = munge_pfx('6037_HP06')
    # BUG FIX: the comparison was missing, so this test always passed
    # regardless of what munge_pfx returned (cf. testMungePFX1/2).
    self.assertDictEqual(real_info, test_info)
def testMungePFX4(self):
    """munge_pfx parses an MSI-Plus-suffixed prefix as the MSI-PLUS assay."""
    real_info = {
        'sample_id': '6037',
        'pfx': '6037',
        'assay': 'MSI-PLUS',
        'mini-pfx': '6037'
    }
    test_info = munge_pfx('6037_MSI-Plus')
    # BUG FIX: the comparison was missing, so this test always passed
    # regardless of what munge_pfx returned (cf. testMungePFX1/2).
    self.assertDictEqual(real_info, test_info)
def testMungePFX5(self):
    """munge_pfx parses a GLT-suffixed prefix as the hotspot-hereditary assay."""
    real_info = {
        'sample_id': '6037',
        'pfx': '6037',
        'assay': 'hotspot-hereditary',
        'mini-pfx': '6037'
    }
    test_info = munge_pfx('6037_GLT06')
    # BUG FIX: the comparison was missing, so this test always passed
    # regardless of what munge_pfx returned (cf. testMungePFX1/2).
    self.assertDictEqual(real_info, test_info)
def testMungePFX2(self):
    """A bare LMG-style prefix maps to the Coloseq assay with no extra fields."""
    expected = {
        'sample_id': 'LMG-240',
        'pfx': 'LMG-240',
        'assay': 'Coloseq',
        'mini-pfx': 'LMG-240',
    }
    observed = munge_pfx('LMG-240')
    self.assertDictEqual(expected, observed)
def testMungePFX3(self):
    """munge_pfx parses a well-numbered BROv8 prefix as the Coloseq assay."""
    real_info = {
        'sample_id': '6037',
        'pfx': '6037',
        'assay': 'Coloseq',
        'well': '01',
        'mini-pfx': '6037'
    }
    test_info = munge_pfx('6037_01_BROv8')
    # BUG FIX: the comparison was missing, so this test always passed
    # regardless of what munge_pfx returned (cf. testMungePFX1/2).
    self.assertDictEqual(real_info, test_info)
def action(args):
    """Merge per-sample SNP analysis files found under args.path into one
    tab-delimited table on args.outfile: one row per variant
    (Position, Ref_Base, Var_Base), plus shared annotation columns and one
    'Ref|Var' read-count column per sample.
    """
    # specimens: variant tuple -> {sample column -> 'ref|var' read counts}
    specimens = collections.defaultdict(dict)
    # annotation: variant tuple -> the full row it came from (last file wins;
    # presumably annotation columns agree across samples — TODO confirm)
    annotation = {}
    # one output column name per input file, in file order
    prefixes = []
    variant_keys = ['Position', 'Ref_Base', 'Var_Base']
    files = ifilter(filters.any_analysis, walker(args.path))
    files = ifilter(filters.only_analysis, files)
    #sort the files so that the output in the workbook is sorted
    files=sorted(files)
    for pth in files:
        pfx = munge_pfx(pth.fname)
        # column header for this sample's read counts, e.g. '6037_Ref|Var'
        reads_pfx=pfx['mini-pfx']+'_Ref|Var'
        prefixes.append(reads_pfx)
        with open(os.path.join(pth.dir, pth.fname)) as fname:
            print pth.fname
            reader = csv.DictReader(fname, delimiter='\t')
            for row in reader:
                variant = tuple(row[k] for k in variant_keys)
                specimens[variant][reads_pfx] = row['Ref_Reads']+'|'+row['Var_Reads']
                annotation[variant] = row
    annotation_headers = [
        'Gene', 'Variant_Type', 'Transcripts', 'Clinically_Flagged',
        'Cosmic', 'Segdup', 'Polyphen', 'Sift', 'Mutation_Taster', 'Gerp',
        'HiSeq_Freq', 'HiSeq_Count', 'MiSeq_Freq', 'MiSeq_Count',
        '1000g_ALL', 'EVS_esp6500_ALL', '1000g_AMR', 'EVS_esp6500_AA',
        '1000g_EUR', 'EVS_esp6500_EU', '1000g_ASN', '1000g_AFR']
    # extrasaction='ignore' drops any row keys not listed in fieldnames
    writer = csv.DictWriter(args.outfile, fieldnames = variant_keys + annotation_headers + prefixes, extrasaction = 'ignore', delimiter = '\t')
    writer.writeheader()
    for variant in sorted(specimens.keys()):
        d = {k:v for k,v in zip(variant_keys,variant)}
        # .get() leaves the column empty for samples lacking this variant
        d.update({pfx:specimens[variant].get(pfx) for pfx in prefixes})
        d.update(annotation[variant])
        writer.writerow(d)
def testMungePFX1(self):
    """A full OPXv4 run prefix yields all run/control fields for OncoPlex."""
    expected = {
        'control': 'NA12878',
        'machine-run': 'HA0201',
        'library-version': 'OPXv4',
        'well': 'E05',
        'run': '60',
        'sample_id': '6037',
        'pfx': '6037_E05_OPXv4_NA12878_HA0201',
        'assay': 'OncoPlex',
        'mini-pfx': '6037_NA12878',
    }
    observed = munge_pfx('6037_E05_OPXv4_NA12878_HA0201')
    self.assertDictEqual(expected, observed)
def parse_pindel(variant_keys, files, path): specimens = collections.defaultdict(dict) annotation = {} prefixes = [] for pth in files: pfx = munge_pfx(pth.fname) prefixes.append(pfx['mini-pfx']) with open(os.path.join(pth.dir, pth.fname)) as fname: print pth.fname reader = csv.DictReader(fname, delimiter='\t') for row in reader: variant = tuple(row[k] for k in variant_keys) specimens[variant][pfx['mini-pfx']] = row['Reads'] annotation[variant] = row return specimens, annotation, prefixes
def action(args):
    """Concatenate per-sample polyhunter analysis tables into one
    tab-delimited table on args.outfile, rows indexed by the sample
    mini-pfx and ordered by barcode_id from the pipeline manifest;
    columns are natural-sorted and missing values written as '0'.
    """
    # Grab all analysis files from the path
    files = ifilter(filters.any_analysis, walker(args.path))
    # filter() materializes a list so it can be scanned once per sample below
    files = filter(filters.polyhunter_analysis, files)
    # Output row order follows the manifest's barcode_id column
    sort_order = [x['barcode_id'] for x in csv.DictReader(args.pipeline_manifest)]
    frames = []
    for sample in sort_order:
        # Grab the file for each sample, in specified sort order
        matches = [s for s in files if sample in s.fname]
        if matches:
            pfx_file = matches[0]
            pfx = munge_pfx(pfx_file.fname)
            data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname), sep='\t')
            # index every row of this sample's table by its short prefix
            data.index = [pfx['mini-pfx']] * len(data)
            frames.append(data)
    # Concatenate once: repeated DataFrame.append in a loop is quadratic
    # (and append() is deprecated in modern pandas)
    df = pd.concat(frames) if frames else pd.DataFrame()
    cols = natsorted(df.columns)
    df.to_csv(args.outfile, sep='\t', na_rep='0', columns=cols)
def action(args):
    """Merge per-sample CNV analysis files (exon- or gene-level per
    args.type) found under args.path into one tab-delimited table on
    args.outfile: one row per (Position, Gene), a Transcripts annotation
    column, and one '_Log' ratio column per sample.
    """
    # specimens: variant tuple -> {sample column -> adjusted log ratio}
    specimens = collections.defaultdict(dict)
    # annotation: variant tuple -> full source row (last file seen wins)
    annotation = {}
    # one output column name per input file, in file order
    prefixes = []
    # apply a series of filters to files
    files = ifilter(filters.any_analysis, walker(args.path))
    if args.type == "Exon":
        files = ifilter(filters.cnv_exon_analysis, files)
    elif args.type == "Gene":
        files = ifilter(filters.cnv_gene_analysis, files)
    variant_keys = ["Position", "Gene"]
    # sort the files so that the output in the workbook is sorted
    # NOTE(review): unlike the SNP version of this action there is no
    # sorted() call here — the comment above describes an intent the code
    # does not implement; confirm whether files should be sorted.
    for pth in files:
        pfx = munge_pfx(pth.fname)
        # column header for this sample's log ratios, e.g. '6037_Log'
        log_pfx = pfx["mini-pfx"] + "_Log"
        prefixes.append(log_pfx)
        with open(os.path.join(pth.dir, pth.fname)) as fname:
            print pth.fname
            reader = csv.DictReader(fname, delimiter="\t")
            for row in reader:
                variant = tuple(row[k] for k in variant_keys)
                specimens[variant][log_pfx] = row["Ave_Adjusted_Log_Ratio"]
                annotation[variant] = row
    annotation_headers = ["Transcripts"]
    # extrasaction='ignore' drops any row keys not listed in fieldnames
    writer = csv.DictWriter(
        args.outfile,
        fieldnames=variant_keys + annotation_headers + prefixes,
        extrasaction="ignore",
        delimiter="\t"
    )
    writer.writeheader()
    for variant in sorted(specimens.keys()):
        d = {k: v for k, v in zip(variant_keys, variant)}
        # .get() leaves the column empty for samples lacking this variant
        d.update({pfx: specimens[variant].get(pfx) for pfx in prefixes})
        d.update(annotation[variant])
        writer.writerow(d)
def action(args):
    """Build an interactive QC report (stacked on/off-target read bars plus
    a summary table) from per-sample HsMetrics files found under args.path,
    written as a plotly HTML file named by args.outfile.
    """
    # read in the data, adding the name into the df and skipping the 'version'
    filelist = ifilter(filters.hs_file_finder, walker(args.path))
    df_list = []
    pd.set_option('display.width', 100)
    for pfx_file in filelist:
        pfx = munge_pfx(pfx_file.fname)
        log_pfx = pfx['mini-pfx']
        data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname),
                           sep='\t', comment='#',
                           error_bad_lines=False).assign(SAMPLE=log_pfx)
        # only the first metrics row per file is wanted
        df_list.append(data[0:1])
    # concatenate them together
    big_df = pd.concat(df_list, ignore_index=True)
    # now, lets grab just the data we want
    qc_df = big_df[[
        'SAMPLE',
        'MEAN_TARGET_COVERAGE',
        'PCT_USABLE_BASES_ON_TARGET',
        'PF_UNIQUE_READS',
    ]]
    # Setup the values we wish to plot
    qc_df['On Target Reads'] = qc_df['PF_UNIQUE_READS'] * qc_df[
        'PCT_USABLE_BASES_ON_TARGET']
    qc_df['Off Target Reads'] = qc_df['PF_UNIQUE_READS'] - qc_df[
        'On Target Reads']
    qc_df['On Target Reads'] = qc_df['On Target Reads'].astype(int)
    qc_df['Off Target Reads'] = qc_df['Off Target Reads'].astype(int)
    qc_df = qc_df.sort_values(by=['SAMPLE'])
    # Setup the plot
    data1 = go.Bar(
        x=qc_df['SAMPLE'],  # assign x as the dataframe column 'x'
        y=qc_df['Off Target Reads'],
        name='Off Target Reads',
        xaxis='x1',
        yaxis='y1')
    data2 = go.Bar(
        x=qc_df['SAMPLE'],
        y=qc_df['On Target Reads'],
        name='On Target Reads',
        xaxis='x1',
        yaxis='y1')
    layout = {
        'title': 'QC Metrics',
        'xaxis': {
            'type': 'category',  # required so mini sample names are strings instead of numbers
            'domain': [0, 1]
        },
        'yaxis': {
            'hoverformat': ',f',  # print real numbers, not 149.786k
            'domain': [.7, 1]  # bars only take the top portion of the screen
        },
        'barmode': 'stack'
    }
    # setup table
    table = go.Table(
        header=dict(
            values=[
                'Sample ID', 'Mean Target Coverage', 'Total Read Pairs',
                'On Target Reads', 'Off Target Reads'
            ],
            line=dict(color='#7D7F80'),
            fill=dict(color='#a1c3d1'),
            align=['left'] * 5),
        cells=dict(
            values=[
                qc_df['SAMPLE'], qc_df['MEAN_TARGET_COVERAGE'],
                qc_df['PF_UNIQUE_READS'], qc_df['On Target Reads'],
                qc_df['Off Target Reads']
            ],
            line=dict(color='#7D7F80'),
            fill=dict(color=[
                'rgb(245,245,245)',  # color for the first column
                # coverage column: green if >= 500, red otherwise
                [
                    'rgba(0,250,0, 0.8)' if val >= 500 else 'rgba(250,0,0, 0.8)'
                    for val in qc_df['MEAN_TARGET_COVERAGE']
                ]
            ]),
            align=['left'] * 5),
        domain=dict(
            x=[0, 1],
            y=[0, .5]))  # table occupies the lower half, bars the upper
    # Make the plot
    fig = go.Figure(data=[data1, data2, table], layout=layout)
    # BUG FIX: was `auto_open=false` — `false` is undefined in Python
    # (NameError at runtime); the builtin is `False`.
    plotly.offline.plot(fig, filename=args.outfile, auto_open=False)