Example #1
0
 def testMungePFX6(self):
     """A *_HP06 suffix parses as the hotspot-heme assay."""
     real_info = {
         'sample_id': '6037',
         'pfx': '6037',
         'assay': 'hotspot-heme',
         'mini-pfx': '6037'
     }
     test_info = munge_pfx('6037_HP06')
     # The original built the expected dict but never checked it, so the
     # test passed vacuously; assert like the sibling tests do.
     self.assertDictEqual(real_info, test_info)
Example #2
0
 def testMungePFX4(self):
     """A *_MSI-Plus suffix parses as the MSI-PLUS assay."""
     real_info = {
         'sample_id': '6037',
         'pfx': '6037',
         'assay': 'MSI-PLUS',
         'mini-pfx': '6037'
     }
     test_info = munge_pfx('6037_MSI-Plus')
     # The original built the expected dict but never checked it, so the
     # test passed vacuously; assert like the sibling tests do.
     self.assertDictEqual(real_info, test_info)
Example #3
0
 def testMungePFX5(self):
     """A *_GLT06 suffix parses as the hotspot-hereditary assay."""
     real_info = {
         'sample_id': '6037',
         'pfx': '6037',
         'assay': 'hotspot-hereditary',
         'mini-pfx': '6037'
     }
     test_info = munge_pfx('6037_GLT06')
     # The original built the expected dict but never checked it, so the
     # test passed vacuously; assert like the sibling tests do.
     self.assertDictEqual(real_info, test_info)
Example #4
0
 def testMungePFX2(self):
     """An LMG-style prefix with no suffix parses as a Coloseq sample."""
     expected = {
         'sample_id': 'LMG-240',
         'pfx': 'LMG-240',
         'assay': 'Coloseq',
         'mini-pfx': 'LMG-240'
     }
     observed = munge_pfx('LMG-240')
     self.assertDictEqual(expected, observed)
Example #5
0
 def testMungePFX3(self):
     """A prefix with a well component (6037_01_BROv8) keeps the well."""
     real_info = {
         'sample_id': '6037',
         'pfx': '6037',
         'assay': 'Coloseq',
         'well': '01',
         'mini-pfx': '6037'
     }
     test_info = munge_pfx('6037_01_BROv8')
     # The original built the expected dict but never checked it, so the
     # test passed vacuously; assert like the sibling tests do.
     self.assertDictEqual(real_info, test_info)
Example #6
0
def action(args):
    specimens = collections.defaultdict(dict)
    annotation = {}
    prefixes = []
    variant_keys = ['Position', 'Ref_Base', 'Var_Base']
    files = ifilter(filters.any_analysis, walker(args.path))
    files = ifilter(filters.only_analysis, files)

    #sort the files so that the output in the workbook is sorted
    files=sorted(files)
    for pth in files:
        pfx = munge_pfx(pth.fname)
        reads_pfx=pfx['mini-pfx']+'_Ref|Var'
        prefixes.append(reads_pfx)
        with open(os.path.join(pth.dir, pth.fname)) as fname:
            print pth.fname
            reader = csv.DictReader(fname, delimiter='\t')
            for row in reader:
                variant = tuple(row[k] for k in variant_keys)
                specimens[variant][reads_pfx] = row['Ref_Reads']+'|'+row['Var_Reads']
                annotation[variant] = row


    annotation_headers = [
        'Gene',
        'Variant_Type',
        'Transcripts',
        'Clinically_Flagged',
        'Cosmic',
        'Segdup',
        'Polyphen',
        'Sift',
        'Mutation_Taster',
        'Gerp',
        'HiSeq_Freq',
        'HiSeq_Count',
        'MiSeq_Freq',
        'MiSeq_Count',
        '1000g_ALL',
        'EVS_esp6500_ALL',
        '1000g_AMR',
        'EVS_esp6500_AA',
        '1000g_EUR',
        'EVS_esp6500_EU',
        '1000g_ASN',
        '1000g_AFR']

    writer = csv.DictWriter(args.outfile, fieldnames = variant_keys + annotation_headers + prefixes,  extrasaction = 'ignore', delimiter = '\t')
    writer.writeheader()
    for variant in sorted(specimens.keys()):
        d = {k:v for k,v in zip(variant_keys,variant)}
        d.update({pfx:specimens[variant].get(pfx) for pfx in prefixes})
        d.update(annotation[variant])
        writer.writerow(d)
Example #7
0
    def testMungePFX1(self):
        """A fully qualified OncoPlex prefix splits into all of its fields."""
        expected = {
            'control': 'NA12878',
            'machine-run': 'HA0201',
            'library-version': 'OPXv4',
            'well': 'E05',
            'run': '60',
            'sample_id': '6037',
            'pfx': '6037_E05_OPXv4_NA12878_HA0201',
            'assay': 'OncoPlex',
            'mini-pfx': '6037_NA12878'
        }
        observed = munge_pfx('6037_E05_OPXv4_NA12878_HA0201')
        self.assertDictEqual(expected, observed)
Example #8
0
def parse_pindel(variant_keys, files, path):
    specimens = collections.defaultdict(dict)
    annotation = {}
    prefixes = []

    for pth in files:
        pfx = munge_pfx(pth.fname)
        prefixes.append(pfx['mini-pfx'])
        with open(os.path.join(pth.dir, pth.fname)) as fname:
            print pth.fname
            reader = csv.DictReader(fname, delimiter='\t')
            for row in reader:
                variant = tuple(row[k] for k in variant_keys)
                specimens[variant][pfx['mini-pfx']] = row['Reads']
                annotation[variant] = row
    return specimens, annotation, prefixes
Example #9
0
def action(args):
    """Concatenate polyhunter analysis tables into one natsorted TSV.

    Files found under args.path are matched to samples in the order the
    pipeline manifest lists them, each frame is indexed by the sample's
    mini prefix, and the combined table is written to args.outfile.
    """
    # Grab all analysis files from the path
    files = ifilter(filters.any_analysis, walker(args.path))
    # filter() materializes a list here, so it can be re-scanned per sample
    files = filter(filters.polyhunter_analysis, files)
    frames = []
    sort_order = [
        row['barcode_id'] for row in csv.DictReader(args.pipeline_manifest)
    ]
    for sample in sort_order:
        # Grab the file for each sample, in the manifest's specified order
        matches = [s for s in files if sample in s.fname]
        if matches:
            pfx_file = matches[0]
            pfx = munge_pfx(pfx_file.fname)
            data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname),
                               sep='\t')
            # index every row by the shortened sample prefix
            data.index = [pfx['mini-pfx']] * len(data)
            frames.append(data)
    # One concat instead of repeated DataFrame.append (quadratic, and
    # removed in pandas 2.x); no matches still yields an empty frame.
    df = pd.concat(frames) if frames else pd.DataFrame()
    cols = natsorted(df.columns)
    df.to_csv(args.outfile, sep='\t', na_rep='0', columns=cols)
Example #10
0
def action(args):
    specimens = collections.defaultdict(dict)
    annotation = {}
    prefixes = []
    # apply a series of filters to files

    files = ifilter(filters.any_analysis, walker(args.path))
    if args.type == "Exon":
        files = ifilter(filters.cnv_exon_analysis, files)
    elif args.type == "Gene":
        files = ifilter(filters.cnv_gene_analysis, files)
    variant_keys = ["Position", "Gene"]
    # sort the files so that the output in the workbook is sorted
    for pth in files:
        pfx = munge_pfx(pth.fname)
        log_pfx = pfx["mini-pfx"] + "_Log"
        prefixes.append(log_pfx)
        with open(os.path.join(pth.dir, pth.fname)) as fname:
            print pth.fname
            reader = csv.DictReader(fname, delimiter="\t")
            for row in reader:
                variant = tuple(row[k] for k in variant_keys)
                specimens[variant][log_pfx] = row["Ave_Adjusted_Log_Ratio"]
                annotation[variant] = row

    annotation_headers = ["Transcripts"]

    writer = csv.DictWriter(
        args.outfile, fieldnames=variant_keys + annotation_headers + prefixes, extrasaction="ignore", delimiter="\t"
    )
    writer.writeheader()
    for variant in sorted(specimens.keys()):
        d = {k: v for k, v in zip(variant_keys, variant)}
        d.update({pfx: specimens[variant].get(pfx) for pfx in prefixes})
        d.update(annotation[variant])
        writer.writerow(d)
Example #11
0
def action(args):
    """Render hybrid-selection QC metrics as a stacked bar chart plus table.

    Reads one metrics file per sample (first data row only; the rest is
    presumably histogram data — confirm against the pipeline output), derives
    on/off-target read counts, and writes an offline plotly HTML report to
    args.outfile.
    """
    # read in the data, adding the sample name into the df and skipping the 'version'
    filelist = ifilter(filters.hs_file_finder, walker(args.path))
    df_list = []
    pd.set_option('display.width', 100)
    for pfx_file in filelist:
        pfx = munge_pfx(pfx_file.fname)
        log_pfx = pfx['mini-pfx']
        data = pd.read_csv(os.path.join(pfx_file.dir, pfx_file.fname),
                           sep='\t',
                           comment='#',
                           error_bad_lines=False).assign(SAMPLE=log_pfx)
        # keep only the first data row from each file
        df_list.append(data[0:1])

    # concatenate them together
    big_df = pd.concat(df_list, ignore_index=True)

    # now grab just the columns we want; .copy() makes an independent frame
    # so the derived-column assignments below are not writes into a view
    # (avoids pandas SettingWithCopy problems)
    qc_df = big_df[[
        'SAMPLE',
        'MEAN_TARGET_COVERAGE',
        'PCT_USABLE_BASES_ON_TARGET',
        'PF_UNIQUE_READS',
    ]].copy()

    # Set up the values we wish to plot
    qc_df['On Target Reads'] = qc_df['PF_UNIQUE_READS'] * qc_df[
        'PCT_USABLE_BASES_ON_TARGET']
    qc_df['Off Target Reads'] = qc_df['PF_UNIQUE_READS'] - qc_df[
        'On Target Reads']
    qc_df['On Target Reads'] = qc_df['On Target Reads'].astype(int)
    qc_df['Off Target Reads'] = qc_df['Off Target Reads'].astype(int)

    qc_df = qc_df.sort_values(by=['SAMPLE'])

    # Set up the stacked bar traces
    data1 = go.Bar(
        x=qc_df['SAMPLE'],  # assign x as the dataframe column 'x'
        y=qc_df['Off Target Reads'],
        name='Off Target Reads',
        xaxis='x1',
        yaxis='y1')
    data2 = go.Bar(x=qc_df['SAMPLE'],
                   y=qc_df['On Target Reads'],
                   name='On Target Reads',
                   xaxis='x1',
                   yaxis='y1')
    layout = {
        'title': 'QC Metrics',
        'xaxis': {
            'type':
            'category',  #required so mini sample names are strings instead of numbers
            'domain': [0, 1]
        },
        'yaxis': {
            'hoverformat': ',f',  #print real numbers, not 149.786k
            'domain': [.7, 1]  #only take the top portion of the screen
        },
        'barmode': 'stack'
    }

    # set up the table below the chart
    table = go.Table(
        header=dict(values=[
            'Sample ID', 'Mean Target Coverage', 'Total Read Pairs',
            'On Target Reads', 'Off Target Reads'
        ],
                    line=dict(color='#7D7F80'),
                    fill=dict(color='#a1c3d1'),
                    align=['left'] * 5),
        cells=dict(
            values=[
                qc_df['SAMPLE'], qc_df['MEAN_TARGET_COVERAGE'],
                qc_df['PF_UNIQUE_READS'], qc_df['On Target Reads'],
                qc_df['Off Target Reads']
            ],
            line=dict(color='#7D7F80'),
            fill=dict(color=[
                'rgb(245,245,245)',  #first column; coverage cells go red below 500
                [
                    'rgba(0,250,0, 0.8)'
                    if val >= 500 else 'rgba(250,0,0, 0.8)'
                    for val in qc_df['MEAN_TARGET_COVERAGE']
                ]
            ]),
            align=['left'] * 5),
        domain=dict(
            x=[0, 1],
            y=[0, .5]))  #table occupies the bottom half of the page

    # Make the plot
    fig = go.Figure(data=[data1, data2, table], layout=layout)

    # BUG FIX: the original passed `auto_open=false`, which is a NameError
    # at runtime in Python; the constant is `False`.
    plotly.offline.plot(fig, filename=args.outfile, auto_open=False)