示例#1
0
 accession = lib['bioSple']
 print(accession)
 total_reads = 0
 for batch_lib in batch_obj['batch']['readsDist']['DNA']:
     if batch_lib['bioSple'] == accession:
         total_reads = batch_lib['postQualityReads']
         print(total_reads)
 if total_reads == 0:
     continue
 summary_paths = lib['diagnosticOutput']
 composition_path = glob.glob(
     os.path.join(results_dir, 'tax',
                  seq_sple + '*sample_composition.out'))
 if composition_path:
     sample_comp = bin_reads(composition_path[0],
                             ncbi_class=ncbi,
                             ctrl_taxids=[10760],
                             quantification='relative')
     # for key in sample_comp:
     #     sample_comp[key] = sample_comp[key] * 10e6 / total_reads
 else:
     sample_comp = {
         "Human": None,
         "Bacteria": None,
         "Virus": None,
         "Parasite": None,
         "Fungus": None,
         "Unclassified": None
     }
 viral_summary = [
     path for path in lib['diagnosticOutput']
     if 'dna.viral.dxsm.out.summary.gz' in path
示例#2
0
# Taxonomy IDs of the two control organisms this code calls T7 and PR772.
_T7_TAXID = 10760
_PR772_TAXID = 261665
# reporting_id values identifying the same two controls in the viral summary.
_T7_REPORTING_ID = '26706_10760'
_PR772_REPORTING_ID = '26648_261665'
# Factor applied to every read fraction.
# NOTE(review): 10e6 evaluates to 1e7, not 1e6 -- kept as-is so existing
# outputs stay comparable; confirm whether "per million" was intended.
_NORM_SCALE = 10e6
# Organism classes copied from the bin_reads() result into every row.
_ORG_CLASSES = ("Human", "Bacteria", "Virus", "Parasite", "Fungus",
                "Unclassified")
_VIRAL_SUMMARY_SUFFIX = 'dna.viral.dxsm.out.summary.gz'
_RUN_DATE_COL = 'Run Date (YYYY-MM-DD)'


def _normalize_reads(read_count, total_reads):
    """Return ``_NORM_SCALE * read_count / total_reads``, or NaN if total is 0.

    A zero (or missing) denominator yields NaN instead of raising
    ZeroDivisionError so that one empty library does not abort the whole run.
    """
    if not total_reads:
        return float('nan')
    return _NORM_SCALE * read_count / total_reads


def _read_control_counts(summary_path):
    """Return ``(t7_reads, pr772_reads)`` from a gzipped JSON-lines summary.

    Each non-blank line is parsed as a JSON object; the ``read_count`` of the
    records whose ``reporting_id`` matches a control is returned (0 if a
    control is absent, last record wins if it appears more than once).
    """
    t7_reads = 0
    pr772_reads = 0
    with gzip.open(summary_path) as summary_file:
        for raw_line in summary_file:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines rather than failing in json.loads.
                continue
            record = json.loads(raw_line)
            if record['reporting_id'] == _T7_REPORTING_ID:
                t7_reads = record['read_count']
            elif record['reporting_id'] == _PR772_REPORTING_ID:
                pr772_reads = record['read_count']
    return t7_reads, pr772_reads


def _taxid_read_count(taxid_counts, taxid):
    """Return the count of the first ``(taxid, count)`` entry, or 0 if absent."""
    for entry in taxid_counts or ():
        if entry[0] == taxid:
            return entry[1]
    return 0


def get_lib_dict_ls(results_dir=None, fqo=None):
    """Build one summary dict per library found in the batch JSON files.

    Every file under ``<results_dir>/batch/`` is loaded as JSON. For each
    entry of its ``'libraries'`` list whose lower-cased ``seqSple`` appears in
    the ``'Seq Sple'`` column of the ``fqo`` CSV, a row is produced holding:

    * identifiers (accession, seq sample, batch id, run date, sample name,
      library type);
    * control read counts taken from the library's viral summary
      (``*dna.viral.dxsm.out.summary.gz`` listed in ``diagnosticOutput``),
      raw and normalised by ``qualityFilterInfo.readsOut``;
    * the same quantities taken from the first
      ``<results_dir>/tax/<seqSple>*dna.sample_composition.out`` file, via
      ``parse_composition_file`` and ``bin_reads`` (defined elsewhere). When
      no such file exists, counts are 0, the total is the 1e-6 placeholder
      and ``'Composition File'`` is None.

    Normalised values are NaN when the corresponding total read count is 0.
    Log10 columns are ``-inf`` when both control counts are 0.

    NOTE(review): the ``'T4 ... (Composition File)'`` keys carry the PR772
    values; the key names are kept unchanged for backward compatibility.

    Args:
        results_dir: directory containing the ``batch`` and ``tax``
            sub-directories; ``diagnosticOutput`` paths are joined onto it.
            Required despite the ``None`` default.
        fqo: path (or anything ``pandas.read_csv`` accepts) of a CSV with
            ``'Seq Sple'`` and ``'Run Date (YYYY-MM-DD)'`` columns. Required
            despite the ``None`` default. ``'Seq Sple'`` values are compared
            against lower-cased names -- presumably stored lower-case.

    Returns:
        list[dict]: one dict per matching library, in batch/library order.
    """
    fqo_hi_ic = pd.read_csv(fqo)
    fqo_hi_ic[_RUN_DATE_COL] = pd.to_datetime(fqo_hi_ic[_RUN_DATE_COL])
    control_taxids = [_T7_TAXID, _PR772_TAXID]

    lib_dict_ls = []
    for batch_path in glob.glob(os.path.join(results_dir, 'batch', '*')):
        with open(batch_path) as batch_file:
            batch = json.load(batch_file)
        batch_id = batch['batch']['libBatchId']

        for lib in batch['libraries']:
            seq_sple = lib['seqSple']
            # Only libraries listed in the FQO sheet are reported.
            in_fqo = fqo_hi_ic['Seq Sple'] == seq_sple.lower()
            if not in_fqo.any():
                continue
            run_date = fqo_hi_ic.loc[in_fqo, _RUN_DATE_COL].values[0]
            total_reads = lib['qualityFilterInfo']['readsOut']

            # --- control reads according to the viral summary file ---
            vir_paths = [
                path for path in lib['diagnosticOutput']
                if _VIRAL_SUMMARY_SUFFIX in path
            ]
            if vir_paths:
                t7_read_cnt, pr772_read_cnt = _read_control_counts(
                    os.path.join(results_dir, vir_paths[0]))
            else:
                t7_read_cnt, pr772_read_cnt = 0, 0
            t7_norm = _normalize_reads(t7_read_cnt, total_reads)
            pr772_norm = _normalize_reads(pr772_read_cnt, total_reads)

            # --- control reads according to the composition file ---
            composition_path_ls = glob.glob(
                os.path.join(results_dir, 'tax',
                             seq_sple + '*dna.sample_composition.out'))
            if composition_path_ls:
                composition_path = composition_path_ls[0]
                composition_file = os.path.basename(composition_path)
                composition_data = parse_composition_file(
                    composition_path, control_taxids)
                org_composition = bin_reads(composition_path,
                                            ncbi_class=ncbi,
                                            quantification='relative',
                                            ctrl_taxids=control_taxids)
            else:
                composition_file = None
                # Tiny non-zero total keeps the normalisation below defined.
                composition_data = {
                    'Total Read Count': 1e-6,
                    'Taxid Read Count': [(_T7_TAXID, 0), (_PR772_TAXID, 0)]
                }
                org_composition = dict.fromkeys(_ORG_CLASSES, 0)

            comp_total = composition_data['Total Read Count']
            # A control missing from the parsed counts is reported as 0
            # instead of raising IndexError.
            taxid_counts = composition_data['Taxid Read Count']
            t7_raw_comp = _taxid_read_count(taxid_counts, _T7_TAXID)
            pr772_raw_comp = _taxid_read_count(taxid_counts, _PR772_TAXID)
            t7_norm_comp = _normalize_reads(t7_raw_comp, comp_total)
            pr772_norm_comp = _normalize_reads(pr772_raw_comp, comp_total)

            summary_nr = t7_norm + pr772_norm
            comp_nr = t7_norm_comp + pr772_norm_comp
            log_summary_nr = np.log10(summary_nr)
            log_comp_nr = np.log10(comp_nr)

            lib_dict = {
                'Accession': lib['bioSple'],
                'Seq Sple': seq_sple,
                'Composition File': composition_file,
                'Batch ID': batch_id,
                'Run Date': run_date,
                'Sample Name': lib['spleName'],
                'Library Type': lib['libType'],
                'Total Reads': total_reads,
                'T7 Raw Reads': t7_read_cnt,
                'PR772 Raw Reads': pr772_read_cnt,
                'T7 Normalized Reads': t7_norm,
                'PR772 Normalized Reads': pr772_norm,
                'T7 + PR772 NR': summary_nr,
                'Log10 T7 + PR772 NR': log_summary_nr,
                'Total Reads (Composition File)': comp_total,
                'T7 Raw Reads (Composition File)': t7_raw_comp,
                # Legacy "T4" labels; the values are the PR772 counts.
                'T4 Raw Reads (Composition File)': pr772_raw_comp,
                'T7 Normalized (Composition File)': t7_norm_comp,
                'T4 Normalized (Composition File)': pr772_norm_comp,
                'T7 + PR772 NR (Composition File)': comp_nr,
                'Log10 T7 + PR772 NR (Composition File)': log_comp_nr,
                'Summary - Composition (log10)':
                log_summary_nr - log_comp_nr,
            }
            for org_class in _ORG_CLASSES:
                lib_dict[org_class] = org_composition[org_class]
            lib_dict_ls.append(lib_dict)
    return lib_dict_ls
示例#3
0
batch_file_paths = glob.glob(os.path.join(results_dir, 'batch', '*'))

results = {}
for batch_file in batch_file_paths:
    with open(batch_file) as input:
        batch_obj = json.load(input)
    for lib in batch_obj['libraries']:
        seq_sple = lib['seqSple']
        accession = lib['bioSple']
        for batch_lib in batch_obj['batch']['readsDist']['DNA']:
            if batch_lib['bioSple'] == accession:
                total_reads = batch_lib['postQualityReads']
        summary_paths = lib['diagnosticOutput']
        composition_path = glob.glob(os.path.join(results_dir, 'tax', seq_sple + '*sample_composition.out'))
        if composition_path:
            sample_comp = bin_reads(composition_path[0], ncbi_class=ncbi, quantification='absolute', ctrl_taxids=[10760])
            for key in sample_comp:
                sample_comp[key] = sample_comp[key] * 10e6 / total_reads
        else:
            sample_comp = {
            "Human": None,
            "Bacteria": None,
            "Virus": None,
            "Parasite": None,
            "Fungus": None,
            "Unclassified": None
        }
        viral_summary = [path for path in lib['diagnosticOutput'] if 'dna.viral.dxsm.out.summary.gz' in path]
        if viral_summary:
            viral_summary_path = os.path.join(results_dir, viral_summary[0])
            with gzip.open(viral_summary_path, 'rt') as viral_summary_file:
示例#4
0
     os.path.join('low_ic_compositions', low_ic_comp), [10760, 261665])
 low_ic_t7_raw_comp = [
     item[1] for item in low_ic_comp_data['Taxid Read Count']
     if item[0] == 10760
 ][0]
 low_ic_pr772_raw_comp = [
     item[1] for item in low_ic_comp_data['Taxid Read Count']
     if item[0] == 261665
 ][0]
 low_ic_t7_norm_comp = 10e6 * low_ic_t7_raw_comp / low_ic_comp_data[
     'Total Read Count']
 low_ic_pr772_norm_comp = 10e6 * low_ic_pr772_raw_comp / low_ic_comp_data[
     'Total Read Count']
 org_composition = bin_reads(os.path.join('low_ic_compositions',
                                          low_ic_comp),
                             ncbi_class=ncbi,
                             quantification='relative',
                             ctrl_taxids=[10760, 261665])
 # for key in org_composition:
 #     org_composition[key] = 10e6 * org_composition[key] / low_ic_comp_data['Total Read Count']
 low_ic_comp_dict = {
     'Accession':
     accession,
     'Seq Sple':
     seq_sple,
     'Composition File':
     low_ic_comp,
     'Batch ID':
     batch_id,
     'Run Date':
     run_date,