accession = lib['bioSple'] print(accession) total_reads = 0 for batch_lib in batch_obj['batch']['readsDist']['DNA']: if batch_lib['bioSple'] == accession: total_reads = batch_lib['postQualityReads'] print(total_reads) if total_reads == 0: continue summary_paths = lib['diagnosticOutput'] composition_path = glob.glob( os.path.join(results_dir, 'tax', seq_sple + '*sample_composition.out')) if composition_path: sample_comp = bin_reads(composition_path[0], ncbi_class=ncbi, ctrl_taxids=[10760], quantification='relative') # for key in sample_comp: # sample_comp[key] = sample_comp[key] * 10e6 / total_reads else: sample_comp = { "Human": None, "Bacteria": None, "Virus": None, "Parasite": None, "Fungus": None, "Unclassified": None } viral_summary = [ path for path in lib['diagnosticOutput'] if 'dna.viral.dxsm.out.summary.gz' in path
def _read_control_counts(summary_path):
    """Return ``(t7_reads, pr772_reads)`` read from a gzipped viral summary.

    Every line of the file is parsed as one JSON object carrying
    ``reporting_id`` and ``read_count`` keys. A control whose reporting id
    never appears in the file is reported as 0.
    """
    t7_read_cnt = 0
    pr772_read_cnt = 0
    # Binary mode (the gzip default) is kept: json.loads accepts bytes.
    with gzip.open(summary_path) as vir_file:
        for line in vir_file:
            obj = json.loads(line.strip())
            if obj['reporting_id'] == '26706_10760':
                t7_read_cnt = obj['read_count']
            elif obj['reporting_id'] == '26648_261665':
                pr772_read_cnt = obj['read_count']
    return t7_read_cnt, pr772_read_cnt


def _taxid_count(taxid_counts, taxid):
    """Return the count of the first ``(taxid, count)`` entry, or 0 if absent."""
    return next(
        (item[1] for item in (taxid_counts or ()) if item[0] == taxid), 0)


def get_lib_dict_ls(results_dir=None, fqo=None):
    """Build one summary dict per library found in the batch files.

    Args:
        results_dir: Directory holding a ``batch`` sub-directory of batch JSON
            files, a ``tax`` sub-directory of ``*dna.sample_composition.out``
            files, and the paths listed in each library's
            ``diagnosticOutput``.
        fqo: Path of a CSV with at least the columns ``Seq Sple`` and
            ``Run Date (YYYY-MM-DD)``.

    Returns:
        A list of dicts, one per retained library, with the control read
        counts taken from the viral summary and from the composition file,
        their normalized and log10 forms, and the per-category composition.

    Libraries are skipped when their lower-cased ``seqSple`` is not listed in
    the ``Seq Sple`` column of *fqo*, or when they report no post-filter reads
    (normalizing by zero reads is undefined).
    """
    fqo_hi_ic = pd.read_csv(fqo)
    fqo_hi_ic['Run Date (YYYY-MM-DD)'] = pd.to_datetime(
        fqo_hi_ic['Run Date (YYYY-MM-DD)'])

    t7_taxid = 10760
    pr772_taxid = 261665
    # NOTE(review): 10e6 == 1e7. If "reads per million" was intended this
    # should be 1e6; kept unchanged so existing outputs do not shift.
    scale = 10e6

    lib_dict_ls = []
    for batch_path in glob.glob(os.path.join(results_dir, 'batch', '*')):
        with open(batch_path) as batch_file:
            batch = json.load(batch_file)
        batch_id = batch['batch']['libBatchId']

        for lib in batch['libraries']:
            accession = lib['bioSple']
            seq_sple = lib['seqSple']
            lib_type = lib['libType']

            # Compute the match mask once and reuse it for the lookup.
            in_fqo = fqo_hi_ic['Seq Sple'] == seq_sple.lower()
            if not in_fqo.any():
                continue
            run_date = fqo_hi_ic.loc[in_fqo,
                                     'Run Date (YYYY-MM-DD)'].values[0]

            sample_name = lib['spleName']
            total_reads = lib['qualityFilterInfo']['readsOut']
            if not total_reads:
                # Zero (or missing) reads would make the normalization below
                # divide by zero; skip the library instead of crashing.
                continue

            vir_paths = [
                path for path in lib['diagnosticOutput']
                if 'dna.viral.dxsm.out.summary.gz' in path
            ]
            if vir_paths:
                t7_read_cnt, pr772_read_cnt = _read_control_counts(
                    os.path.join(results_dir, vir_paths[0]))
            else:
                t7_read_cnt, pr772_read_cnt = 0, 0
            t7_norm = scale * t7_read_cnt / total_reads
            pr772_norm = scale * pr772_read_cnt / total_reads

            # composition file data
            composition_path_ls = glob.glob(
                os.path.join(results_dir, 'tax',
                             seq_sple + '*dna.sample_composition.out'))
            if composition_path_ls:
                composition_path = composition_path_ls[0]
                composition_file = os.path.basename(composition_path)
                composition_data = parse_composition_file(
                    composition_path, [t7_taxid, pr772_taxid])
                org_composition = bin_reads(
                    composition_path,
                    ncbi_class=ncbi,
                    quantification='relative',
                    ctrl_taxids=[t7_taxid, pr772_taxid])
            else:
                # No composition file: placeholder totals (tiny non-zero
                # denominator) and an all-zero composition.
                composition_file = None
                composition_data = {
                    'Total Read Count': 1e-6,
                    'Taxid Read Count': [(t7_taxid, 0), (pr772_taxid, 0)]
                }
                org_composition = {
                    "Human": 0,
                    "Bacteria": 0,
                    "Virus": 0,
                    "Parasite": 0,
                    "Fungus": 0,
                    "Unclassified": 0
                }

            # A control taxid missing from the composition counts as 0 reads,
            # whatever else the list contains.
            taxid_counts = composition_data['Taxid Read Count']
            t7_raw_comp = _taxid_count(taxid_counts, t7_taxid)
            pr772_raw_comp = _taxid_count(taxid_counts, pr772_taxid)
            comp_total = composition_data['Total Read Count']
            t7_norm_comp = scale * t7_raw_comp / comp_total
            pr772_norm_comp = scale * pr772_raw_comp / comp_total

            summary_nr = t7_norm + pr772_norm
            comp_nr = t7_norm_comp + pr772_norm_comp
            # log10(0) is an expected outcome here (-inf, and nan for
            # -inf - -inf); silence the warnings without changing the values.
            with np.errstate(divide='ignore', invalid='ignore'):
                log_summary_nr = np.log10(summary_nr)
                log_comp_nr = np.log10(comp_nr)
                log_difference = log_summary_nr - log_comp_nr

            lib_dict_ls.append({
                'Accession': accession,
                'Seq Sple': seq_sple,
                'Composition File': composition_file,
                'Batch ID': batch_id,
                'Run Date': run_date,
                'Sample Name': sample_name,
                'Library Type': lib_type,
                'Total Reads': total_reads,
                'T7 Raw Reads': t7_read_cnt,
                'PR772 Raw Reads': pr772_read_cnt,
                'T7 Normalized Reads': t7_norm,
                'PR772 Normalized Reads': pr772_norm,
                'T7 + PR772 NR': summary_nr,
                'Log10 T7 + PR772 NR': log_summary_nr,
                'Total Reads (Composition File)': comp_total,
                'T7 Raw Reads (Composition File)': t7_raw_comp,
                # NOTE(review): the two "T4" keys below carry the PR772
                # values; the key names are kept because callers may rely
                # on them.
                'T4 Raw Reads (Composition File)': pr772_raw_comp,
                'T7 Normalized (Composition File)': t7_norm_comp,
                'T4 Normalized (Composition File)': pr772_norm_comp,
                'T7 + PR772 NR (Composition File)': comp_nr,
                'Log10 T7 + PR772 NR (Composition File)': log_comp_nr,
                'Summary - Composition (log10)': log_difference,
                "Human": org_composition['Human'],
                "Bacteria": org_composition['Bacteria'],
                "Virus": org_composition['Virus'],
                "Parasite": org_composition['Parasite'],
                "Fungus": org_composition['Fungus'],
                "Unclassified": org_composition['Unclassified']
            })
    return lib_dict_ls
batch_file_paths = glob.glob(os.path.join(results_dir, 'batch', '*')) results = {} for batch_file in batch_file_paths: with open(batch_file) as input: batch_obj = json.load(input) for lib in batch_obj['libraries']: seq_sple = lib['seqSple'] accession = lib['bioSple'] for batch_lib in batch_obj['batch']['readsDist']['DNA']: if batch_lib['bioSple'] == accession: total_reads = batch_lib['postQualityReads'] summary_paths = lib['diagnosticOutput'] composition_path = glob.glob(os.path.join(results_dir, 'tax', seq_sple + '*sample_composition.out')) if composition_path: sample_comp = bin_reads(composition_path[0], ncbi_class=ncbi, quantification='absolute', ctrl_taxids=[10760]) for key in sample_comp: sample_comp[key] = sample_comp[key] * 10e6 / total_reads else: sample_comp = { "Human": None, "Bacteria": None, "Virus": None, "Parasite": None, "Fungus": None, "Unclassified": None } viral_summary = [path for path in lib['diagnosticOutput'] if 'dna.viral.dxsm.out.summary.gz' in path] if viral_summary: viral_summary_path = os.path.join(results_dir, viral_summary[0]) with gzip.open(viral_summary_path, 'rt') as viral_summary_file:
os.path.join('low_ic_compositions', low_ic_comp), [10760, 261665]) low_ic_t7_raw_comp = [ item[1] for item in low_ic_comp_data['Taxid Read Count'] if item[0] == 10760 ][0] low_ic_pr772_raw_comp = [ item[1] for item in low_ic_comp_data['Taxid Read Count'] if item[0] == 261665 ][0] low_ic_t7_norm_comp = 10e6 * low_ic_t7_raw_comp / low_ic_comp_data[ 'Total Read Count'] low_ic_pr772_norm_comp = 10e6 * low_ic_pr772_raw_comp / low_ic_comp_data[ 'Total Read Count'] org_composition = bin_reads(os.path.join('low_ic_compositions', low_ic_comp), ncbi_class=ncbi, quantification='relative', ctrl_taxids=[10760, 261665]) # for key in org_composition: # org_composition[key] = 10e6 * org_composition[key] / low_ic_comp_data['Total Read Count'] low_ic_comp_dict = { 'Accession': accession, 'Seq Sple': seq_sple, 'Composition File': low_ic_comp, 'Batch ID': batch_id, 'Run Date': run_date,