def chain_fusion_samples(dirs, names, group_filename, gff_filename, count_filename, field_to_use='count_fl', fuzzy_junction=0, fastq_filename=None):
    """Chain fusion samples in order and write the combined ID/count tables.

    Each directory in ``dirs`` is first checked with ``sample_sanity_check``.
    Samples are then chained in the order given by ``names``: every new sample
    is added to the running ``sp.MegaPBTreeFusion`` with output prefix
    ``tmp_<name>``, and the ``tmp_<name>.*`` files are immediately re-read as
    the tree for the next round.  Finally the ``tmp_<name>.mega_info.txt``
    files are walked backwards to resolve, for every final ID, the matching ID
    (and count) in each sample.

    Files written to the current working directory:
      * ``all_samples.chained_ids.txt``   -- tab-delimited, one column per sample
      * ``all_samples.chained_count.txt`` -- same layout, counts instead of IDs
      * ``all_samples.chained.gff``       -- copy of the last ``tmp_<name>.gff``
      * ``all_samples.chained.rep.fq``    -- only if ``fastq_filename`` is given

    :param dirs: mapping of sample name --> sample directory
    :param names: ordered sample names; a leading run of names prefixed with
        ``tmp_`` marks samples that were already chained in a previous run
    :param group_filename: group file name, relative to each sample directory
    :param gff_filename: GFF file name, relative to each sample directory
    :param count_filename: count file name, relative to each sample directory
    :param field_to_use: count field passed through to ``read_count_info``
    :param fuzzy_junction: passed as ``internal_fuzzy_max_dist``
    :param fastq_filename: optional FASTQ file name, relative to each sample
        directory; ``None`` disables all FASTQ handling
    """
    for sample_dir in dirs.values():
        sample_sanity_check(os.path.join(sample_dir, group_filename),
                            os.path.join(sample_dir, gff_filename),
                            os.path.join(sample_dir, count_filename),
                            os.path.join(sample_dir, fastq_filename) if fastq_filename is not None else None)

    # count_info is indexed below as count_info[sample, pbid]; the header is not needed here
    _, count_info = read_count_info(count_filename, dirs, field_to_use)

    # some names may already start with "tmp_" which means they are intermediate
    # results that have already been chained; find the first non "tmp_" name and
    # resume from there
    if names[0].startswith('tmp_'):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith('tmp_'):
                chain.append(name[4:])
            else:
                break
        else:
            # every name is an intermediate: nothing is left to add, so point
            # start_i past the end instead of at the last tmp_ entry
            start_i = len(names)
        # names[start_i - 1] is the last tmp_ sample; resume from its output files
        name = names[start_i - 1][4:]
        o = sp.MegaPBTreeFusion('tmp_' + name + '.gff', 'tmp_' + name + '.group.txt',
                                self_prefix='tmp_' + name,
                                internal_fuzzy_max_dist=fuzzy_junction,
                                fastq_filename='tmp_' + name + '.rep.fq' if fastq_filename is not None else None)
    else:
        # everything is new, start fresh from the first sample
        name = names[0]
        sample_dir = dirs[name]
        chain = [name]
        o = sp.MegaPBTreeFusion(os.path.join(sample_dir, gff_filename),
                                os.path.join(sample_dir, group_filename),
                                self_prefix=name,
                                internal_fuzzy_max_dist=fuzzy_junction,
                                fastq_filename=os.path.join(sample_dir, fastq_filename) if fastq_filename is not None else None)
        start_i = 1

    for name in names[start_i:]:
        assert not name.startswith('tmp_'), \
            "tmp_ samples must all come before new samples, got: " + name
        sample_dir = dirs[name]
        o.add_sample(os.path.join(sample_dir, gff_filename),
                     os.path.join(sample_dir, group_filename),
                     sample_prefix=name, output_prefix='tmp_' + name,
                     fastq_filename=os.path.join(sample_dir, fastq_filename) if fastq_filename is not None else None)
        # re-read the files just written under the tmp_<name> prefix as the new tree
        o = sp.MegaPBTreeFusion('tmp_' + name + '.gff', 'tmp_' + name + '.group.txt',
                                self_prefix='tmp_' + name,
                                internal_fuzzy_max_dist=fuzzy_junction,
                                fastq_filename='tmp_' + name + '.rep.fq' if fastq_filename is not None else None)
        chain.append(name)

    # now chain back by looking at the mega_info.txt files
    mega_info = {}  # ex: (tmp_1009, PB.1.1) --> mega info row
    for c in chain[1:]:
        with open('tmp_' + c + '.mega_info.txt') as handle:
            for row in DictReader(handle, delimiter='\t'):
                mega_info['tmp_' + c, row['pbid']] = row

    last = chain[-1]
    first = chain[0]
    header = 'superPBID' + ''.join('\t' + c for c in chain) + '\n'

    with open('all_samples.chained_ids.txt', 'w') as f1, \
            open('all_samples.chained_count.txt', 'w') as f2:
        f1.write(header)
        f2.write(header)

        with open('tmp_' + last + '.mega_info.txt') as handle:
            for final_row in DictReader(handle, delimiter='\t'):
                ids = {}     # ex: 1009 --> PB.1.1   (missing --> 'NA')
                counts = {}  # ex: 1009 --> count    (missing --> 'NA')

                ids[last] = final_row[last]
                if ids[last] != 'NA':
                    counts[last] = count_info[last, ids[last]]

                # walk the intermediate samples from newest to oldest; the first
                # sample is handled separately because it has no tmp_ column
                row = final_row
                for c in reversed(chain[1:-1]):
                    tmp_key = 'tmp_' + c
                    if row[tmp_key] == 'NA':
                        # link is broken: all earlier samples stay 'NA'
                        break
                    row = mega_info[tmp_key, row[tmp_key]]
                    ids[c] = row[c]
                    if ids[c] != 'NA':
                        counts[c] = count_info[c, ids[c]]
                else:
                    # chain was never broken, so the first sample can be resolved
                    ids[first] = row[first]
                    if ids[first] != 'NA':
                        counts[first] = count_info[first, ids[first]]

                f1.write(final_row['pbid'] + ''.join('\t' + ids.get(c, 'NA') for c in chain) + '\n')
                f2.write(final_row['pbid'] + ''.join('\t' + str(counts.get(c, 'NA')) for c in chain) + '\n')

    shutil.copyfile('tmp_' + last + '.gff', 'all_samples.chained.gff')
    if fastq_filename is not None:
        shutil.copyfile('tmp_' + last + '.rep.fq', 'all_samples.chained.rep.fq')

    print("Chained output written to:", file=sys.stderr)
    print("all_samples.chained.gff", file=sys.stderr)
    print(f1.name, file=sys.stderr)
    print(f2.name, file=sys.stderr)
    if fastq_filename is not None:
        print("all_samples.chained.rep.fq", file=sys.stderr)
def chain_fusion_samples(
    dirs: List[str],
    names: List[str],
    group_filename: Union[str, Path],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    field_to_use: str = "count_fl",
    fuzzy_junction: int = 0,
    fastq_filename: Optional[Union[str, Path]] = None,
) -> None:
    """Chain fusion samples in order and write the combined ID/count tables.

    Each directory in ``dirs`` is first checked with ``sample_sanity_check``.
    Samples are then chained in the order given by ``names``: every new sample
    is added to the running ``sp.MegaPBTreeFusion`` with output prefix
    ``tmp_<name>``, and the ``tmp_<name>.*`` files are immediately re-read as
    the tree for the next round.  Finally the ``tmp_<name>.mega_info.txt``
    files are walked backwards to resolve, for every final ID, the matching ID
    (and count) in each sample.

    Files written to the current working directory:
      * ``all_samples.chained_ids.txt``   -- tab-delimited, one column per sample
      * ``all_samples.chained_count.txt`` -- same layout, counts instead of IDs
      * ``all_samples.chained.gff``       -- copy of the last ``tmp_<name>.gff``
      * ``all_samples.chained.rep.fq``    -- only if ``fastq_filename`` is given

    Args:
        dirs: sample name --> sample directory.
            NOTE(review): annotated as ``List[str]`` but used as a mapping
            (``dirs.values()``, ``dirs[name]``); the annotation should be fixed
            once a mapping type is imported in this module.
        names: ordered sample names; a leading run of names prefixed with
            ``tmp_`` marks samples that were already chained in a previous run.
        group_filename: group file name, relative to each sample directory.
        gff_filename: GFF file name, relative to each sample directory.
        count_filename: count file name, relative to each sample directory.
        field_to_use: count field passed through to ``read_count_info``.
        fuzzy_junction: passed as ``internal_fuzzy_max_dist``.
        fastq_filename: optional FASTQ file name, relative to each sample
            directory; ``None`` disables all FASTQ handling.
    """
    for sample_dir in dirs.values():
        sample_sanity_check(
            Path(sample_dir, group_filename),
            Path(sample_dir, gff_filename),
            Path(sample_dir, count_filename),
            Path(sample_dir, fastq_filename) if fastq_filename is not None else None,
        )

    # NOTE(review): count_info is indexed below as count_info[sample, pbid];
    # confirm read_count_info returns that mapping directly (not a tuple).
    count_info = read_count_info(count_filename, dirs, field_to_use)

    # some names may already start with "tmp_" which means they are intermediate
    # results that have already been chained; find the first non "tmp_" name and
    # resume from there
    if names[0].startswith("tmp_"):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith("tmp_"):
                chain.append(name[4:])
            else:
                break
        else:
            # every name is an intermediate: nothing is left to add, so point
            # start_i past the end instead of at the last tmp_ entry
            start_i = len(names)
        # names[start_i - 1] is the last tmp_ sample; resume from its output files
        name = names[start_i - 1][4:]
        o = sp.MegaPBTreeFusion(
            gff_filename=f"tmp_{name}.gff",
            group_filename=f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            fastq_filename=f"tmp_{name}.rep.fq" if fastq_filename is not None else None,
        )
    else:
        # everything is new, start fresh from the first sample
        name = names[0]
        sample_dir = dirs[name]
        chain = [name]
        o = sp.MegaPBTreeFusion(
            gff_filename=Path(sample_dir, gff_filename),
            group_filename=Path(sample_dir, group_filename),
            self_prefix=name,
            internal_fuzzy_max_dist=fuzzy_junction,
            fastq_filename=Path(sample_dir, fastq_filename) if fastq_filename is not None else None,
        )
        start_i = 1

    for name in names[start_i:]:
        assert not name.startswith("tmp_"), (
            f"tmp_ samples must all come before new samples, got: {name}"
        )
        sample_dir = dirs[name]
        o.add_sample(
            gff_filename=Path(sample_dir, gff_filename),
            group_filename=Path(sample_dir, group_filename),
            sample_prefix=name,
            output_prefix=f"tmp_{name}",
            fastq_filename=Path(sample_dir, fastq_filename) if fastq_filename is not None else None,
        )
        # re-read the files just written under the tmp_<name> prefix as the new tree
        o = sp.MegaPBTreeFusion(
            gff_filename=f"tmp_{name}.gff",
            group_filename=f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            fastq_filename=f"tmp_{name}.rep.fq" if fastq_filename is not None else None,
        )
        chain.append(name)

    # now chain back by looking at the mega_info.txt files
    mega_info = {}  # ex: (tmp_1009, PB.1.1) --> mega info row
    for c in chain[1:]:
        with open(f"tmp_{c}.mega_info.txt") as handle:
            for row in DictReader(handle, delimiter="\t"):
                mega_info[f"tmp_{c}", row["pbid"]] = row

    last = chain[-1]
    first = chain[0]
    # columns are tab-separated, matching the tab-delimited mega_info inputs
    header = "superPBID" + "".join(f"\t{c}" for c in chain) + "\n"

    with open("all_samples.chained_ids.txt", "w") as f1, open("all_samples.chained_count.txt", "w") as f2:
        f1.write(header)
        f2.write(header)

        with Path(f"tmp_{last}.mega_info.txt").open() as handle:
            for final_row in DictReader(handle, delimiter="\t"):
                ids = {}  # ex: 1009 --> PB.1.1   (missing --> "NA")
                counts = {}  # ex: 1009 --> count    (missing --> "NA")

                ids[last] = final_row[last]
                if ids[last] != "NA":
                    counts[last] = count_info[last, ids[last]]

                # walk the intermediate samples from newest to oldest; the first
                # sample is handled separately because it has no tmp_ column
                row = final_row
                for c in reversed(chain[1:-1]):
                    tmp_key = f"tmp_{c}"
                    if row[tmp_key] == "NA":
                        # link is broken: all earlier samples stay "NA"
                        break
                    row = mega_info[tmp_key, row[tmp_key]]
                    ids[c] = row[c]
                    if ids[c] != "NA":
                        counts[c] = count_info[c, ids[c]]
                else:
                    # chain was never broken, so the first sample can be resolved
                    ids[first] = row[first]
                    if ids[first] != "NA":
                        counts[first] = count_info[first, ids[first]]

                id_cells = "".join("\t" + ids.get(c, "NA") for c in chain)
                count_cells = "".join("\t" + str(counts.get(c, "NA")) for c in chain)
                f1.write(final_row["pbid"] + id_cells + "\n")
                f2.write(final_row["pbid"] + count_cells + "\n")

    shutil.copyfile(f"tmp_{last}.gff", "all_samples.chained.gff")
    if fastq_filename is not None:
        shutil.copyfile(f"tmp_{last}.rep.fq", "all_samples.chained.rep.fq")

    logger.info("Chained output written to:")
    logger.info("all_samples.chained.gff")
    logger.info(f1.name)
    logger.info(f2.name)
    if fastq_filename is not None:
        logger.info("all_samples.chained.rep.fq")