import logging
import shutil
from collections import defaultdict
from csv import DictReader, DictWriter
from pathlib import Path
from typing import Union

# NOTE: `sp` (the module providing MegaPBTree) and `sample_sanity_check` are
# assumed to be imported or defined elsewhere in this package; a sketch of
# read_count_info appears further below.
logger = logging.getLogger(__name__)


def chain_helper(
    ref_gff: Union[str, Path],
    ref_group: Union[str, Path],
    addon_gff: Union[str, Path],
    addon_group: Union[str, Path],
    name1: str,
    name2: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
) -> None:
    """Chain one add-on sample's GFF/group files onto a reference pair,
    writing the merged result with the output prefix ``tmp_<name2>``."""
    o = sp.MegaPBTree(
        gff_filename=ref_gff,
        group_filename=ref_group,
        self_prefix=name1,
        internal_fuzzy_max_dist=fuzzy_junction,
        allow_5merge=allow_5merge,
        max_3_diff=max_3_diff,
        fastq_filename=None,
    )
    o.add_sample(
        gff_filename=addon_gff,
        group_filename=addon_group,
        sample_prefix=name2,
        output_prefix=f"tmp_{name2}",
        fastq_filename=None,
    )
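# Usage sketch (hypothetical): chain a second sample onto a reference pair.
# All file and sample names below are placeholders, not files shipped with
# this package; the parameter values mirror chain_samples' own defaults
# (fuzzy_junction=0, max_3_diff=100).
def _example_chain_pair() -> None:
    chain_helper(
        ref_gff="sample1.collapsed.gff",
        ref_group="sample1.collapsed.group.txt",
        addon_gff="sample2.collapsed.gff",
        addon_group="sample2.collapsed.group.txt",
        name1="sample1",
        name2="sample2",  # merged output is written as tmp_sample2.*
        fuzzy_junction=0,
        allow_5merge=False,
        max_3_diff=100,
    )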
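# chain_samples() below delegates count-file parsing to read_count_info. A
# minimal sketch of the assumed contract, not the canonical implementation:
# skip leading '#' comment lines, then map (sample name, pbid) --> the value
# of the chosen count field.
def read_count_info(count_filename, dirs, field_to_use):
    count_info = {}  # key: (sample name, "PB.X.Y") --> count value
    for name, d in dirs.items():
        with open(Path(d, count_filename)) as f:
            # skip the '#' comment lines at the top of the count file
            while True:
                cur = f.tell()
                if not f.readline().startswith("#"):
                    break
            f.seek(cur)
            for r in DictReader(f, delimiter="\t"):
                count_info[name, r["pbid"]] = r[field_to_use]
    return count_info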
def chain_samples(
    dirs,
    names,
    group_filename,
    gff_filename,
    count_filename,
    field_to_use="count_fl",
    fuzzy_junction=0,
    allow_5merge=False,
    max_3_diff=100,
    fastq_filename=None,
):
    for d in dirs.values():
        sample_sanity_check(
            Path(d, group_filename),
            Path(d, gff_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
        )

    count_info = read_count_info(count_filename, dirs, field_to_use)

    # Some names may already start with "tmp_", which means they are
    # intermediate results that have already been chained; find the first
    # non-"tmp_" name and start from there.
    if names[0].startswith("tmp_"):
        chain = []
        for start_i, name in enumerate(names):
            if name.startswith("tmp_"):
                chain.append(name[4:])
            else:
                break
        # start_i now points at the first non-"tmp_" sample; read the last
        # "tmp_" sample to resume the chain from it
        name = names[start_i - 1][4:]
        o = sp.MegaPBTree(
            f"tmp_{name}.gff",
            f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=f"tmp_{name}.rep.fq" if fastq_filename is not None else None,
        )
        # no chain.append(name) here -- already done in the loop above
    else:
        # everything is new, start fresh from the first sample
        name = names[0]
        d = Path(dirs[name])
        chain = [name]
        o = sp.MegaPBTree(
            d.joinpath(gff_filename),
            d.joinpath(group_filename),
            self_prefix=name,
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=d.joinpath(fastq_filename) if fastq_filename is not None else None,
        )
        start_i = 1

    for name in names[start_i:]:
        if name.startswith("tmp_"):
            raise AssertionError("trying to add a temp file!")
        d = Path(dirs[name])
        o.add_sample(
            d.joinpath(gff_filename),
            d.joinpath(group_filename),
            sample_prefix=name,
            output_prefix=f"tmp_{name}",
            fastq_filename=d.joinpath(fastq_filename) if fastq_filename is not None else None,
        )
        o = sp.MegaPBTree(
            f"tmp_{name}.gff",
            f"tmp_{name}.group.txt",
            self_prefix=f"tmp_{name}",
            internal_fuzzy_max_dist=fuzzy_junction,
            allow_5merge=allow_5merge,
            max_3_diff=max_3_diff,
            fastq_filename=f"tmp_{name}.rep.fq" if fastq_filename is not None else None,
        )
        chain.append(name)

    # now walk the chain backwards by reading each tmp_*.mega_info.txt
    d = {}  # ex: ("tmp_1009", "PB.1.1") --> mega info dict
    for c in chain[1:]:
        with open(f"tmp_{c}.mega_info.txt") as h:
            for r in DictReader(h, delimiter="\t"):
                d[f"tmp_{c}", r["superPBID"]] = r

    with open("all_samples.chained_ids.txt", "w") as f1, open(
        "all_samples.chained_count.txt", "w"
    ) as f2:
        writer1 = DictWriter(f1, fieldnames=["superPBID"] + chain, delimiter="\t")
        writer1.writeheader()
        writer2 = DictWriter(f2, fieldnames=["superPBID"] + chain, delimiter="\t")
        writer2.writeheader()
        with open(f"tmp_{chain[-1]}.mega_info.txt") as h:
            for r in DictReader(h, delimiter="\t"):
                saw_NA = False
                r0 = r
                answer = defaultdict(lambda: "NA")   # ex: "1009" --> "PB.1.1"
                answer2 = defaultdict(lambda: "NA")  # ex: "1009" --> count
                answer[chain[-1]] = r[chain[-1]]
                if r[chain[-1]] != "NA":
                    answer2[chain[-1]] = count_info[chain[-1], answer[chain[-1]]]
                # walk backwards through the intermediate tmp_ results; the
                # first sample is skipped here because it never got a "tmp_"
                # prefix (it was the starting point, not a chaining product)
                for c in chain[::-1][1:-1]:
                    if r[f"tmp_{c}"] == "NA":
                        saw_NA = True
                        break
                    else:
                        r2 = d[f"tmp_{c}", r[f"tmp_{c}"]]
                        answer[c] = r2[c]
                        if answer[c] != "NA":
                            answer2[c] = count_info[c, answer[c]]
                        r = r2
                if not saw_NA:
                    answer[chain[0]] = r[chain[0]]
                    if answer[chain[0]] != "NA":
                        answer2[chain[0]] = count_info[chain[0], answer[chain[0]]]
                rec1 = {"superPBID": r0["superPBID"]}
                rec2 = {"superPBID": r0["superPBID"]}
                for c in chain:
                    rec1[c] = answer[c]
                    rec2[c] = str(answer2[c])
                writer1.writerow(rec1)
                writer2.writerow(rec2)

    shutil.copyfile(f"tmp_{chain[-1]}.gff", "all_samples.chained.gff")
    if fastq_filename is not None:
        shutil.copyfile(f"tmp_{chain[-1]}.rep.fq", "all_samples.chained.rep.fq")
    logger.info("Chained output written to:")
    logger.info("all_samples.chained.gff")
    logger.info(f1.name)
    logger.info(f2.name)
    if fastq_filename is not None:
        logger.info("all_samples.chained.rep.fq")
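# Usage sketch (hypothetical): chain three samples whose collapse outputs sit
# in per-sample directories. Directory names, file names, and parameter values
# are placeholders chosen for illustration, not values the pipeline mandates.
def _example_chain_three_samples() -> None:
    dirs = {
        "liver": "collapse_out/liver",
        "brain": "collapse_out/brain",
        "heart": "collapse_out/heart",
    }
    chain_samples(
        dirs=dirs,
        names=["liver", "brain", "heart"],  # list order sets the chaining order
        group_filename="touse.group.txt",
        gff_filename="touse.gff",
        count_filename="touse.count.txt",
        field_to_use="count_fl",
        fuzzy_junction=5,
        allow_5merge=False,
        max_3_diff=100,
        fastq_filename=None,  # e.g. "touse.rep.fq" to also chain representative FASTQs
    )
    # Results land in all_samples.chained_ids.txt, all_samples.chained_count.txt,
    # and all_samples.chained.gff in the current working directory.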