def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the representative record to be the longest.

    One record is written to `output_filename` per non-blank line of the
    group file, with the id
    "<pb_id>|<chrom>:<start>-<end>(<strand>)|<chosen member id>".

    :param fasta_filename: sequence file handed to LazyFastaReader; member ids
        from the group file are looked up in it
    :param gff_filename: tab-delimited GFF; only rows whose third column is
        "transcript" are used, keyed by their transcript_id
    :param group_filename: tab-delimited file, "<pb_id><TAB><comma-separated members>"
    :param output_filename: path handed to FastaWriter

    Raises KeyError if a pb_id from the group file has no transcript row in
    the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)
    # try/finally so the writer is closed even if parsing fails part-way.
    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
                stripped = line.strip()
                # Blank lines and '#' header/comment lines have no columns to parse.
                if not stripped or stripped.startswith('#'):
                    continue
                raw = stripped.split('\t')
                if raw[2] == 'transcript':
                    # Second attribute is transcript_id; [1:-2] drops the
                    # leading quote and the trailing '";'.
                    tid = raw[-1].split('; ')[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                stripped = line.strip()
                if not stripped:
                    continue
                pb_id, members = stripped.split('\t')
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(','):
                    seq = fastad[x].sequence  # look the record up once per member
                    # '>=' means the last member wins a tie on length.
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    finally:
        fout.close()
def pick_longest_rep(fasta_filename, gff_filename, group_filename, output_filename):
    """
    For each group, select the representative record to be the longest.

    One record is written to `output_filename` per non-blank line of the
    group file, with the id
    "<pb_id>|<chrom>:<start>-<end>(<strand>)|<chosen member id>".

    :param fasta_filename: sequence file handed to LazyFastaReader; member ids
        from the group file are looked up in it
    :param gff_filename: tab-delimited GFF; only rows whose third column is
        "transcript" are used, keyed by their transcript_id
    :param group_filename: tab-delimited file, "<pb_id><TAB><comma-separated members>"
    :param output_filename: path handed to FastaWriter

    Raises KeyError if a pb_id from the group file has no transcript row in
    the GFF.
    """
    fastad = LazyFastaReader(fasta_filename)
    fout = FastaWriter(output_filename)
    # try/finally so the writer is closed even if parsing fails part-way.
    try:
        coords = {}
        with open(gff_filename) as gff:
            for line in gff:
                # ex: chr1 PacBio transcript 27567 29336 . - . gene_id "PB.1"; transcript_id "PB.1.1";
                stripped = line.strip()
                # Blank lines and "#" header/comment lines have no columns to parse.
                if not stripped or stripped.startswith("#"):
                    continue
                raw = stripped.split("\t")
                if raw[2] == "transcript":
                    # Second attribute is transcript_id; [1:-2] drops the
                    # leading quote and the trailing '";'.
                    tid = raw[-1].split("; ")[1].split()[1][1:-2]
                    coords[tid] = "{0}:{1}-{2}({3})".format(raw[0], raw[3], raw[4], raw[6])

        with open(group_filename) as groups:
            for line in groups:
                stripped = line.strip()
                if not stripped:
                    continue
                pb_id, members = stripped.split("\t")
                best_id = None
                best_seq = None
                max_len = 0
                for x in members.split(","):
                    seq = fastad[x].sequence  # look the record up once per member
                    # ">=" means the last member wins a tie on length.
                    if len(seq) >= max_len:
                        best_id = x
                        best_seq = seq
                        max_len = len(seq)
                fout.writeRecord("{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id), best_seq)
    finally:
        fout.close()
def convert_to_dazz_fasta(self):
    """
    Convert input fasta/fastq file to daligner-compatible fasta with ids:
    <prefix>/<index>/0_<seqlen>

    Reads self.input_filename (as FASTA when self.filetype == "fasta",
    otherwise as FASTQ) and writes self.dazz_filename. Records are numbered
    from 1 in input order, and self.dazz_mapping[index] is set to the
    original record id.

    Also write out mappings to pickle (self.dazz_filename + ".pickle").
    """
    if self.filetype == "fasta":
        reader = FastaReader(self.input_filename)
    else:
        reader = FastqReader(self.input_filename)

    writer = FastaWriter(self.dazz_filename)
    # try/finally so the output handle is released even if a record fails.
    try:
        for i, r in enumerate(reader, start=1):
            writer.writeRecord(
                "{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)),
                r.sequence,
            )
            self.dazz_mapping[i] = r.id
    finally:
        writer.close()

    # Pickle streams are bytes: binary mode is required on Python 3 and is
    # also the safe choice on Python 2.
    # NOTE(review): assumes `dump` is pickle's (per the docstring) - confirm.
    with open(self.dazz_filename + ".pickle", "wb") as handle:
        dump(self.dazz_mapping, handle)
def convert_to_dazz_fasta(self):
    """
    Convert input fasta/fastq file to daligner-compatible fasta with ids:
    <prefix>/<index>/0_<seqlen>

    Reads self.input_filename (as FASTA when self.filetype == 'fasta',
    otherwise as FASTQ) and writes self.dazz_filename. Records are numbered
    from 1 in input order, and self.dazz_mapping[index] is set to the
    original record id.

    Also write out mappings to pickle (self.dazz_filename + '.pickle').
    """
    if self.filetype == 'fasta':
        reader = FastaReader(self.input_filename)
    else:
        reader = FastqReader(self.input_filename)

    writer = FastaWriter(self.dazz_filename)
    # try/finally so the output handle is released even if a record fails.
    try:
        for i, r in enumerate(reader, start=1):
            writer.writeRecord(
                "{p}/{i}/0_{len}".format(p=self.dazz_movie_name, i=i, len=len(r.sequence)),
                r.sequence,
            )
            self.dazz_mapping[i] = r.id
    finally:
        writer.close()

    # Pickle streams are bytes: binary mode is required on Python 3 and is
    # also the safe choice on Python 2.
    # NOTE(review): assumes `dump` is pickle's (per the docstring) - confirm.
    with open(self.dazz_filename + '.pickle', 'wb') as handle:
        dump(self.dazz_mapping, handle)