def create_renaming_key(self, raw_subreads, renamed_subreads):
    """
    Create a key for translating HBAR subread names to canonical PacBio names.

    Writes one tab-separated "renamed<TAB>raw" pair per subread to the key
    file, reusing an existing key file when one is already present.

    Raises ValueError if the raw and renamed Fasta files contain a
    different number of records.
    """
    log.info("Looking for Raw<--->HBAR subread renaming key")
    renaming_key = self.get_filepath('subreads', 'renaming_key.txt')
    if valid_file(renaming_key):
        log.info('Using existing subread renaming key\n')
        return renaming_key
    # Fixed typo in log message ("round" -> "found")
    log.info("No subread renaming key found, creating one...")
    # Compare the two files to make sure they're equivalent.
    raw_count = fasta_size(raw_subreads)
    new_count = fasta_size(renamed_subreads)
    # Validate explicitly instead of via `assert`, which is stripped when
    # Python runs with -O and would silently skip this check.
    if raw_count != new_count:
        msg = 'The number of raw subreads (%s) does not ' % raw_count + \
              'match the number of renamed reads (%s)' % new_count
        log.info(msg)
        raise ValueError(msg)
    # Write out the pairs of names to file
    with open(renaming_key, 'w') as handle:
        for raw, renamed in zip(FastaReader(raw_subreads),
                                FastaReader(renamed_subreads)):
            raw_name = raw.name.split()[0]
            new_name = renamed.name.split()[0]
            handle.write('%s\t%s\n' % (new_name, raw_name))
    check_output_file(renaming_key)
    log.info("Finished creating subread renaming key\n")
    return renaming_key
def sep_flnc_by_primer(flnc_filename, root_dir, output_filename='isoseq_flnc.fasta'):
    """
    Separate flnc fasta by primer. Useful for targeted sequencing.
    ex: make <root_dir>/primer0/isoseq_flnc.fasta ... etc ...

    NOTE(review): the returned list contains file objects that were already
    closed above -- presumably callers only use their `.name` attribute;
    confirm. Also, a record whose header lacks a 'primer=' field makes
    get_primer() return None, which would break the int() sort at the end
    -- TODO confirm every flnc header carries a primer field.
    """
    def get_primer(r):
        # Headers are ';'-separated key=value fields; return the value of
        # the 'primer=' field (None when absent).
        for x in r.name.split(';'):
            if x.startswith('primer='):
                return x.split('=')[1]
    # First pass: collect the set of primer ids present in the file.
    primers = set()
    for r in FastaReader(flnc_filename):
        p = get_primer(r)
        primers.add(p)
    # Open one output file per primer under <root_dir>/primer<p>/.
    handles = {}
    for p in primers:
        dirname = os.path.join(root_dir, "primer{0}".format(p))
        if os.path.exists(dirname):
            print >> sys.stderr, "WARNING: {0} already exists.".format(dirname)
        else:
            os.makedirs(dirname)
        handles[p] = open(os.path.join(dirname, output_filename), 'w')
    # Second pass: route each record into its primer's file.
    for r in FastaReader(flnc_filename):
        p = get_primer(r)
        handles[p].write(">{0}\n{1}\n".format(r.name, r.sequence))
    for f in handles.itervalues():
        f.close()
    # Return the (closed) handles ordered by numeric primer id.
    primers = list(primers)
    primers.sort(key=lambda x: int(x))
    return [handles[x] for x in primers]
def pbdagcon_wrapper(fasta_filename, output_prefix, consensus_name,
                     nproc=8, maxScore=-1000, min_seq_len=300):
    """
    Generate a consensus sequence with pbdagcon:
    (1) Find the best seed as reference
    (2) Align the rest of the reads to the seed
    (3) Call pbdagcon on the alignments

    On AlignGraphUtilError, fall back to writing the first input sequence
    out as the reference. Returns 0.
    """
    # Define the reference filename before the try block: the original code
    # assigned it after choose_template_by_blasr(), so an early failure made
    # the except handler below raise NameError on ref_filename.
    ref_filename = output_prefix + '_ref.fa'
    try:
        out_filename_m1 = output_prefix + ".saln.m1"
        ref = choose_template_by_blasr(fasta_filename=fasta_filename,
                                       out_filename=out_filename_m1,
                                       nproc=nproc, maxScore=maxScore)
        os.remove(out_filename_m1)
        with open(ref_filename, 'w') as f:
            f.write(">{0}\n{1}\n".format(consensus_name, ref.sequence))
        # create alignment file
        aln_filename = output_prefix + '.saln'
        make_aln_input_to_ref(fasta_filename=fasta_filename,
                              ref_filename=ref_filename,
                              out_filename=aln_filename,
                              nproc=nproc)
        cons_filename = output_prefix + '.fa'
        tmp_cons_filename = output_prefix + '.fa.tmp'
        # call pbdagcon
        cmd = "pbdagcon -t 0 -m {minlen} -c 1 -j {nproc} {aln} > {out}".format(
            minlen=min_seq_len, nproc=nproc, aln=aln_filename,
            out=tmp_cons_filename)
        # Use subprocess.call, which returns the exit status, so a failure
        # can be turned into AlignGraphUtilError and handled by the fallback
        # below.  The original `if subprocess.check_call(...)` branch was
        # dead code: check_call raises CalledProcessError (uncaught here)
        # on failure, and the Py2 three-expression raise passed `cmd` where
        # a traceback object is required.
        if subprocess.call(cmd, shell=True) != 0:
            raise AlignGraphUtilError("Cannot run command: {0}".format(cmd))
        with FastaReader(tmp_cons_filename) as reader, \
                open(cons_filename, 'w') as writer:
            for rec in reader:
                name = rec.name.strip()
                if "/" in name:
                    # change cid format from c{cid}/0_{len} to c{cid}
                    name = name[:name.find('/')]
                seq = rec.sequence.strip()
                writer.write(">{0}\n{1}\n".format(name, seq))
        os.remove(tmp_cons_filename)
    except AlignGraphUtilError:
        # pick the first sequence as reference as a backup plan
        first_seq = FastaReader(fasta_filename).__iter__().next()
        with open(ref_filename, 'w') as f:
            f.write(">{0}_ref\n{1}\n".format(consensus_name,
                                             first_seq.sequence))
    return 0
def sep_flnc_by_size(flnc_filename, root_dir, bin_size_kb=1, bin_manual=None,
                     output_filename='isoseq_flnc.fasta'):
    """
    Separate flnc fasta into different size bins
    ex: make <root_dir>/0to2k/isoseq_flnc.fasta ... etc ...
    If <bin_manual> (ex: (0, 2, 4, 12)) is given, <bin_size_kb> is ignored.

    Returns the list of output filenames that received at least one record.
    """
    # first check min - max size range
    min_size = None
    max_size = 0
    for r in FastaReader(flnc_filename):
        seqlen = len(r.sequence)
        # Track the true minimum; the original seeded min_size with 0,
        # which pinned the minimum at 0 for every input.
        min_size = seqlen if min_size is None else min(min_size, seqlen)
        max_size = max(max_size, seqlen)
    if min_size is None:
        min_size = 0  # empty input file
    min_size_kb = min_size / 1000
    # Round up to the next kb whenever there is any remainder (the original
    # `% 1000 > 1` missed a remainder of exactly 1 base).
    max_size_kb = max_size / 1000 + (1 if max_size % 1000 > 0 else 0)
    if bin_manual is not None:
        # Report kb values, matching the "kb" wording of the messages
        # (the original formatted base counts while saying "kb").
        if bin_manual[0] > min_size_kb:
            raise Exception("Min sequence length is {0} kb, below the bin!".format(
                min_size_kb))
        if bin_manual[-1] < max_size_kb:
            raise Exception("Max sequence length is {0} kb, above the bin!".format(
                max_size_kb))
        bins = bin_manual
    else:
        bins = range(min_size_kb, max_size_kb + 1, bin_size_kb)
    print >> sys.stderr, bins
    handles = {}
    for i in xrange(len(bins) - 1):
        dirname = os.path.join(root_dir,
                               "{0}to{1}kb".format(bins[i], bins[i + 1]))
        if os.path.exists(dirname):
            print >> sys.stderr, "WARNING: {0} already exists.".format(dirname)
        else:
            os.makedirs(dirname)
        handles[i] = open(os.path.join(dirname, output_filename), 'w')
    # handles is keyed 0 .. len(bins)-2; the original clamp of len(bins)-1
    # raised KeyError for sequences landing exactly on the top boundary.
    max_bin = len(bins) - 2
    for r in FastaReader(flnc_filename):
        kb_size = len(r.sequence) / 1000
        i = min(max_bin, max(0, bisect_right(bins, kb_size) - 1))
        handles[i].write(">{0}\n{1}\n".format(r.name, r.sequence))
        print >> sys.stderr, "putting {0} in {1}".format(
            len(r.sequence), handles[i].name)
    for h in handles.itervalues():
        h.close()
    names = [handles[i].name for i in xrange(len(bins) - 1)]
    # return only the bins that actually received sequences
    return filter(lambda x: os.stat(x).st_size > 0, names)
def read_fasta_dict(fasta_input):
    """
    Index Fasta records by the first whitespace-delimited token of their
    header.  `fasta_input` may be a single filename (str) or a list of
    filenames; record names must be unique across all inputs.
    """
    if isinstance(fasta_input, str):
        filenames = [fasta_input]
    elif isinstance(fasta_input, list):
        filenames = fasta_input
    else:
        filenames = []
    records = {}
    for filename in filenames:
        for rec in FastaReader(filename):
            name = rec.name.strip().split()[0]
            assert name not in records
            records[name] = rec
    return records
def add_seqs_from_fasta(self, fasta_filename, smooth=True):
    """Add sequence ids from a fasta file."""
    with FastaReader(fasta_filename) as reader:
        ids = []
        for record in reader:
            # keep only the first whitespace-delimited token of the header
            ids.append(record.name.split()[0])
    self.add_ids_from_fasta(ids, smooth)
def __init__(self, transfrag_filename, fsm_maps, cov_threshold=2,
             min_aln_coverage=.99, min_aln_identity=.85, is_fq=False):
    """
    Hold per-transfrag lengths plus the coverage/identity thresholds used
    when turning GMAP alignments into GTF output.
    """
    self.contiVec = None  # current ContiVec object
    self.exons = None
    self.transfrag_filename = transfrag_filename
    # Map each transfrag id (first header token) to its sequence length,
    # reading Fastq or Fasta depending on the input format.
    reader_cls = FastqReader if is_fq else FastaReader
    self.transfrag_len_dict = dict(
        (r.name.split()[0], len(r.sequence))
        for r in reader_cls(transfrag_filename))
    self.fsm_maps = fsm_maps
    # only output GTF records if >= this many GMAP records support it
    # (this must be if I'm running non-clustered fasta on GMAP)
    self.cov_threshold = cov_threshold
    self.min_aln_coverage = min_aln_coverage
    self.min_aln_identity = min_aln_identity
    self.cuff_index = 1
def main(argv):
    """
    Filter contigs that are majority lowercase (proportion >= --filt) out
    of a quiver consensus Fasta, writing the survivors to the output file.
    """
    desc = 'A tool to trim quiver results for contigs majority lowercase'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('inputFile', help='input sequence')
    parser.add_argument('outputFile', help='output fasta')
    parser.add_argument(
        '--filt', default=0.5, dest='filt', type=float,
        help='proportion of lowercase bases a contig can have before '
             'being filtered out')
    # NOTE(review): the `argv` parameter is ignored; parse_args() always
    # reads sys.argv -- confirm whether callers expect main(argv) to parse
    # the list they pass in.
    args = parser.parse_args()
    writer = FastaWriter(args.outputFile)
    for record in FastaReader(args.inputFile):
        if not record.sequence:
            # skip empty records instead of dividing by zero
            continue
        lower = float(sum(1 for c in record.sequence if c.islower()))
        proportion = lower / float(len(record.sequence))
        # (removed stray debug `print pro` and the unused upper_output /
        # upper_indx locals from the original)
        if proportion < args.filt:
            writer.writeRecord(record)
def _write_assigned_reads(input_fasta, assignments):
    """
    Split the subreads in `input_fasta` into one Fasta file per amplicon
    group according to `assignments` (group -> set of read names).
    Returns the list of output filenames.
    """
    log.info("Separating subreads based on their amplicon assignments")
    root_name = '.'.join(input_fasta.split('.')[:-1])
    output_files = []
    writers = {}
    # One writer per assignment group, named <root>_<group>.fasta
    for group in assignments:
        filename = "%s_%s.fasta" % (root_name, group)
        output_files.append(filename)
        writers[group] = FastaWriter(filename)
    # Route each record to the first group that claims its name
    for record in FastaReader(input_fasta):
        read_name = record.name.split()[0]
        for group in assignments:
            if read_name in assignments[group]:
                writers[group].writeRecord(record)
                break
    for writer in writers.itervalues():
        writer.close()
    return output_files
def pbdagcon_wrapper(fasta_filename, output_prefix, consensus_name,
                     nproc=8, maxScore=-1000, min_seq_len=300):
    """
    Generate a consensus with pbdagcon:
    (1) Find the best seed as reference
    (2) Align the rest of the reads to the seed
    (3) Call pbdagcon

    On AlignGraphUtilError, fall back to writing the first input sequence
    out as the reference.
    """
    ref_filename = output_prefix + '_ref.fasta'
    try:
        ref = choose_template_by_blasr(fasta_filename, nproc=nproc,
                                       maxScore=maxScore)
        with open(ref_filename, 'w') as f:
            # Trailing newline keeps the Fasta record properly terminated
            # (consistent with the sibling pbdagcon_wrapper in this file).
            f.write(">{0}\n{1}\n".format(consensus_name, ref.sequence))
        # create alignment file
        aln_filename = make_aln_input_to_ref(fasta_filename, ref_filename,
                                             nproc=nproc)
        cons_filename = output_prefix + '.fasta'
        # call pbdagcon
        cmd = "pbdagcon -t 0 -m {minlen} -c 1 -j {nproc} {aln} > {out}".format(
            minlen=min_seq_len, nproc=nproc, aln=aln_filename,
            out=cons_filename)
        # Use subprocess.call so a non-zero exit status becomes an
        # AlignGraphUtilError caught below.  The original
        # `if subprocess.check_call(...)` branch was dead (check_call raises
        # CalledProcessError instead), and its Py2 three-expression raise
        # passed `cmd` where a traceback object is required.
        if subprocess.call(cmd, shell=True) != 0:
            raise AlignGraphUtilError("Cannot run command: {0}".format(cmd))
    except AlignGraphUtilError:
        # pick the first sequence as reference as a backup plan
        first_seq = FastaReader(fasta_filename).__iter__().next()
        with open(ref_filename, 'w') as f:
            f.write(">{0}_ref\n{1}\n".format(consensus_name,
                                             first_seq.sequence))
def _parse_exon_records(exon_file, output_type):
    """Load every exon record from `exon_file` as Fasta or Fastq,
    raising TypeError for any other format name."""
    readers = {'fasta': FastaReader, 'fastq': FastqReader}
    try:
        reader_cls = readers[output_type]
    except KeyError:
        msg = 'Exon data must be in either Fasta or Fastq format'
        log.error(msg)
        raise TypeError(msg)
    return list(reader_cls(exon_file))
def write_references(reference_file, references):
    """
    Write each named reference out to its own numbered Fasta file
    (reference_1.fasta, reference_2.fasta, ...).

    Reads `reference_file` once instead of once per reference (the
    original re-scanned the whole file for every entry in `references`).
    """
    # Map each wanted name to all of its positions in `references`, so a
    # name listed twice still produces both numbered files.
    positions = {}
    for i, ref in enumerate(references):
        positions.setdefault(ref, []).append(i)
    for record in FastaReader(reference_file):
        name = record.name.split()[0]
        for i in positions.get(name, ()):
            filename = 'reference_%s.fasta' % (i + 1)
            with FastaWriter(filename) as writer:
                writer.writeRecord(record)
def subset_references(reference_file, reference_names):
    """Copy the records whose first name token appears in
    `reference_names` into 'references.fasta'; return that filename."""
    output = 'references.fasta'
    with FastaWriter(output) as writer:
        for record in FastaReader(reference_file):
            if record.name.split()[0] in reference_names:
                writer.writeRecord(record)
    return output
def fasta_count(fasta_file):
    """
    Count the records with non-empty sequences in a Fasta file.
    Returns 0 if the file cannot be read or parsed.
    """
    count = 0
    try:
        for record in FastaReader(fasta_file):
            if len(record.sequence) > 0:
                count += 1
    # Deliberate best-effort, but catch Exception rather than the original
    # bare `except`, which also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        return 0
    return count
def fasta_length(fasta):
    """
    Return the maximum sequence length in a Fasta file, or 0 if the file
    cannot be read, parsed, or contains no records.
    """
    try:
        # The original try guarded only the constructor, so parse errors
        # during iteration and the ValueError max() raises on an empty
        # file escaped uncaught.
        return max(len(read.sequence) for read in FastaReader(fasta))
    except Exception:
        return 0
def extract_names(fasta):
    """
    Extract all of the names from a Fasta file
    (first whitespace-delimited token of each header).
    """
    return [record.name.split()[0] for record in FastaReader(fasta)]
def combine_fasta(fasta_files, destination):
    """
    Concatenate the records from several Fasta files into `destination`,
    skipping (with a warning) any file that cannot be parsed as Fasta.
    """
    with FastaWriter(destination) as handle:
        for fasta in fasta_files:
            try:
                for record in FastaReader(fasta):
                    handle.writeRecord(record)
            # Deliberate best-effort skip, but catch Exception rather than
            # the original bare `except`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            except Exception:
                log.warn('Could not open "%s" as Fasta' % fasta)
    check_output_file(destination)
def select_references(reference_file, refs):
    """
    Return the first pair (refs[0], refs[i]) whose HLA types differ in
    field1; implicitly returns None when no later ref differs.

    NOTE(review): `reference_file` is re-read once per entry in `refs`
    (O(n*m)).  `first`/`first_type` are only bound when some record
    matches refs[0]; if refs[0] matches nothing but a later ref does,
    the `first_type.field1` comparison raises NameError -- TODO confirm
    callers guarantee every ref is present in the file.
    """
    for i, ref in enumerate(refs):
        for record in FastaReader(reference_file):
            if record.name.startswith(ref):
                hla_type = HlaType.from_string(record.name)
                if i == 0:
                    # remember the first reference and its type
                    first = ref
                    first_type = hla_type
                elif first_type.field1 != hla_type.field1:
                    return (first, ref)
def _read_fasta_record(input_file):
    """
    Read a single FastaRecord, raising an error if a MultiFasta is found
    """
    records = list(FastaReader(input_file))
    if len(records) != 1:
        msg = 'expected a single Fasta, found MultiFasta!'
        log.error(msg)
        raise TypeError(msg)
    return records[0]
def read_names(sequence_file):
    """
    Return the first whitespace-delimited name token of every record in
    a Fasta or Fastq file.

    Raises ValueError if the file is neither format.
    """
    # Open the sequence file with the appropriate reader
    if is_fasta(sequence_file):
        reader = FastaReader(sequence_file)
    elif is_fastq(sequence_file):
        reader = FastqReader(sequence_file)
    else:
        # Include the offending filename instead of raising a bare,
        # message-less ValueError as the original did.
        raise ValueError('"%s" is neither Fasta nor Fastq' % sequence_file)
    # Extract and return the sequence names
    return [r.name.strip().split()[0] for r in reader]
def make_current_fasta(icec_obj, flnc_filename, root_dir):
    """
    Build the current fasta, consisting of all ids.

    However --- if this was an already finished run and we are adding more
    input, then newids is empty; in that case we set newids = everything
    that has no affiliation, or more than one affiliated cluster in d.
    """
    with FastaWriter(current_fasta(root_dir)) as writer:
        for record in FastaReader(flnc_filename):
            writer.writeRecord(record)
def create_fastaRecord(fasta):
    """Wrap each record of a Fasta file in a lightweight Contig object
    exposing `.id` and `.sequence`."""
    class Contig:
        def __init__(self, entry):
            self.id = entry.name
            self.sequence = entry.sequence
    return [Contig(entry) for entry in FastaReader(fasta)]
def _parse_reference_sequences(fofn_file):
    """
    Read reference sequence records from every Fasta listed in a FOFN file.

    Each non-comment line is expected to hold "<filename> <locus>"; the
    locus field is currently unused.  Blank lines are skipped.
    """
    log.info('Parsing reference sequence data...')
    records = []
    with open(fofn_file, 'r') as handle:
        for line in handle:
            line = line.strip()
            # Skip comments and blank lines -- a blank line would make the
            # two-field unpack below raise ValueError in the original.
            if not line or line.startswith('#'):
                continue
            filename, locus = line.split()
            records += list(FastaReader(filename))
    log.info("Found %s reference sequence records" % len(records))
    return records
def _parse_fasta_lengths(fasta_file):
    """
    Count the number of bases in each consensus sequence, keyed by the
    record name with any trailing '_cns' suffix removed.
    """
    def strip_cns(name):
        # consensus names carry a '_cns' suffix; drop it for the key
        return name[:-4] if name.endswith('_cns') else name
    return dict((strip_cns(record.name), len(record.sequence))
                for record in FastaReader(fasta_file))
def trim_fasta(fasta_file, blasr_file, output_file, locus_dict,
               window=WINDOW, loci=LOCI):
    """
    Trim the records in `fasta_file` using trim windows parsed from a
    Blasr alignment file, keeping only trims on the requested loci, and
    write the trimmed records to `output_file`.
    """
    log.info('Trimming sequences in "%s"' % fasta_file)
    log.debug("\tWindow Size:\t%s" % window)
    records = list(FastaReader(fasta_file))
    trims = parse_trims(blasr_file, window)
    trims = filter_trims_on_loci(trims, locus_dict, loci)
    trimmed_records = apply_trims(records, trims)
    write_fasta(trimmed_records, output_file)
    # fixed typo in the log message ("sequencs" -> "sequences")
    log.info('Finished trimming the supplied sequences\n')
    return
def output_read_count_RoI(cid_info, roi_filename, output_filename):
    """
    Write a per-read report for Reads of Insert: id, length, full-length
    flag, mapping status, and assigned pbid (tab-separated, with header).

    NOTE(review): membership is tested with `r.id` but the dict lookup
    uses `r.name` -- if those attributes differ (e.g. first token vs full
    header) the lookup can raise KeyError; confirm which attribute keys
    cid_info and make the two consistent.
    """
    f = open(output_filename, 'w')
    f.write("id\tlength\tis_fl\tstat\tpbid\n")
    for r in FastaReader(roi_filename):
        if r.id in cid_info:
            # read was clustered: report its cluster id
            pbid, stat = cid_info[r.name], 'unique'
        else:
            pbid, stat = 'NA', 'unmapped'
        f.write("{id}\t{len}\t{is_fl}\t{stat}\t{pbid}\n".format(\
            id=r.name, len=get_roi_len(r.name), is_fl='Y', stat=stat, pbid=pbid))
    f.close()
def _parse_white_list(white_list):
    """Yield the ZMW id (movie/holenumber) of every entry in a white list,
    which may be a Fasta file (.fasta/.fa) or a text file (.txt/.ids)."""
    def to_zmw(read_name):
        # a ZMW id is the first two '/'-separated fields of a read name
        return '/'.join(read_name.split('/')[:2])
    if white_list.endswith('.fasta') or white_list.endswith('.fa'):
        for record in FastaReader(white_list):
            yield to_zmw(record.name.split()[0])
    elif white_list.endswith('.txt') or white_list.endswith('.ids'):
        with open(white_list) as handle:
            for line in handle:
                yield to_zmw(line.strip().split()[0])
def fasta_size(fasta):
    """
    Count the number of sequences in a Fasta.
    Returns None if the file cannot be read or parsed.
    """
    try:
        return sum(1 for _ in FastaReader(fasta))
    # Deliberate best-effort, but catch Exception rather than the original
    # bare `except`, which also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        return None
def extract_sequence(fasta, names):
    """
    Pull sequences from a Fasta file by name.

    If `names` is a single string, return that record's sequence (None if
    absent); if it is a list, return the list of matching records.
    """
    reader = FastaReader(fasta)
    if isinstance(names, str):
        for record in reader:
            if record.name == names:
                return record.sequence
    elif isinstance(names, list):
        # Use a set for O(1) membership tests instead of scanning the
        # list once per record.
        wanted = set(names)
        return [record for record in reader if record.name in wanted]
def read_sequences(sequence_file):
    """
    Parse a list of records from either a Fasta or Fastq file
    """
    if is_fasta(sequence_file):
        reader = FastaReader(sequence_file)
    elif is_fastq(sequence_file):
        reader = FastqReader(sequence_file)
    else:
        msg = 'Sequence file must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
    return list(reader)
def _processPrimers(self, primer_fn_forward, primer_fn_reverse, window_size,
                    primer_out_fn, revcmp_primers=False):
    """
    Do basic sanity checks that:
    (1) all primers in forward start with f_xxx and are unique
    (2) all primers in reverse start with r_xxx and are unique
    (3) check that no forward primers appear in reverse primers (no symmetry)
    (4) write the primers (f_xxx, f_xxx_revcmp, r_xxx, r_xxx_revcmp) all
        to one primer file

    Returns the list of primer names (forward + reverse).
    """
    def sanity_check_primers(reader, prefix):
        """
        Go through the primers, check that the prefix exists and all seqs
        are unique (compared case-insensitively).
        """
        primers = {}       # primer name -> sequence as given
        seen_seqs = set()  # uppercased sequences, for duplicate detection
        for r in reader:
            if not r.name.startswith(prefix):
                # Report the prefix actually being checked (the original
                # hard-coded "f_" for both directions) and build a real
                # string (the original accidentally created a tuple here).
                errMsg = "Primer should start with {p}, but saw: {n}".format(
                    p=prefix, n=r.name)
                raise ClassifierException(errMsg)
            if len(r.sequence) > window_size:
                errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                    format(n=r.name, l=len(r.sequence), k=window_size)
                logging.error(errMsg)
                raise ClassifierException(errMsg)
            ss = r.sequence.upper()
            # Compare uppercased against uppercased: the original tested
            # `ss in primers.itervalues()` against the raw stored values,
            # missing duplicates that differ only in case, and built the
            # error message as a tuple.
            if ss in seen_seqs:
                errMsg = "Duplicate sequences found for {s}".format(s=ss)
                raise ClassifierException(errMsg)
            seen_seqs.add(ss)
            primers[r.name.strip()] = r.sequence
            # revcmp not needed becuz phmmer does both strands apparently...
        return primers

    logging.info("Process primers for {case}.".format(
        case=("finding primers" if not revcmp_primers
              else "detecting chimeras")))
    reader_f = FastaReader(primer_fn_forward)
    reader_r = FastaReader(primer_fn_reverse)
    primers_f = sanity_check_primers(reader_f, prefix="f_")
    primers_r = sanity_check_primers(reader_r, prefix="r_")
    reader_f.close()
    reader_r.close()
    # Forward and reverse primer sets must not share any sequence.
    same_seqs = set(primers_f.values()).intersection(primers_r.values())
    if len(same_seqs) > 0:
        errMsg = "Identical sequences found in both Forward/Reverse!\n"
        errMsg += "\n".join(same_seqs)
        raise ClassifierException(errMsg)
    # Write Fi and reverse-complemented Ri to primer_out_fn
    with open(primer_out_fn, 'w') as f:
        for (name, seq) in primers_f.iteritems():
            f.write(">{n}\n{s}\n".format(n=name, s=seq))
        for (name, seq) in primers_r.iteritems():
            f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))
    return primers_f.keys() + primers_r.keys()
def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                    revcmp_primers=False):
    """
    Check and generate primers.
    1. Check primers in primer_fn are in order F0, R0, F1, R1, ..., Fn, Rn,
       and lengths are all < k, where k is the primer search window length.
       F0  5' NNNNNNNNNN 3'
       R0  3' NNNNNNNNNN 5'
    2. If Ri and Fi are reverse complementarily identical, add a polyA
       tail to 3' of Ri.
    3. For each combo of primers Fi and Ri, save the following to
       primer_out_fn.
       3.1 If revcmp_primers is False,
           >Fi
           Fi_sequence
           >Ri
           revcmp(Ri_sequence)
       3.2 If revcmp_primers is True,
           >Fi
           Fi_sequence
           >Ri
           Ri_sequence
           >Fi_revcmp
           revcmp(Fi_sequence)
           >Ri_revcmp
           revcmp(Ri_sequence)
    4. return primers range(0, n)
    """
    logging.info("Process primers for {case}.".
                 format(case=("finding primers" if not revcmp_primers
                              else "detecting chimeras")))
    freader = FastaReader(primer_fn)
    primers = []        # (name, sequence) pairs to write to primer_out_fn
    primerComboId = -1  # index of the current F/R primer pair
    for i, r in enumerate(freader):
        # Even records are forward (Fi) primers, odd records reverse (Ri);
        # each forward primer starts a new combo.
        if i % 2 == 0:
            direction = "F"
            primerComboId += 1
        else:
            direction = "R"
        expectedName = "{d}{n}".format(d=direction, n=primerComboId)
        if r.name != expectedName:
            errMsg = "Primers should be placed in order F0, R0, F1, R1..."
            logging.error(errMsg)
            raise ClassifierException(errMsg)
        if len(r.sequence) > window_size:
            # A primer longer than the search window could never be found.
            errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                format(n=expectedName, l=len(r.sequence), k=window_size)
            logging.error(errMsg)
            raise ClassifierException(errMsg)
        if direction == "F":
            # Save >Fi and Fi_sequence.
            primers.append([expectedName, r.sequence])
        else:  # direction is "R"
            # fwdF/fwdR is the forward sequence of Fi/Ri
            fwdF, fwdR = primers[-1][1], r.sequence
            # revcmpF/revcmpR is the reverse complement of Fi/Ri
            revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)
            # If Fi and Ri are reverse complementariliy identical, bail out,
            # because we need Poly A tail to distinguish Fi and Ri.
            if fwdF.find(revcmpR) >= 0 or revcmpR.find(fwdF) >= 0:
                infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                          "are reverse complementarily identical. " + \
                          "Need to add 'AAAA' to 3' to distinguish them."
                logging.info(infoMsg)
                if revcmp_primers is False:
                    # Save primer Ri and revcmp(Ri_sequence) + TTTT
                    primers.append([expectedName, revcmpR + "T" * 4])
                else:  # revcmp_primers is True
                    # Save Ri with a 5' AAAA pad plus both revcmp entries.
                    primers.append([expectedName, "A" * 4 + fwdR])
                    primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                    revcmpF])
                    primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                    revcmpR + "T" * 4])
            else:  # Ri and Fi are not revcmp identical
                if revcmp_primers is False:
                    # Save >Ri and revcmp(Ri_sequence)
                    primers.append([expectedName, revcmpR])
                else:
                    # Save >Ri and Ri_sequence
                    primers.append([expectedName, fwdR])
                    # Save >Fi_revcmp and revcmp(Fi_sequence)
                    primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                    revcmpF])
                    # Save >Ri_revcmp and revcmp(Ri_sequence)
                    primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                    revcmpR])
    freader.close()
    # Write Fi and reverse-complemented Ri to primer_out_fn
    f = open(primer_out_fn, 'w')
    for (name, seq) in primers:
        f.write(">{n}\n{s}\n".format(n=name, s=seq))
    f.close()
    # One entry per F/R combo encountered.
    return range(0, primerComboId + 1)