def mergeMSAs(msa1, msa2, full_ref):
    """
    Given two MultipleSeqAlignment objects sharing a first (reference)
    sequence, merge them on the reference.

    Returns a MultipleSeqAlignment containing all the sequences from each
    alignment, in the alignment induced by the shared reference sequence. Also
    needs access to the full reference SeqRecord in case it needs bases to
    fill in a gap.

    The first sequence may actually be only a subrange in either MSA, and
    either MSA may be on either strand of it.

    Either MSA may be None, in which case the other MSA is returned.

    >>> ref = SeqRecord(Seq("ATATATATGCATATATAT"), "first")
    >>> ref.annotations = {"strand": 1, "start": 0, "size": 18, "srcSize": 18}
    >>> ref1 = SeqRecord(Seq("AT-ATATAT"), "first")
    >>> ref1.annotations = {"strand": 1, "start": 0, "size": 8, "srcSize": 18}
    >>> alt1 = SeqRecord(Seq("ATAATATAT"), "second")
    >>> alt1.annotations = {"strand": -1, "start": 0, "size": 9, "srcSize": 9}
    >>> ref2 = SeqRecord(Seq("ATATATAT--"), "first")
    >>> ref2.annotations = {"strand": -1, "start": 0, "size": 8, "srcSize": 18}
    >>> alt2 = SeqRecord(Seq("ATATGG--AT"), "third")
    >>> alt2.annotations = {"strand": 1, "start": 0, "size": 8, "srcSize": 8}
    >>> msa1 = Align.MultipleSeqAlignment([ref1, alt1])
    >>> msa2 = Align.MultipleSeqAlignment([ref2, alt2])
    >>> merged = mergeMSAs(msa1, msa2, ref)
    >>> print(merged)
    Alphabet() alignment with 3 rows and 21 columns
    AT-ATATATGC--ATATATAT first
    ATAATATAT------------ second
    -----------AT--CCATAT third
    >>> pprint.pprint(merged[0].annotations)
    {'size': 18, 'srcSize': 18, 'start': 0, 'strand': 1}
    >>> pprint.pprint(merged[1].annotations)
    {'size': 9, 'srcSize': 9, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged[2].annotations)
    {'size': 8, 'srcSize': 8, 'start': 0, 'strand': -1}

    >>> ref3 = SeqRecord(Seq("ATGCAT"), "first")
    >>> ref3.annotations = {"strand": 1, "start": 6, "size": 6, "srcSize": 18}
    >>> alt3 = SeqRecord(Seq("ATCCAT"), "fourth")
    >>> alt3.annotations = {"strand": 1, "start": 5, "size": 6, "srcSize": 15}
    >>> msa3 = Align.MultipleSeqAlignment([ref3, alt3])
    >>> merged2 = mergeMSAs(merged, msa3, ref)
    >>> print(merged2)
    Alphabet() alignment with 4 rows and 21 columns
    AT-ATATATGC--ATATATAT first
    ATAATATAT------------ second
    -----------AT--CCATAT third
    -------ATCC--AT------ fourth
    >>> pprint.pprint(merged2[0].annotations)
    {'size': 18, 'srcSize': 18, 'start': 0, 'strand': 1}
    >>> pprint.pprint(merged2[1].annotations)
    {'size': 9, 'srcSize': 9, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged2[2].annotations)
    {'size': 8, 'srcSize': 8, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged2[3].annotations)
    {'size': 6, 'srcSize': 15, 'start': 5, 'strand': 1}

    """

    if msa1 is None:
        # No merging to do.
        return msa2
    if msa2 is None:
        # No merging to do this way either.
        return msa1

    if msa1[0].annotations["strand"] == -1:
        # MSA 1 needs to be on the + strand of the reference
        msa1 = reverse_msa(msa1)
    if msa2[0].annotations["strand"] == -1:
        # MSA 2 also needs to be on the + strand of the reference
        msa2 = reverse_msa(msa2)

    if msa2[0].annotations["start"] < msa1[0].annotations["start"]:
        # msa2 starts before msa1. We want msa1 to start first, so we need to
        # flip them.
        msa1, msa2 = msa2, msa1

    logging.debug("Zipping {}bp/{} sequence and {}bp/{} sequence reference "
                  "alignments".format(msa1[0].annotations["size"], len(msa1),
                                      msa2[0].annotations["size"], len(msa2)))

    # Make sure we are joining on the right sequence.
    assert msa1[0].id == msa2[0].id

    logging.debug("Merging")
    logging.debug(msa1)
    logging.debug(msa1[0].annotations)
    logging.debug(msa2)
    logging.debug(msa2[0].annotations)

    # Compute the offset: number of extra reference columns that msa2 needs in
    # front of it. This will always be positive or 0.
    msa2_leading_offset = (msa2[0].annotations["start"] -
                           msa1[0].annotations["start"])

    logging.debug("{}bp between left and right alignment starts".format(
        msa2_leading_offset))

    # It would be nice if we could shortcut by adjoining compatible
    # alignments, but the IDs wouldn't match up at all.

    # Make lists for each sequence we are going to build: those in msa1, and
    # then those in msa2 (except the duplicate reference).
    merged = [list() for i in xrange(len(msa1) + len(msa2) - 1)]

    # Start at the beginning of both alignments.
    msa1Pos = 0
    msa2Pos = 0

    # How many reference characters have been used?
    refChars = 0

    while refChars < msa2_leading_offset and msa1Pos < len(msa1[0]):
        # Until we're to the point that MSA 2 might have anything to say, we
        # just copy MSA 1.
        for i, character in enumerate(msa1[:, msa1Pos]):
            # For each character in the first alignment in this column, put
            # that character as the character for the appropriate sequence.
            merged[i].append(character)
        for i in xrange(len(msa1), len(msa1) + len(msa2) - 1):
            # For each of the alignment rows that come from msa2, put a gap.
            merged[i].append("-")
        if msa1[0, msa1Pos] != "-":
            # We consumed a reference character.
            refChars += 1
        # We used some of MSA1
        msa1Pos += 1

    logging.debug("Used {}/{} offset".format(refChars, msa2_leading_offset))

    while refChars < msa2_leading_offset:
        # We have a gap between the first MSA and the second, and we need to
        # fill it with reference sequence.
        # We know we are refChars after the beginning of the first reference,
        # so we use that to know what base to put here.
        merged[0].append(full_ref[msa1[0].annotations["start"] + refChars])
        for i in xrange(1, len(msa1) + len(msa2) - 1):
            # And gap out all the other sequences.
            merged[i].append("-")
        # We consumed (or made up) a reference character
        refChars += 1

    while msa1Pos < len(msa1[0]) and msa2Pos < len(msa2[0]):
        # Until we hit the end of both sequences
        if refChars % 10000 == 0:
            logging.debug("Now at {} in alignment 1, {} in alignment 2, {} in "
                          "reference".format(msa1Pos, msa2Pos, refChars))
        if msa1[0, msa1Pos] == "-":
            # We have a gap in the first reference. Put this column from the
            # first alignment alongside a gap for every sequence in the second
            # alignment.
            for i, character in enumerate(msa1[:, msa1Pos]):
                # For each character in the first alignment in this column, put
                # that character as the character for the appropriate sequence.
                merged[i].append(character)
            for i in xrange(len(msa1), len(msa1) + len(msa2) - 1):
                # For each of the alignment rows that come from msa2, put a gap.
                merged[i].append("-")
            # Advance in msa1. We'll keep doing this until it doesn't have a
            # gap in its reference.
            msa1Pos += 1
        elif msa2[0, msa2Pos] == "-":
            # We have a letter in the first reference but a gap in the second.
            # Gap out the merged reference and all the columns from alignment
            # 1, and take the non-reference characters from alignment 2.
            for i in xrange(len(msa1)):
                # For the reference and all the sequences in msa1, add gaps
                merged[i].append("-")
            for i, character in zip(
                    xrange(len(msa1), len(msa1) + len(msa2) - 1),
                    msa2[1:, msa2Pos]):
                # For each of the alignment rows that come from msa2, put the
                # character from that row.
                merged[i].append(character)
            # Advance in msa2. We'll keep doing this until both msa1 and msa2
            # have a non-gap character in their references. We make it an
            # invariant that this will always be the same character.
            msa2Pos += 1
        else:
            # Neither has a gap. They both have real characters.
            if msa1[0, msa1Pos] != msa2[0, msa2Pos]:
                logging.error(msa1)
                logging.error(msa2)
                raise RuntimeError("{} in reference 1 does not match {} "
                                   "in reference 2".format(
                                       msa1[0, msa1Pos], msa2[0, msa2Pos]))
            for i, character in enumerate(msa1[:, msa1Pos]):
                # Copy all the characters from msa1's column
                merged[i].append(character)
            for character, i in zip(
                    msa2[1:, msa2Pos],
                    xrange(len(msa1), len(msa1) + len(msa2) - 1)):
                # Copy all the characters from msa2's column, except its
                # reference
                merged[i].append(character)
            # Advance both alignments
            msa1Pos += 1
            msa2Pos += 1
            # Say we used a reference character
            refChars += 1

        for otherMerged in merged[1:]:
            # Make sure we aren't dropping characters anywhere.
            assert len(otherMerged) == len(merged[0])

    logging.debug("At {}/{} of msa2, {}/{} of msa1".format(
        msa2Pos, len(msa2[0]), msa1Pos, len(msa1[0])))

    # By here, we must have finished one of the MSAs. Only one can have
    # anything left.
    assert msa1Pos == len(msa1[0]) or msa2Pos == len(msa2[0])

    while msa1Pos < len(msa1[0]):
        # MSA2 finished first and now we have to finish up with the tail end
        # of MSA1
        for i, character in enumerate(msa1[:, msa1Pos]):
            # For each character in the first alignment in this column, put
            # that character as the character for the appropriate sequence.
            merged[i].append(character)
        for i in xrange(len(msa1), len(msa1) + len(msa2) - 1):
            # For each of the alignment rows that come from msa2, put a gap.
            merged[i].append("-")
        # Advance in msa1, until we finish it.
        msa1Pos += 1

    while msa2Pos < len(msa2[0]):
        # MSA1 finished first and now we have to finish up with the tail end
        # of MSA2

        # For the reference, put whatever it has in MSA2
        merged[0].append(msa2[0][msa2Pos])
        for i in xrange(1, len(msa1)):
            # For all the sequences in msa1, add gaps
            merged[i].append("-")
        for i, character in zip(xrange(len(msa1), len(msa1) + len(msa2) - 1),
                                msa2[1:, msa2Pos]):
            # For each of the alignment rows that come from msa2, put the
            # character from that row.
            merged[i].append(character)
        # Advance in msa2, until we finish it.
        msa2Pos += 1

    # Now we have finished populating these aligned lists. We need to make a
    # MultipleSeqAlignment from them.

    # What names do the sequences in this alignment have? All the ones from
    # msa1, and then all the ones from msa2 except the first (which is the
    # reference).
    seqNames = [record.id for record in msa1] + \
        [record.id for record in msa2[1:]]

    # Make a SeqRecord for each list of properly gapped-out characters, with
    # the appropriate name.
    seqRecords = [
        SeqRecord(Seq("".join(alignedList)), name)
        for alignedList, name in zip(merged, seqNames)
    ]

    # Make the records into a proper MSA
    merged = Align.MultipleSeqAlignment(seqRecords)

    # Do the annotations for the reference
    merged[0].annotations.update(msa1[0].annotations)
    # Calculate the total reference bases used. It will be the distance
    # between the rightmost alignment end and the start of msa1, along the
    # reference.
    merged[0].annotations["size"] = (
        max(msa2[0].annotations["start"] + msa2[0].annotations["size"],
            msa1[0].annotations["start"] + msa1[0].annotations["size"]) -
        msa1[0].annotations["start"])

    for i in xrange(1, len(msa1)):
        # Copy over annotations from MSA1
        merged[i].annotations.update(msa1[i].annotations)
    for i in xrange(len(msa1), len(msa1) + len(msa2) - 1):
        # Copy over annotations from MSA2, starting after the reference.
        merged[i].annotations.update(msa2[i - len(msa1) + 1].annotations)

    # The merged result reference needs to be at least as long as each input
    # reference.
    #assert(len(merged[0]) >= len(msa1[0]))
    #assert(len(merged[0]) >= len(msa2[0]))

    # Give back the merged MSA
    return merged
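# mergeMSAs relies on a reverse_msa helper that is not defined in this file.
# A minimal sketch of what it needs to do, assuming the MAF-style "start",
# "size", "srcSize", and "strand" annotations used above; the real helper may
# differ in its details.
def reverse_msa(msa):
    flipped = []
    for record in msa:
        # Reverse-complement the row but keep its identifiers.
        rev = record.reverse_complement(id=True, name=True, description=True)
        rev.annotations = dict(record.annotations)
        rev.annotations["strand"] = -record.annotations["strand"]
        # Recompute the start coordinate on the opposite strand, MAF-style.
        rev.annotations["start"] = (record.annotations["srcSize"] -
                                    record.annotations["start"] -
                                    record.annotations["size"])
        flipped.append(rev)
    return Align.MultipleSeqAlignment(flipped)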
def __next__(self):
    try:
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.

    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == '# STOCKHOLM 1.0':
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier "
                                 "and sequence:\n" + line)
            id, seq = parts
            if id not in ids:
                ids.append(id)
            seqs.setdefault(id, '')
            seqs[id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            if line[:5] == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                if feature not in gf:
                    gf[feature] = [text]
                else:
                    gf[feature].append(text)
            elif line[:5] == '#=GC ':
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif line[:5] == '#=GS ':
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                id, feature, text = line[5:].strip().split(None, 2)
                # if id not in ids:
                #     ids.append(id)
                if id not in gs:
                    gs[id] = {}
                if feature not in gs[id]:
                    gs[id][feature] = [text]
                else:
                    gs[id][feature].append(text)
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                id, feature, text = line[5:].strip().split(None, 2)
                # if id not in ids:
                #     ids.append(id)
                if id not in gr:
                    gr[id] = {}
                if feature not in gr[id]:
                    gr[id][feature] = ""
                gr[id][feature] += text.strip()  # append to any previous entry
                # TODO - Should we check the length matches the alignment
                # length? For interlaced sequences the GR data can be split
                # over multiple lines
        # Next line...

    assert len(seqs) <= len(ids)
    # assert len(gs) <= len(ids)
    # assert len(gr) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(ids), self.records_per_alignment))

        alignment_length = len(list(seqs.values())[0])
        records = []  # Alignment obj will put them all in a list anyway
        for id in ids:
            seq = seqs[id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier")
            name, start, end = self._identifier_split(id)
            record = SeqRecord(Seq(seq, self.alphabet),
                               id=id, name=name, description=id,
                               annotations={"accession": name})
            # Accession will be overridden by _populate_meta_data if an
            # explicit accession is provided:
            record.annotations["accession"] = name

            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(id, record)
            records.append(record)
        alignment = MultipleSeqAlignment(records, self.alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment
    else:
        raise StopIteration
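# This iterator mirrors Biopython's Stockholm AlignIO support, so the same
# behavior can be exercised through Bio.AlignIO; a small self-contained sketch:
from io import StringIO

from Bio import AlignIO

stockholm_text = (
    "# STOCKHOLM 1.0\n"
    "#=GF ID  demo\n"
    "seq_one  ACGU..AUG\n"  # "." gap characters come back as "-"
    "seq_two  ACGUUCAUG\n"
    "//\n")
alignment = AlignIO.read(StringIO(stockholm_text), "stockholm")
print(alignment)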
def cal_tm_bond(tri_seq, temp_seq, C_Na, C_Mg, C_Strand):
    # Tm of tri_seq hybridized to temp_seq, and of temp_seq against itself.
    tm1 = TmDeltaG.calTm(tri_seq, temp_seq, C_Na, C_Mg, C_Strand, 0.00008)
    tm2 = TmDeltaG.calTm(temp_seq, temp_seq, C_Na, C_Mg, C_Strand, 0.00008)
    # First whitespace-separated field of the SecStructures text report
    # (its dimer figure).
    bond = str(SecStructures_jf4.SecStructures(
        SeqRecord(Seq(tri_seq)), SeqRecord(Seq(temp_seq)))).split()[0]
    return [tm1, tm2, bond]
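# Usage sketch for cal_tm_bond: TmDeltaG and SecStructures_jf4 are
# project-local modules not shown here, and the Na+/Mg2+/strand concentration
# values below are made-up illustrations in whatever units TmDeltaG.calTm
# expects.
tm_primer, tm_template, bond = cal_tm_bond(
    "GATCTGATGCATG", "CATGCATCAGATC", 0.05, 0.0015, 0.00000025)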
def adapter_find(reference_database, reads, threads, max_intron_length,
                 working_dir, verbose):
    subset_fasta = reads + "subset.10000.fasta"
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads,
                          max_intron_length, working_dir, verbose)
    #soft_clip_regions = soft_clip(bam)
    fasta_gz = bam + ".fasta"
    cmd = "extractSoftclipped %s | zcat | fastqToFa /dev/stdin %s" % (bam, fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    list_short = []
    list_long = []
    dict_uniq = {}
    with open(fasta_gz, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name_seq = str(rec.id)
            name = name_seq.split("_")[0]
            if name in dict_uniq:
                # Keep the longer and shorter clipped sequence per read apart.
                if len(dict_uniq[name].seq) > len(rec.seq):
                    list_long.append(dict_uniq[name])
                    list_short.append(rec)
                else:
                    list_short.append(dict_uniq[name])
                    list_long.append(rec)
            else:
                dict_uniq[name] = rec

    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")

    list_file_clip = [(long_file, "long"), (short_file, "short")]
    for clip_file in list_file_clip:
        kmer_start = 21
        list_kmer = []
        while kmer_start < 120:
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (
                kmer_start, kmer_start, clip_file[1], clip_file[0])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()
            cmd = "jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | tail -n 1" % (
                kmer_start, clip_file[1])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd, cwd=working_dir,
                                          stdout=subprocess.PIPE, shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]
            a_count = mer.count("A")
            t_count = mer.count("T")
            if a_count > t_count:
                bias_count = a_count
            else:
                bias_count = t_count
            # Score each k-mer by how strongly it is A/T-biased relative to
            # its GC content; poly-A/poly-T adapters score highest.
            data_kmer = (kmer_start, mer, GC(mer),
                         (bias_count / kmer_start) * 100,
                         (bias_count / kmer_start) * 100 - GC(mer))
            list_kmer.append(data_kmer)
            kmer_start += 5

    value_adapter = 0
    for i in list_kmer:
        if i[4] > value_adapter:
            value_adapter = i[4]
            kmer_done = i[1]

    adapter_file = os.path.join(working_dir, "adapter.fasta")
    if value_adapter > 0:
        with open(adapter_file, "w") as fh:
            record = SeqRecord(Seq(str(kmer_done)), id="adapter")
            SeqIO.write(record, fh, "fasta")
    # Note: if no A/T-biased k-mer was found, the returned path was never
    # written.
    return adapter_file
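# Usage sketch for adapter_find: it shells out to minimap2 (via
# mapping.minimap), extractSoftclipped, fastqToFa, and jellyfish, so all of
# those must be on PATH, and the read IDs in the input FASTA must be numeric
# for the int(rec.id) < 10000 subsetting. All paths below are placeholders.
adapter_fasta = adapter_find("genome.fasta", "reads.fasta", threads=4,
                             max_intron_length=200000,
                             working_dir="/tmp/adapter_work", verbose=True)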
def __generate_sequence_profiles_old():
    mtx_dir_name = 'pssm_deltablast'
    DB_INDEX = SeqIO.index('data/scop40_structural_alignment.fasta', 'fasta')
    records = {}
    for i in DB_INDEX:
        domkey = i.split('&')[0]
        records[domkey] = SeqRecord(DB_INDEX[i].seq.ungap('-'),
                                    id=domkey, name='', description='')
    with Path('data/scop40_scopdom_pdbatom_seq.fasta').open('w') as f:
        SeqIO.write(records.values(), f, 'fasta')
    DB_INDEX = SeqIO.index('data/scop40_scopdom_pdbatom_seq.fasta', 'fasta')

    for sid in tqdm(list(DB_INDEX)):
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            SeqIO.write(DB_INDEX[sid], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta', db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()
    logging.info('')

    for sid in tqdm(
            pickle.load(Path('data/one_domain_superfamily.pkl').open('rb'))):
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            tmalign = TMalignCommandLine(f'data/scop_e/{sid[2:4]}/{sid}.ent',
                                         f'data/scop_e/{sid[2:4]}/{sid}.ent')
            tmalign.run()
            assert str(tmalign.alignment[0].seq).find('-') == -1
            SeqIO.write(tmalign.alignment[0], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta', db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            logging.error(f'sid={sid}')
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()
# python3
import argparse

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dnachisel import *

# input parameters
ap = argparse.ArgumentParser()
ap.add_argument("-fa", "--fasta", required=True,
                help="input single or multi fasta file")
ap.add_argument("-org", "--organism", required=True,
                help="organism to input (use either the names of the genomes "
                     "available on dnachisel or use the taxid of the organisms "
                     "in http://www.kazusa.or.jp/codon/)")
ap.add_argument("-opt", "--optimized", required=True,
                help="optimized fasta file")
args = vars(ap.parse_args())

# main
optimized_seqs = []  # setup an empty list
for record in SeqIO.parse(args['fasta'], "fasta"):
    problem = DnaOptimizationProblem(
        sequence=str(record.seq),
        constraints=[EnforceTranslation()],
        objectives=[CodonOptimize(species=args['organism'])])
    problem.optimize()
    # add this record to the list
    optimized_seqs.append(SeqRecord(Seq(problem.sequence),
                                    id=record.id, description=""))
# export to fasta
SeqIO.write(optimized_seqs, args['optimized'], "fasta")
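# The same optimization can be run without the CLI wrapper; a minimal
# single-sequence sketch (the species name is an illustration, and the
# sequence must be a whole number of codons for EnforceTranslation):
from dnachisel import DnaOptimizationProblem, EnforceTranslation, CodonOptimize

problem = DnaOptimizationProblem(
    sequence="ATGGCTGCTGCTTAA",                   # toy ORF
    constraints=[EnforceTranslation()],           # keep the protein unchanged
    objectives=[CodonOptimize(species="e_coli")])
problem.optimize()
print(problem.sequence)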
def get_structure_seqrecords(model):
    """Get a list of SeqRecords for a PDB file's chain sequences.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B",
          both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords

    """
    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:
        tracker = 0
        chain_seq = ''
        chain_resnums = []

        # Loop over the residues
        for res in chain.get_residues():
            # NOTE: you can get the residue number too
            res_id = res.id
            res_num = res_id[1]
            res_icode = res_id[2]

            # Double check if the residue name is a standard residue.
            # If it is not a standard residue (ie. selenomethionine),
            # it will be filled in with an X on the next iteration.
            if Polypeptide.is_aa(res, standard=True):
                end_tracker = res_num
                res_aa_one = Polypeptide.three_to_one(res.get_resname())

                # Tracker to fill in X's
                if end_tracker != (tracker + 1):
                    if res_icode != ' ':
                        chain_seq += res_aa_one
                        chain_resnums.append(res_num)
                        tracker = end_tracker + 1
                        continue
                    else:
                        multiplier = (end_tracker - tracker - 1)
                        chain_seq += 'X' * multiplier
                        # Residue numbers for unresolved or nonstandard
                        # residues are Infinite
                        chain_resnums.extend([float("Inf")] * multiplier)

                chain_seq += res_aa_one
                chain_resnums.append(res_num)
                tracker = end_tracker
            else:
                continue

        chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein),
                                     id=chain.get_id())
        chain_seq_record.letter_annotations['structure_resnums'] = chain_resnums
        structure_seq_records.append(chain_seq_record)

    return structure_seq_records
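# Usage sketch for get_structure_seqrecords with Bio.PDB; the PDB file name
# is a placeholder.
from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure("9LPR", "9lpr.pdb")
model = structure[0]  # first model
for rec in get_structure_seqrecords(model):
    print(rec.id, rec.seq)
    print(rec.letter_annotations["structure_resnums"])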
    part_list = []
    for kid in data.kids[rec.id]:
        for sub in data.ids[kid]:
            #assert sub.strand == rec.strand
            if sub.type == required_type:
                part_list.append(sub)

    #sort into order by start bp
    part_list.sort(key=lambda sub: sub.start)

    #assemble exons
    seq = ''
    for sub in part_list:
        seq += str(seq_dict[rec.seqid].seq[sub.start:sub.end])

    #ignore if length zero
    if len(seq) == 0:
        continue
    if seq.upper().count('N') == len(seq):
        continue

    seq = Seq(seq)
    if rec.strand == '-':
        seq = seq.reverse_complement()
    if conf.protein:
        seq = translate(seq)
        if len(seq) == 0:
            continue

    newrec = SeqRecord(seq, id=rec.id, description='')
    #write out to file
    SeqIO.write(newrec, fout, "fasta")

if conf.out != 'STDOUT':
    fout.close()
if args.subsample < len(alignments):
    alignments = random.sample(alignments, args.subsample)
else:
    raise Exception('Number to subsample must be smaller than number of '
                    'loci available!')

#now make output
try:
    os.makedirs(outname)
except:
    pass

if args.complete:
    #add missing species to each alignment
    #first let's get a list of all taxa in each alignment
    all_taxa = []
    for alignment in alignments:
        all_taxa.extend([record.id for record in alignment])
    all_taxa = set(all_taxa)
    for alignment in alignments:
        this_taxa = set([record.id for record in alignment])
        missing_taxa = all_taxa - this_taxa
        al_len = alignment.get_alignment_length()
        if missing_taxa:
            sys.stderr.write('Adding ' + str(len(missing_taxa)) +
                             ' missing taxa\n')
            seqrecs = [SeqRecord(Seq('N' * al_len, IUPACAmbiguousDNA()), id=tx)
                       for tx in missing_taxa]
            seqs_to_add = MultipleSeqAlignment(seqrecs)
            alignment.extend(seqs_to_add)

for i, alignment in enumerate(alignments):
    AlignIO.write(alignment, outname + '/' + outname + '_' + str(i) + ".nex",
                  "nexus")
def build_target_info(base_dir, info, all_index_locations,
                      defer_HA_identification=False, offtargets=False):
    ''' info should have keys:
            sgRNA_sequence
            amplicon_primers
        optional keys:
            donor_sequence
            nonhomologous_donor_sequence
            extra_sequences
            effector
    '''
    genome = info['genome']
    if info['genome'] not in all_index_locations:
        print(f'Error: can\'t locate indices for {genome}')
        sys.exit(0)
    else:
        index_locations = all_index_locations[genome]

    base_dir = Path(base_dir)

    name = info['name']

    donor_info = info.get('donor_sequence')
    if donor_info is None:
        donor_name = None
        donor_seq = None
    else:
        donor_name, donor_seq = donor_info
        if donor_name is None:
            donor_name = f'{name}_donor'

    if donor_seq is None:
        has_donor = False
    else:
        has_donor = True

    if info['donor_type'] is None:
        donor_type = None
    else:
        _, donor_type = info['donor_type']

    nh_donor_info = info.get('nonhomologous_donor_sequence')
    if nh_donor_info is None:
        nh_donor_name = None
        nh_donor_seq = None
    else:
        nh_donor_name, nh_donor_seq = nh_donor_info
        if nh_donor_name is None:
            nh_donor_name = f'{name}_NH_donor'

    if nh_donor_seq is None:
        has_nh_donor = False
    else:
        has_nh_donor = True

    target_dir = base_dir / 'targets' / name
    target_dir.mkdir(parents=True, exist_ok=True)

    protospacer, *other_protospacers = info['sgRNA_sequence']

    primers_name, primers = info['amplicon_primers']
    primers = primers.split(';')

    if primers_name is None:
        target_name = name
    else:
        target_name = primers_name

    protospacer_dir = target_dir / 'protospacer_alignment'
    protospacer_dir.mkdir(exist_ok=True)
    fastq_fn = protospacer_dir / 'protospacer.fastq'
    STAR_prefix = protospacer_dir / 'protospacer_'
    bam_fn = protospacer_dir / 'protospacer.bam'

    STAR_index = index_locations['STAR']

    gb_fns = {
        'target': target_dir / f'{target_name}.gb',
        'donor': target_dir / f'{donor_name}.gb',
        'nh_donor': target_dir / f'{nh_donor_name}.gb',
    }

    # Make a fastq file with a single read containing the protospacer sequence.
    protospacer_name, protospacer_seq = protospacer
    with fastq_fn.open('w') as fh:
        quals = fastq.encode_sanger([40] * len(protospacer_seq))
        read = fastq.Read('protospacer', protospacer_seq, quals)
        fh.write(str(read))

    # Align the protospacer to the reference genome.
    mapping_tools.map_STAR(fastq_fn, STAR_index, STAR_prefix,
                           mode='guide_alignment', bam_fn=bam_fn, sort=False)

    with pysam.AlignmentFile(bam_fn) as bam_fh:
        perfect_als = [
            al for al in bam_fh
            if not al.is_unmapped and sam.total_edit_distance(al) == 0
        ]
        imperfect_als = [al for al in bam_fh if not al.is_unmapped]

    region_fetcher = genomes.build_region_fetcher(index_locations['fasta'])

    def evaluate_candidate(al):
        results = {
            'location': f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        }

        full_window_around = 5000
        full_around = region_fetcher(
            al.reference_name,
            al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer_seq)
            ps_strand = -1
        ps_start = full_around.index(ps_seq)

        protospacer_locations = [(protospacer_name, ps_seq, ps_start, ps_strand)]
        for other_protospacer_name, other_protospacer_seq in other_protospacers:
            # Initial G may not match genome.
            if other_protospacer_seq.startswith('G'):
                other_protospacer_seq = other_protospacer_seq[1:]
            if other_protospacer_seq in full_around:
                ps_seq = other_protospacer_seq
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer_seq)
                if ps_seq not in full_around:
                    results['failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                    return results
                ps_strand = -1
            ps_start = full_around.index(ps_seq)
            protospacer_locations.append(
                (other_protospacer_name, ps_seq, ps_start, ps_strand))

        if 'effector' in info:
            effector_type = info['effector']
        else:
            if donor_type == 'pegRNA':
                effector_type = 'SpCas9H840A'
            else:
                effector_type = 'SpCas9'
        effector = target_info.effectors[effector_type]

        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            PAM_pattern = effector.PAM_pattern
            if (ps_strand == 1 and effector.PAM_side == 3) or \
               (ps_strand == -1 and effector.PAM_side == 5):
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
            else:
                PAM_offset = -len(PAM_pattern)
                PAM_transform = utilities.reverse_complement

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start + len(PAM_pattern)])
            pattern, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

            if 0 not in matches and not offtargets:
                # Note: this could incorrectly fail if there are multiple exact
                # matches for an other_protospacer in full_around.
                results['failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        if primers[0] in full_around:
            leftmost_primer = primers[0]
            rightmost_primer = utilities.reverse_complement(primers[1])
            if rightmost_primer not in full_around:
                results['failed'] = f'primer {primers[1]} not present near protospacer'
                return results
            leftmost_primer_name = 'forward_primer'
            rightmost_primer_name = 'reverse_primer'
        else:
            leftmost_primer = primers[1]
            rightmost_primer = utilities.reverse_complement(primers[0])
            if leftmost_primer not in full_around:
                results['failed'] = f'primer {primers[1]} not present near protospacer'
                return results
            if rightmost_primer not in full_around:
                results['failed'] = f'primer {primers[0]} not present near protospacer'
                return results
            leftmost_primer_name = 'reverse_primer'
            rightmost_primer_name = 'forward_primer'

        leftmost_start = full_around.index(leftmost_primer)
        rightmost_start = full_around.index(rightmost_primer)

        if leftmost_start >= rightmost_start:
            results['failed'] = f'primers don\'t flank protospacer'
            return results

        # Now that primers have been located, redefine the target sequence to
        # include a fixed window on either side of the primers.
        final_window_around = 500
        offset = leftmost_start - final_window_around
        final_start = leftmost_start - final_window_around
        final_end = rightmost_start + len(rightmost_primer) + final_window_around
        target_seq = full_around[final_start:final_end]

        leftmost_location = FeatureLocation(
            leftmost_start - offset,
            leftmost_start - offset + len(leftmost_primer),
            strand=1)
        rightmost_location = FeatureLocation(
            rightmost_start - offset,
            rightmost_start - offset + len(rightmost_primer),
            strand=-1)

        colors = {
            'HA_1': '#c7b0e3',
            'HA_RT': '#c7b0e3',
            'HA_2': '#85dae9',
            'HA_PBS': '#85dae9',
            'forward_primer': '#75C6A9',
            'reverse_primer': '#9eafd2',
            'sgRNA': '#c6c9d1',
            'donor_specific': '#b1ff67',
            'PCR_adapter_1': '#F8D3A9',
            'PCR_adapter_2': '#D59687',
            'protospacer': '#ff9ccd',
            'scaffold': '#b7e6d7',
        }

        target_features = [
            SeqFeature(
                location=leftmost_location,
                id=leftmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': leftmost_primer_name,
                    'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                },
            ),
            SeqFeature(
                location=rightmost_location,
                id=rightmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': rightmost_primer_name,
                    'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                },
            ),
        ]

        if leftmost_primer_name == 'forward_primer':
            start = leftmost_start - offset
            start_location = FeatureLocation(start, start + 5, strand=1)
        else:
            start = rightmost_start - offset + len(rightmost_primer) - 5
            start_location = FeatureLocation(start, start + 5, strand=-1)

        target_features.extend([
            SeqFeature(
                location=start_location,
                id='sequencing_start',
                type='misc_feature',
                qualifiers={'label': 'sequencing_start'},
            ),
            SeqFeature(
                location=start_location,
                id='anchor',
                type='misc_feature',
                qualifiers={'label': 'anchor'},
            ),
        ])

        sgRNA_features = []
        for sgRNA_i, (ps_name, ps_seq, ps_start, ps_strand) in enumerate(protospacer_locations):
            sgRNA_feature = SeqFeature(
                location=FeatureLocation(ps_start - offset,
                                         ps_start - offset + len(ps_seq),
                                         strand=ps_strand),
                id=f'sgRNA_{ps_name}',
                type=f'sgRNA_{effector.name}',
                qualifiers={
                    'label': f'sgRNA_{ps_name}',
                    'ApEinfo_fwdcolor': colors['sgRNA'],
                },
            )
            target_features.append(sgRNA_feature)
            sgRNA_features.append(sgRNA_feature)

        results['gb_Records'] = {}

        if has_donor:
            if not defer_HA_identification:
                # If multiple sgRNAs are given, the edited one must be listed
                # first.
                sgRNA_feature = sgRNA_features[0]

                cut_after_offset = [
                    offset for offset in effector.cut_after_offset
                    if offset is not None
                ][0]

                if sgRNA_feature.strand == 1:
                    # sgRNA_feature.end is the first nt of the PAM
                    cut_after = sgRNA_feature.location.end + cut_after_offset
                else:
                    # sgRNA_feature.start - 1 is the first nt of the PAM
                    cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

                if donor_type == 'pegRNA':
                    HA_info = identify_pegRNA_homology_arms(
                        donor_seq, target_seq, cut_after, protospacer_seq,
                        colors)
                else:
                    HA_info = identify_homology_arms(
                        donor_seq, donor_type, target_seq, cut_after, colors)

                if 'failed' in HA_info:
                    results['failed'] = HA_info['failed']
                    return results

                donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
                donor_features = HA_info['donor_features']
                target_features.extend(HA_info['target_features'])
            else:
                donor_Seq = Seq(donor_seq)
                donor_features = []

            donor_Record = SeqRecord(donor_Seq, name=donor_name,
                                     features=donor_features,
                                     annotations={'molecule_type': 'DNA'})
            results['gb_Records']['donor'] = donor_Record

        target_Seq = Seq(target_seq)
        target_Record = SeqRecord(target_Seq, name=target_name,
                                  features=target_features,
                                  annotations={'molecule_type': 'DNA'})
        results['gb_Records']['target'] = target_Record

        if has_nh_donor:
            nh_donor_Seq = Seq(nh_donor_seq)
            nh_donor_Record = SeqRecord(nh_donor_Seq, name=nh_donor_name,
                                        annotations={'molecule_type': 'DNA'})
            results['gb_Records']['nh_donor'] = nh_donor_Record

        return results

    good_candidates = []
    bad_candidates = []

    for al in perfect_als:
        results = evaluate_candidate(al)
        if 'failed' in results:
            bad_candidates.append(results)
        else:
            good_candidates.append(results)

    if len(good_candidates) == 0:
        if len(bad_candidates) == 0:
            print(f'Error building {name}: no perfect matches to sgRNA {protospacer} found in {genome}')
            print(imperfect_als)
            return
        else:
            print(f'Error building {name}: no valid genomic locations for {name}')
            for results in bad_candidates:
                print(f'\t{results["location"]}: {results["failed"]}')
            return
    elif len(good_candidates) > 1:
        print(f'Warning: multiple valid genomic locations for {name}:')
        for results in good_candidates:
            print(f'\t{results["location"]}')
        best_candidate = good_candidates[0]
        print(f'Arbitrarily choosing {best_candidate["location"]}')
    else:
        best_candidate = good_candidates[0]

    truncated_name_i = 0
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=BiopythonWarning)
        for which_seq, Record in best_candidate['gb_Records'].items():
            try:
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')
            except ValueError:
                # locus line too long, can't write genbank file with BioPython
                old_name = Record.name
                truncated_name = f'{Record.name[:11]}_{truncated_name_i}'
                Record.name = truncated_name
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')
                Record.name = old_name
                truncated_name_i += 1

    manifest_fn = target_dir / 'manifest.yaml'

    sources = [target_name]
    if has_donor:
        sources.append(donor_name)

    extra_Records = []
    if info.get('extra_sequences') is not None:
        for extra_seq_name, extra_seq in info['extra_sequences']:
            sources.append(extra_seq_name)
            # Bug fix: annotations was being passed to list.append instead of
            # to the SeqRecord constructor.
            extra_Records.append(SeqRecord(extra_seq, name=extra_seq_name,
                                           annotations={'molecule_type': 'DNA'}))

    manifest = {
        'sources': sources,
        'target': target_name,
    }

    if has_donor:
        manifest['donor'] = donor_name
        manifest['donor_specific'] = 'donor_specific'
        if donor_type is not None:
            manifest['donor_type'] = donor_type

    if has_nh_donor:
        manifest['nonhomologous_donor'] = nh_donor_name

    manifest['features_to_show'] = [
        [target_name, 'forward_primer'],
        [target_name, 'reverse_primer'],
    ]

    if has_donor:
        if donor_type == 'pegRNA':
            manifest['features_to_show'].extend([
                [donor_name, 'scaffold'],
                [donor_name, 'protospacer'],
                [donor_name, 'HA_RT'],
                [donor_name, 'HA_PBS'],
                [target_name, 'HA_RT'],
                [target_name, 'HA_PBS'],
            ])
        else:
            manifest['features_to_show'].extend([
                [donor_name, 'HA_1'],
                [donor_name, 'HA_2'],
                [donor_name, 'donor_specific'],
                [donor_name, 'PCR_adapter_1'],
                [donor_name, 'PCR_adapter_2'],
                [target_name, 'HA_1'],
                [target_name, 'HA_2'],
            ])

    manifest['genome_source'] = genome

    manifest_fn.write_text(yaml.dump(manifest, default_flow_style=False))

    gb_records = list(best_candidate['gb_Records'].values()) + extra_Records

    ti = target_info.TargetInfo(base_dir, name, gb_records=gb_records)
    ti.make_references()
    ti.make_protospacer_fastas()
    ti.map_protospacers(genome)
    ti.identify_degenerate_indels()

    shutil.rmtree(protospacer_dir)
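# A sketch of the info dictionary build_target_info expects, following the
# docstring's keys; every name and sequence below is a made-up placeholder,
# and the optional keys may be None.
example_info = {
    'name': 'demo_target',
    'genome': 'hg38',
    # list of (name, sequence) tuples; the edited sgRNA must come first
    'sgRNA_sequence': [('sgRNA_1', 'GACGCATAAAGATGAGACGC')],
    # (name, 'fwd;rev') pair; the two primers are split on ';'
    'amplicon_primers': ('demo_primers',
                         'ACGTACGTACGTACGTACGT;TGCATGCATGCATGCATGCA'),
    'donor_sequence': None,
    'donor_type': None,
    'nonhomologous_donor_sequence': None,
    'extra_sequences': None,
}
build_target_info('/path/to/base_dir', example_info, all_index_locations)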
    # proth (the proteins FASTA handle) is opened earlier, alongside genesh.
    genesh = open(workdir + "genes.fasta", "w")
    # pseudo = open(workdir + "proteins.fasta","w")
    for genebank_gz in tqdm(glob(workdir + "*.gz")):
        assembly = genebank_gz.split(workdir)[1].split("_genomic.gbff.gz")[0]
        for contig in tqdm(bpio.parse(gzip.open(genebank_gz), "gb")):
            for f in contig.features:
                if f.type == "CDS":
                    if "translation" in f.qualifiers:
                        locus_tag = f.qualifiers["locus_tag"][0]
                        desc = f.qualifiers["protein_id"][0] + " " + assembly
                        seq = SeqRecord(id=locus_tag,
                                        seq=Seq(f.qualifiers["translation"][0]),
                                        description=desc,
                                        name=locus_tag)
                        bpio.write(seq, proth, "fasta")
                        seq = f.extract(contig)
                        seq.id = locus_tag
                        seq.name = locus_tag
                        seq.description = desc
                        bpio.write(seq, genesh, "fasta")
finally:
    proth.close()
    genesh.close()

#cd-hit -c 0.9 -i proteins.fasta -o proteins_90.fasta -g 1 -aS 0.8 -p 1
#cd-hit -c 0.5 -i proteins.fasta -o proteins_50.fasta -g 1 -aS 0.8 -p 1
if args.gff:
    with open(args.gbk, 'r') as fh_gbk, open(basename + '.gff', 'w') as fh_gff:
        GFF.write(SeqIO.parse(fh_gbk, 'genbank'), fh_gff)

with open(args.gbk, 'r') as fh_gbk:
    if args.contigs:
        fh_fna = open(basename + '.fna', 'w')
    if args.aminoacids:
        fh_faa = open(basename + '.faa', 'w')
    if args.nucleotides:
        fh_fcn = open(basename + '.fcn', 'w')
    for seq_record_gbk in SeqIO.parse(fh_gbk, 'genbank'):
        if args.contigs:
            seq_record_fna = SeqRecord(seq_record_gbk.seq)
            # seq_record_fna.seq.alphabet = IUPAC.extended_dna
            seq_record_fna.id = seq_record_gbk.id
            seq_record_fna.description = seq_record_gbk.description
            SeqIO.write(seq_record_fna, fh_fna, 'fasta')
        if args.aminoacids or args.nucleotides:
            for feature in seq_record_gbk.features:
                if feature.type == 'CDS':
                    if ('pseudo' in feature.qualifiers) or \
                            ('pseudogene' in feature.qualifiers) or \
                            ('translation' not in feature.qualifiers):
                        continue
                    seq_record_faa = SeqRecord(Seq(
                        feature.qualifiers['translation'][0],
                        IUPAC.extended_protein))
                    if args.aminoacids:
                        seq_record_faa.id = feature.qualifiers['protein_id'][0]
                        if 'product' in feature.qualifiers:
def gapMismatches(alignment):
    """
    Given an alignment (an MSA with just a reference and a query), replace any
    mismatches with gaps in each sequence.

    Return the processed alignment.
    """
    # Make lists of characters that we will join into the new reference and
    # query sequences.
    gappedReference = []
    gappedQuery = []

    # How many mismatches did we gap?
    mismatches_gapped = 0
    # How many aligned bases did we check?
    bases_checked = 0

    # Where are we in the alignment?
    for column in xrange(len(alignment[0])):
        # Just go through all the columns in the alignment's reference.

        # Pull out the reference and query characters at this position.
        refChar = alignment[0, column]
        queryChar = alignment[1, column]

        bases_checked += 1

        if "-" in [refChar, queryChar] or refChar == queryChar:
            # We have a gap or a match. Pass it through to both sequences.
            gappedReference.append(refChar)
            gappedQuery.append(queryChar)
        else:
            # We have a mismatch. Gap one and then the other.
            gappedReference.append("-")
            gappedQuery.append(queryChar)

            gappedReference.append(refChar)
            gappedQuery.append("-")

            mismatches_gapped += 1

    # Now we need to manufacture the MultipleSeqAlignment to return from these
    # lists of characters.

    # What names do the sequences in this alignment have?
    seqNames = [record.id for record in alignment]

    # Make a SeqRecord for each list of properly gapped-out characters, with
    # the appropriate name.
    seqRecords = [
        SeqRecord(Seq("".join(alignedList)), name)
        for alignedList, name in zip([gappedReference, gappedQuery], seqNames)
    ]

    for i in xrange(len(seqRecords)):
        # Set annotations on all the new records
        seqRecords[i].annotations = alignment[i].annotations

    if float(mismatches_gapped) / bases_checked > 0.5 and bases_checked > 100:
        # If this gets too high, it means we have a bad offset somewhere. Yell
        # at the user.
        logging.warning("{}/{} bases gapped due to mismatch".format(
            mismatches_gapped, bases_checked))

    # Make the records into a proper MSA and return it.
    return Align.MultipleSeqAlignment(seqRecords)
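# A worked example of the gapping behavior: the single C/G mismatch in
# column 1 becomes two half-gapped columns.
# >>> aln = Align.MultipleSeqAlignment([SeqRecord(Seq("ACGT"), id="ref"),
# ...                                   SeqRecord(Seq("AGGT"), id="qry")])
# >>> gapped = gapMismatches(aln)
# >>> str(gapped[0].seq), str(gapped[1].seq)
# ('A-CGT', 'AG-GT')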
def convert_genbank(genbank_tuple):
    genbank_path, db_directory, error_fname, do_protein = genbank_tuple
    record_list = []
    seq_record = next(SeqIO.parse(open(genbank_path), "genbank"))
    print(seq_record.annotations)
    accession = seq_record.id
    organism = seq_record.annotations['organism'].replace(' ', '_')
    err_log = []
    gc_list = []  # no need for this right now, but leaving in

    # loop over the genbank file
    # NOTE: err_flag and error_in_field are reset per feature, so err_flag
    # ultimately reflects only the last feature examined.
    for fnum, feature in enumerate(seq_record.features):
        err_flag = False
        error_in_field = False
        if feature.type == 'CDS':
            #print dir(feature.location)
            try:
                start = int(feature.location.start)
                stop = int(feature.location.end)
            except:
                error_in_field = True
            strand = feature.strand
            dna_seq = seq_record.seq[start:stop]
            #print "dna_seq", type(dna_seq), dna_seq
            gc = GC(dna_seq)
            gc_list.append(gc)
            gc = "%3.2f" % gc
            try:
                locus = feature.qualifiers['locus_tag'][0]
            except:
                try:
                    locus = feature.qualifiers['gene'][0]
                except:
                    locus = 'error'
                    print("Error in the organism %s with NC # %s"
                          % (organism, accession))
                    err_flag = True
                    err_log.append([organism, accession])
            if do_protein:
                try:
                    if 'translation' in list(feature.qualifiers.keys()):
                        # prot_seq = Seq(''.join(feature.qualifiers['translation']), IUPAC.protein)
                        prot_seq = Seq(''.join(feature.qualifiers['translation']))
                        #print "prot_seq", type(prot_seq), prot_seq
                        if 'gene' in feature.qualifiers:
                            gene = feature.qualifiers['gene'][0]
                            seq_rec_to_store = SeqRecord(
                                prot_seq,
                                id='|'.join([accession, organism, locus, gene,
                                             str(start), str(stop),
                                             str(strand), gc]).replace(' ', ''),
                                description='')
                        else:
                            seq_rec_to_store = SeqRecord(
                                prot_seq,
                                id='|'.join([accession, organism, locus,
                                             'unknown', str(start), str(stop),
                                             str(strand), gc]).replace(' ', ''),
                                description='')
                    else:
                        print("This was not a protein sequence")
                        error_in_field = True
                except:
                    print("Error in function convert_genbank(genbank_tuple) "
                          "from the format_db.py script, unhandled error in "
                          "the genbank parse.")
                    error_in_field = True
            else:
                # put something in here that will deal with RNA later, if we
                # plan to go that route.
                pass
            if not error_in_field:
                record_list.append(seq_rec_to_store)
            else:
                print("a record was omitted")

            '''
            #print len(seq)
            if len(seq) < 2:
                #pass
                print "len seq", len(seq)
            elif do_protein:
                if 'gene' in feature.qualifiers:
                    gene = feature.qualifiers['gene'][0]
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
                else:
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
            else:
                if 'gene' in feature.qualifiers:
                    gene = feature.qualifiers['gene'][0]
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
                else:
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
            '''

    #if os.path.isfile(gc_outfile):
    #    os.remove(gc_outfile)
    #GCAnalysis(accession, organism, gc_list, seq_record.seq, gc_outfile)

    handle = open(error_fname, 'a')
    for i in err_log:
        handle.write('\t'.join(i) + '\n')
    handle.close()

    if not err_flag:
        outpath = db_directory + os.path.splitext(
            os.path.basename(genbank_path))[0] + '.ffc'
        #print outpath
        out_handle = open(outpath, "w")
        SeqIO.write(record_list, out_handle, "fasta")
        out_handle.close()
        if do_protein:
            cmd = "makeblastdb -in %s -dbtype prot" % (outpath)
            #print "got here"
        else:
            # NOTE: this branch also builds a protein database; a nucleotide
            # database would need "-dbtype nucl".
            cmd = "makeblastdb -in %s -dbtype prot" % (outpath)
        os.system(cmd)
    #print "Passed main loop"
    return outpath, err_flag
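# Usage sketch for convert_genbank: it takes a single tuple so it can be
# mapped over a worker pool; the paths below are placeholders.
outpath, err_flag = convert_genbank(
    ("/data/genomes/NC_000913.gbk",  # input GenBank file
     "/data/blast_db/",              # directory for the BLAST database
     "/data/errors.log",             # error log, appended to
     True))                          # do_protein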
            for number in ec_number:
                print(number, file=ec_out)
            qualifiers['EC_number'] = ec_number

        start = int(temp.loc[hit, 'start'])
        end = int(temp.loc[hit, 'end'])
        location = SeqFeature.FeatureLocation(start, end)
        new_feature = SeqFeature.SeqFeature(type='CDS', qualifiers=qualifiers)
        new_feature.location = location
        features.append(new_feature)

    new_record = SeqRecord(Seq('nnnn', alphabet=IUPAC.ambiguous_dna),
                           id=genome, name=genome, features=features)
    SeqIO.write(new_record,
                open(pathos_output_dir + name + '.' + genome + '.gbk', 'w'),
                'genbank')

    print('ID' + '\t' + genome, file=all_genetic_elements)
    print('NAME' + '\t' + genome, file=all_genetic_elements)
    print('TYPE' + '\t' + ':CHRSM', file=all_genetic_elements)
    print('CIRCULAR?' + '\t' + 'Y', file=all_genetic_elements)
    print('ANNOT-FILE' + '\t' + name + '.' + genome + '.gbk',
          file=all_genetic_elements)
    print('//', file=all_genetic_elements)

## Now create the organism-params.dat file.

with open(pathos_output_dir + 'organism-params.dat', 'w') as all_organism_params:
    print('ID' + '\t' + name, file=all_organism_params)
    print('Storage' + '\t' + 'File', file=all_organism_params)
    print('Name' + '\t' + name, file=all_organism_params)
def create_euk_files(d):
    ## First create a df mapping protein id to SwissProt accession number.
    columns = ['prot_id', 'swissprot', 'description']
    spt = pd.read_csv(ref_dir_domain + 'refseq/' + d + '/swissprot.gff3',
                      index_col=0, comment='#', names=columns,
                      usecols=[0, 8, 10], sep=';|\t', engine='python')
    spt['swissprot'] = spt['swissprot'].str.replace('Name=Swiss-Prot:', '')
    spt['description'] = spt['description'].str.replace(
        'Description=Swiss-Prot:', '')

    ## Create empty list to hold gene features pulled from cds.fa.
    features = []

    ## Artificial start, stops are needed.
    combined_length = 1

    print('generating genbank format files for', d + '...')

    ## Some directory names differ from the accession number. Rename these
    ## directories to match the accession number.
    for f in os.listdir(ref_dir_domain + 'refseq/' + d):
        if f.endswith('.pep.fa'):
            a = f.split('.pep.fa')[0]
            if a != d:
                os.rename(ref_dir_domain + 'refseq/' + d,
                          ref_dir_domain + 'refseq/' + a)
                print('directory', d, 'is now', a)

    for record in SeqIO.parse(
            ref_dir_domain + 'refseq/' + a + '/' + a + '.pep.fa', 'fasta'):
        ## The swissprot annotations are indexed by MMETSP record locator, not
        ## by the actual record.id.
        sprot_name = str(record.description).split('NCGR_PEP_ID=')[1]
        sprot_name = sprot_name.split(' /')[0]

        try:
            temp_spt = spt.loc[sprot_name, 'swissprot']
        except KeyError:
            continue

        temp_sprot = sprot_df[sprot_df.index.isin(list(temp_spt))]
        ecs = list(set(temp_sprot.ec))
        descriptions = list(set(temp_sprot.name))

        ## Embed all information necessary to create the Genbank file as
        ## qualifiers, then append to this list of records for that genome.
        qualifiers = {'protein_id': sprot_name,
                      'locus_tag': str(record.id),
                      'EC_number': ecs,
                      'product': descriptions,
                      'translation': str(record.seq)}

        new_feature = SeqFeature.SeqFeature(type='CDS', qualifiers=qualifiers)
        new_feature.location = SeqFeature.FeatureLocation(
            combined_length, combined_length + len(str(record.seq)))
        features.append(new_feature)
        combined_length = combined_length + len(str(record.seq))

    ## Write the records in Genbank format. Even though you will ultimately
    ## want to use the gbk extension, to match the (silly) Genbank convention
    ## use gbff.
    new_record = SeqRecord(Seq('nnnn'), id=a, name=a, features=features)
    new_record.annotations['molecule_type'] = 'DNA'
    SeqIO.write(new_record,
                open(ref_dir_domain + 'refseq/' + a + '/' + a + '.gbff', 'w'),
                'genbank')
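# The Seq('nnnn') trick above writes CDS features whose coordinates run past
# the placeholder sequence; a minimal standalone sketch of the same pattern
# (file name and qualifiers are illustrations):
from Bio import SeqFeature, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

qualifiers = {'locus_tag': 'demo_0001', 'product': ['hypothetical protein']}
feature = SeqFeature.SeqFeature(type='CDS', qualifiers=qualifiers)
feature.location = SeqFeature.FeatureLocation(1, 10)
record = SeqRecord(Seq('nnnn'), id='demo', name='demo', features=[feature])
record.annotations['molecule_type'] = 'DNA'
SeqIO.write(record, 'demo.gbff', 'genbank')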
        return hairpins_string
    #end def

    def _format_hairpins(self, hairpins, seq):
        hairpins_string = ''
        if not hairpins:
            return '0'
        #print header
        hairpins_string += self._format_hairpins_header(hairpins, seq)
        #print hairpins
        hairpins_string += self._format_hairpin(hairpins[0], seq)
        return hairpins_string
    #end def
#end class


if __name__ == "__main__":
    structs = SecStructures(SeqRecord(Seq('GATCTGATGCATGAGATCGCATCAGATC')))
    print('seq1 self dimer', str(structs).split()[0])
    print('seq1 hairpin', str(structs).split()[1])
    # for ET_record in SeqIO.parse('example.txt', "fasta"):
    #     structs = SecStructures(SeqRecord(Seq('GATCTGATGCATGAGATCGCATCAGATC')))
    #     print('seq1 self dimer', str(structs).split()[0])
    #     print('seq1 hairpin', str(structs).split()[1])
    # for PT_record in SeqIO.parse('example2.txt', "fasta"):
    #     structs = SecStructures(PT_record)
    #     structs = SecStructures(ET_record, PT_record)
    #     print('set3 cross dimer', str(structs))
    def _record_formatter(self, trim, name):
        """return a Biopython SeqRecord built from a trimmed sequence string"""
        return SeqRecord(Seq(trim, Gapped(IUPAC.ambiguous_dna, "-?")),
                         id=name, name=name, description=name)
    def analyze(self, blueThresh, redThresh, bluePeakThresh, redPeakThresh):
        self.blueThresh = blueThresh
        self.redThresh = redThresh
        self.bluePeakThresh = bluePeakThresh
        self.redPeakThresh = redPeakThresh

        # Clear any previous sequence/firing overlays from the ROI plot.
        if self.seqplotlist != [[]]:
            for x in self.seqplotlist:
                self.p1.getRoiPlot().removeItem(x)
            self.p1.getRoiPlot().autoRange()
        if self.firingplotlist != []:
            for x in self.firingplotlist:
                self.p1.getRoiPlot().removeItem(x)
            self.p1.getRoiPlot().autoRange()
        self.seqplotlist = [[]]
        self.firingplotlist = []

        dntpnames = ["dCTP", "dATP", "dGTP", "dTTP"]
        cdf = pd.DataFrame({'a': [], 't': [], 'g': [], 'c': []})
        dntps = [[], [], [], []]
        zpro = [[], [], [], []]
        n = len(dntps)
        dntpdirec = r'C:\Users\Noah PC\PycharmProjects\ZMW analysis\ZMW\02082017'
        fn = 'dntpss.h5'
        hfile = h5py.File(os.path.join(dntpdirec, fn), 'r')
        for i, x in enumerate(dntpnames):
            fn = 'dntps.h5'
            # Normalize each reference dNTP image to zero mean, unit variance.
            zpro[i] = np.array(hfile[x]).astype(float)
            zpro[i] -= zpro[i].mean()
            zpro[i] /= zpro[i].std()

        composite = zpro[0] + zpro[1] + zpro[2] + zpro[3]
        ly, lx = composite.shape

        # Register this ZMW's projection against the composite reference using
        # a double-Gaussian fit in y and a single-Gaussian fit in x.
        p0 = [composite.mean(1).max(), (ly / 2) - 4, (ly / 2) + 4, 1.]
        coeff1, var_matrix1 = curve_fit(dubgauss, np.linspace(0, ly - 1, ly),
                                        composite.mean(1), p0=p0)
        p0 = [self.zpro.mean(1).max(), (ly / 2) - 4, (ly / 2) + 4, 1.]
        coeff2, var_matrix2 = curve_fit(dubgauss, np.linspace(0, ly - 1, ly),
                                        self.zpro.mean(1), p0=p0)
        shifty = np.mean((coeff2[1], coeff2[2])) - np.mean(
            (coeff1[1], coeff1[2]))

        p0 = [composite.mean(1).max(), lx / 2, 1.]
        coeff1, var_matrix1 = curve_fit(gauss, np.linspace(0, lx - 1, lx),
                                        composite.mean(0), p0=p0)
        p0 = [self.zpro.mean(1).max(), lx / 2, 1.]
        coeff2, var_matrix2 = curve_fit(gauss, np.linspace(0, lx - 1, lx),
                                        self.zpro.mean(0), p0=p0)
        shiftx = coeff2[1] - coeff1[1]

        self.czpro = [[], [], [], []]
        for i, x in enumerate(dntps):
            self.czpro[i] = zpro[i]
            self.czpro[i] = ird.transform_img(self.czpro[i],
                                              tvec=[shifty, shiftx])

        seqdf = self.peakdetection(blueThresh, redThresh, bluePeakThresh,
                                   redPeakThresh)
        predictedseq = seqdf.base.str.cat()

        fn = self.datafilename[:-3] + '_seq.fasta'
        predictedseq = Seq.Seq(predictedseq, generic_dna)
        predictedseq = SeqRecord(predictedseq, id=os.path.split(fn)[1])
        SeqIO.write(predictedseq, fn, "fasta")
names = set()

def fasta_record(rec, cnt):
    name = '{}_fragment_{}.fasta'.format(file_input.split('.')[0], cnt)
    names.add(name)
    print(name)
    SeqIO.write(rec, name, "fasta")

records = []
cnt = 0
for record in SeqIO.parse(file_input, 'fasta'):
    cnt += 1
    # Append first, then flush once the batch reaches SEQ_LIMIT, so the
    # record that crosses the limit is not skipped.
    records.append(
        SeqRecord(seq=record.seq, id=record.id,
                  description=record.description))
    if len(records) >= SEQ_LIMIT:
        fasta_record(records, cnt)
        records = []

# Write whatever is left over, if anything.
if records:
    fasta_record(records, cnt)

print('The number of records: ', cnt)
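# Added sketch (not from the original script): the same fixed-size batching
# can be written with itertools.islice, which avoids manual counter
# bookkeeping. file_input and SEQ_LIMIT are reused from the script above.
from itertools import islice
from Bio import SeqIO

def batched_records(path, size):
    """Yield lists of at most `size` SeqRecords from a FASTA file."""
    it = SeqIO.parse(path, 'fasta')
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

# for i, batch in enumerate(batched_records(file_input, SEQ_LIMIT), 1):
#     SeqIO.write(batch, '{}_fragment_{}.fasta'.format(
#         file_input.split('.')[0], i), 'fasta')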
#!/usr/bin/env python
import sys

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

new_sequences = []
gbinput = str(sys.argv[1])

if len(sys.argv) < 3:
    faout = gbinput.split(".")[0] + ".fasta"
else:
    faout = str(sys.argv[2])

for i in SeqIO.parse(gbinput, "gb"):
    for f in i.features:
        # Use .get() so CDS features without a gene qualifier don't raise
        # a KeyError.
        if (f.type == "CDS" and f.qualifiers.get('gene') == ['pol']):
            new_sequences.append(
                SeqRecord(Seq(*(f.qualifiers['translation'])),
                          id="_".join([i.description,
                                       *(f.qualifiers['protein_id'])]),
                          description=""))

if len(new_sequences) == 0:
    print("No sequences were converted")
else:
    SeqIO.write(new_sequences, faout, format="fasta")
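# Added usage note: the script name is illustrative. The output FASTA name is
# optional; if omitted it is derived from the GenBank input name.
#
#   python extract_pol.py hiv_genomes.gb
#   python extract_pol.py hiv_genomes.gb pol_proteins.fasta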
#Now try setting it afterwards to a bad value... rec = SeqRecord(Seq("ACGT", generic_dna), id="Test", name="Test", description="Test") try: rec.letter_annotations = {"test": [1, 2, 3]} self.assertTrue(False, "Changing to bad letter_annotations should fail!") except (TypeError, ValueError), e: pass #Now try setting it at creation time to a bad value... try: rec = SeqRecord(Seq("ACGT", generic_dna), id="Test", name="Test", description="Test", letter_annotations={"test": [1, 2, 3]}) self.assertTrue(False, "Wrong length letter_annotations should fail!") except (TypeError, ValueError), e: pass class SeqRecordMethods(unittest.TestCase): """Test SeqRecord methods.""" def setUp(self): f0 = SeqFeature(FeatureLocation(0, 26), type="source", qualifiers={"mol_type": ["fake protein"]}) f1 = SeqFeature(FeatureLocation(0, ExactPosition(10)))
import os
import sys

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def mkdir(directory):
    # Create the directory if it does not already exist.
    try:
        os.stat(directory)
    except OSError:
        os.mkdir(directory)

fi = sys.argv[1]    # fasta file with sequences
diri = sys.argv[2]  # input directory
diro = sys.argv[3]  # output directory

mkdir(diro + "/plasmids/")
mkdir(diro + "/chromosomes/")

for ix, record in enumerate(
        SeqIO.parse(diri + "/" + fi + "/" + fi + ".fna", "fasta")):
    gid = record.description
    seq = record.seq
    if "plasmid" in gid:
        gtype = "plasmids"
    else:
        gtype = "chromosomes"
    nrecord = SeqRecord(record.seq, id=record.id, name='',
                        description=record.description)
    SeqIO.write([nrecord],
                open(diro + "/" + gtype + '/' + fi + "_" + str(ix) + '.fasta', 'w'),
                'fasta')
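# Added usage note (hypothetical accession and directory layout): given
# diri/GCF_000005845/GCF_000005845.fna on disk, the invocation below writes
# one FASTA per replicon into diro/plasmids/ or diro/chromosomes/.
#
#   python split_replicons.py GCF_000005845 /path/to/in /path/to/out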
def _get_codon_rec(pro, nucl, span_mode, alphabet, gap_char="-",
                   codon_table=default_codon_table, complete_protein=False,
                   max_score=10):
    """Generate codon alignment based on regular re match (PRIVATE)

    span_mode is a tuple returned by _check_corr. The first element is the
    span of a re search, and the second element is the mode for the match.

    mode
     - 0: direct match
     - 1: mismatch (no indels)
     - 2: frameshift

    """
    import re
    import warnings

    from Bio import BiopythonWarning
    from Bio.Seq import Seq

    nucl_seq = nucl.seq.ungap(gap_char)
    codon_seq = ""
    span = span_mode[0]
    mode = span_mode[1]
    aa2re = _get_aa_regex(codon_table)
    if mode in (0, 1):
        if len(pro.seq.ungap(gap_char)) * 3 != (span[1] - span[0]):
            raise ValueError("Protein Record {0} and Nucleotide Record {1} "
                             "do not match!".format(pro.id, nucl.id))
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq._data[span[0]:span[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0} ({1} {2}) does not "
                                  "correspond to {3} "
                                  "({4})".format(pro.id, aa, aa_num,
                                                 nucl.id, this_codon),
                                  BiopythonWarning)
                    if max_score == 0:
                        raise RuntimeError("max_score reached for {0}! "
                                           "Please increase the tolerance to "
                                           "get an alignment "
                                           "anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
            else:
                this_codon = nucl_seq._data[(span[0] + 3 * aa_num):
                                            (span[0] + 3 * (aa_num + 1))]
                if not str(Seq(this_codon.upper()).translate(
                        table=codon_table)) == aa:
                    max_score -= 1
                    warnings.warn("%s(%s %d) does not correspond to %s(%s)"
                                  % (pro.id, aa, aa_num, nucl.id, this_codon),
                                  BiopythonWarning)
                    if max_score == 0:
                        raise RuntimeError("max_score reached for {0}! "
                                           "Please increase the tolerance to "
                                           "get an alignment "
                                           "anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet), id=nucl.id)
    elif mode == 2:
        from collections import deque
        shift_pos = deque([])
        shift_start = []
        match = span_mode[2]
        m_groupdict = list(match.groupdict().keys())
        # backward frameshift
        for i in m_groupdict:
            shift_pos.append(match.span(i))
            shift_start.append(match.start(i))
        rf_table = []
        i = match.start()
        while True:
            rf_table.append(i)
            i += 3
            if i in shift_start and \
                    m_groupdict[shift_start.index(i)].isupper():
                shift_index = shift_start.index(i)
                shift_val = 6 - (shift_pos[shift_index][1] -
                                 shift_pos[shift_index][0])
                rf_table.append(i)
                rf_table.append(i + 3 - shift_val)
                i = shift_pos[shift_index][1]
            elif i in shift_start and \
                    m_groupdict[shift_start.index(i)].islower():
                i = shift_pos[shift_start.index(i)][1]
            if i >= match.end():
                break
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq._data[rf_table[0]:rf_table[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0}({1} {2}) does not "
                                  "correspond to {3}({4})".format(
                                      pro.id, aa, aa_num, nucl.id, this_codon),
                                  BiopythonWarning)
                codon_seq += this_codon
                aa_num += 1
            else:
                if aa_num < len(pro.seq.ungap('-')) - 1 and \
                        rf_table[aa_num + 1] - rf_table[aa_num] - 3 < 0:
                    max_score -= 1
                    start = rf_table[aa_num]
                    # shift_val was set above when the backward frameshift
                    # was recorded in rf_table.
                    end = start + (3 - shift_val)
                    ngap = shift_val
                    this_codon = nucl_seq._data[start:end] + '-' * ngap
                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                    max_score -= 1
                    start = rf_table[aa_num - 1] + 3
                    end = rf_table[aa_num]
                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                    this_codon = nucl_seq._data[start:end] + '-' * ngap + \
                        nucl_seq._data[rf_table[aa_num]:rf_table[aa_num] + 3]
                else:
                    start = rf_table[aa_num]
                    end = start + 3
                    this_codon = nucl_seq._data[start:end]
                    if not str(Seq(this_codon.upper()).translate(
                            table=codon_table)) == aa:
                        max_score -= 1
                        warnings.warn("Codon of {0}({1} {2}) does not "
                                      "correspond to {3}({4})".format(
                                          pro.id, aa, aa_num, nucl.id,
                                          this_codon),
                                      BiopythonWarning)
                        if max_score == 0:
                            raise RuntimeError("max_score reached for {0}! "
                                               "Please increase the tolerance "
                                               "to get an alignment "
                                               "anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet,
                                  rf_table=rf_table), id=nucl.id)
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    import copy

    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            'BLASTN': 'nucleotide_match',
            'BLASTP': 'protein_match',
        }.get(record.application, 'match')

        recid = record.query
        if ' ' in recid:
            recid = recid[0:recid.index(' ')]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            for idx_hsp, hsp in enumerate(hit.hsps):
                qualifiers = {
                    "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(' >'),
                }
                if include_seq:
                    qualifiers.update({
                        'blast_qseq': hsp.query,
                        'blast_sseq': hsp.sbjct,
                        'blast_mseq': hsp.match,
                    })

                for prop in ('score', 'bits', 'identities', 'positives',
                             'gaps', 'align_length', 'strand', 'frame',
                             'query_start', 'query_end', 'sbjct_start',
                             'sbjct_end'):
                    qualifiers['blast_' + prop] = getattr(hsp, prop, None)

                desc = hit.title.split(' >')[0]
                qualifiers['description'] = desc[desc.index(' '):]

                # This required a fair bit of sketching out to figure out the
                # first time.
                #
                # the match_start location must account for queries and
                # subjects that start at locations other than 1
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the
                # supplied subject/hit length to calculate the real ending of
                # the target protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')

                # If we trim the left end, we need to trim without losing
                # information.
                used_parent_match_start = parent_match_start
                if trim:
                    if parent_match_start < 1:
                        used_parent_match_start = 0

                if trim or trim_end:
                    if parent_match_end > hsp.query_end:
                        parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(used_parent_match_start, parent_match_end),
                    type=match_type, strand=0,
                    qualifiers=qualifiers
                )

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                for idx_part, (start, end, cigar) in \
                        enumerate(generate_parts(hsp.query, hsp.match,
                                                 hsp.sbjct,
                                                 ignore_under=min_gap)):
                    part_qualifiers['Gap'] = cigar
                    part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)

                    # Otherwise, we have to account for the subject start's
                    # location
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in
                    # weird places, so we just use (end - start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part", strand=0,
                            qualifiers=copy.deepcopy(part_qualifiers))
                    )

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
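# Added driver sketch (not from the original module): it assumes the
# third-party bcbio-gff package for GFF3 serialization, and the file names
# are hypothetical. generate_parts must be importable from the same module.
from BCBio import GFF

with open('results.blastxml') as xml_in, open('results.gff3', 'w') as gff_out:
    GFF.write(blastxml2gff3(xml_in, min_gap=3, include_seq=False), gff_out)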
if __name__ == '__main__':
    # define file name without extensions. This is usually the virus family name
    file_path = os.path.splitext(os.path.basename(sys.argv[1]))[0]
    handle = open(sys.argv[1], 'r')
    for record in SeqIO.parse(handle, "fasta"):
        # only 1% of Ns is allowed
        if (float(str(record.seq).count('N')) / float(len(str(record.seq))) * 100) <= 1:
            # only sequences with more than 2000 bp will be chopped
            if len(str(record.seq)) > 2000:
                # the length (about 1000 bp) of each block is defined by the
                # following formula:
                # int(round(len(str(record.seq)) / round(len(str(record.seq)) / 1000)))
                for pos, block in enumerate(blocks(
                        str(record.seq),
                        int(round(len(str(record.seq)) /
                                  round(len(str(record.seq)) / 1000))))):
                    # save only blocks with >=800bp length
                    if len(block) >= 800:
                        block_record = SeqRecord(Seq(block, record.seq.alphabet),
                                                 id=record.id, name=record.name,
                                                 description=record.description)
                        outfile = "%s.%s-%d_fasta" % (file_path, record.id, pos)
                        SeqIO.write(block_record, open(outfile, 'w'), "fasta")
                # reverse complement and do the same as above
                for pos, block in enumerate(blocks(
                        str(record.seq.reverse_complement()),
                        int(round(len(str(record.seq.reverse_complement())) /
                                  round(len(str(record.seq.reverse_complement())) / 1000))))):
                    if len(block) >= 800:
                        block_record = SeqRecord(Seq(block, record.seq.alphabet),
                                                 id=record.id, name=record.name,
                                                 description=record.description)
                        outfile = "%s.%s-%drev_fasta" % (file_path, record.id, pos)
                        SeqIO.write(block_record, open(outfile, 'w'), "fasta")
            else:
                # if the length is less than 2000bp then just save both strands
                outfile = "%s.%s-1_fasta" % (file_path, record.id)
                SeqIO.write(record, open(outfile, 'w'), "fasta")
                outfile = "%s.%s-1rev_fasta" % (file_path, record.id)
                reverse_record = SeqRecord(Seq(str(record.seq.reverse_complement()),
                                               record.seq.alphabet),
                                           id=record.id, name=record.name,
                                           description=record.description)
                SeqIO.write(reverse_record, open(outfile, 'w'), "fasta")
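# Added usage note: the script name below is illustrative, not part of the
# original file. Each ~1kb block of every input sequence (and of its reverse
# complement) is written to its own *_fasta file next to the input.
#
#   python chop_genomes.py flaviviridae.fasta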
    min_frag_len = max_frag_len - 1
    # Get the number of sets that will have the maximum length sequences to
    # balance the distribution per set
    big_sets = max_len % num_sets
    small_sets = num_sets - big_sets
    # Width of the zero-filled set ids for the given number of sets
    num_zeros = len(str(num_sets))
    # Generate all the set ids and their corresponding starting site
    first_range = big_sets * max_frag_len
    frag_list = [('cset{}'.format(str(i).zfill(num_zeros)), value)
                 for i, value in enumerate(range(0, first_range, max_frag_len), 1)]
    frag_list += [('cset{}'.format(str(i).zfill(num_zeros)), value)
                  for i, value in enumerate(range(first_range, max_len, min_frag_len),
                                            big_sets + 1)]
    set_dict = {}
    for record in iter(record_list):
        for set_id, start in frag_list:
            if len(record) < start:
                # The current sequence can't be divided into more sets
                break
            else:  # len(record) >= start
                end = start + max_frag_len
                frag_record = SeqRecord(record.seq[start:end], id=record.id,
                                        name=record.name, description=set_id)
                set_dict.setdefault(set_id, []).append(frag_record)
    return set_dict

#-------------------------------------------------------------------------------
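# Added worked example (not from the original module) of the frag_list
# arithmetic above. The inputs are hypothetical, and max_frag_len is supplied
# directly because its derivation happens earlier in the function.
max_len, num_sets, max_frag_len = 10, 3, 4
min_frag_len = max_frag_len - 1              # 3
big_sets = max_len % num_sets                # 1 set gets the full length
num_zeros = len(str(num_sets))               # width of the zero-filled ids
first_range = big_sets * max_frag_len        # 4
frag_list = [('cset{}'.format(str(i).zfill(num_zeros)), value)
             for i, value in enumerate(range(0, first_range, max_frag_len), 1)]
frag_list += [('cset{}'.format(str(i).zfill(num_zeros)), value)
              for i, value in enumerate(range(first_range, max_len, min_frag_len),
                                        big_sets + 1)]
print(frag_list)   # [('cset1', 0), ('cset2', 4), ('cset3', 7)]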
        continue
    # clustering
    fieldrange = [int(bedfield[1]), int(bedfield[2])]
    # parse all exons
    exonlen = [int(x) for x in bedfield[10][:-1].split(',')]
    exonstart = [int(x) + fieldrange[0] for x in bedfield[11][:-1].split(',')]
    if not bedfield[0] in refkeys:
        print('Warning: ' + bedfield[0] + ' not in the reference. Ignore...',
              file=sys.stderr)
        continue
    if bedfield[0] != prevchr:
        print('Switching to %s ...' % bedfield[0], file=sys.stderr)
        prevchr = bedfield[0]
        previndex = seqref[bedfield[0]]
    # extract sequences; start from an empty record (the seq argument must
    # be a Seq, not a plain string)
    thisseq = SeqRecord(Seq(''))
    for i in range(len(exonlen)):
        thisseq += previndex[exonstart[i]:(exonstart[i] + exonlen[i])]
    if forcelength:
        if sum(exonlen) < readlength:
            thisseq += filledseq * (readlength - sum(exonlen))
    thisseq.id = bedfield[3]
    thisseq.description = ''

    # mutation
    nmut = numpy.random.poisson(errrate)
    if nmut > 0:
        newseq = thisseq.seq
        for n in range(nmut):
            if len(posweight) == 0:
                # uniform distribution
                modifyposition = random.choice(range(len(newseq)))
def smart_adjoin(msa1, msa2, sequence_source): """ Given two Multiple Sequence Alignments (MSAs) on the same source sequences, with correct annotations, concatenate them together, with the intervening sequences unaligned. Either MSA may be None, in which case the other is returned. Requires a function that, when passed a sequence ID, returns the SeqRecord for the full sequence. Requires that there be a valid way to attach the two sequences together (i.e. the same sequence doesn't run in different directions in the two blocks). Raises a RuntimeError if the two MSAs cannot be adjoined. """ if msa1 is None: # Nothing plus something equals that thing. return msa2 if msa2 is None: # Nothing plus something equals that thing. return msa1 logging.debug("Adjoining {}bp and {}bp reference alignments".format( msa1[0].annotations["size"], msa2[0].annotations["size"])) for seq1, seq2 in itertools.izip(msa1, msa2): # Check all the sequences if seq1.annotations["strand"] != seq2.annotations["strand"]: # These alignments are to opposite reference strands and cannot be # adjoined. raise RuntimeError("Can't adjoin alignments on opposite strands") if msa2[0].annotations["start"] < msa1[0].annotations["start"]: # Whatever strand we're on for the first sequence, alignment 2 needs to # happen first. msa2, msa1 = msa1, msa2 # We're going to get the sequence needed to go from the end of MSA1 to the # start of MSA2. intervening_sequences = [] for seq1, seq2 in itertools.izip(msa1, msa2): # For each pair of sequence pieces, we need the sequence from #1 to #2, # on the appropriate strand. # Where does the intervening sequence start along the strand in # question? Remember MAF coordinates are 0-based. intervening_start = seq1.annotations["start"] + seq1.annotations["size"] # And where does it end? (1 past the end) intervening_end = seq2.annotations["start"] if intervening_end < intervening_start: # We're always going up in strand-local coordinates. raise RuntimeError("Sequence is trying to go backwards!") if seq1.annotations["strand"] == -1: # Convert to the correct strand. intervening_start = seq1.annotations["srcSize"] - intervening_start intervening_end = seq1.annotations["srcSize"] - intervening_end intervening_start, intervening_end = (intervening_end, intervening_start) # Go get and clip out the intervening sequence. intervening_sequence = sequence_source( seq1.id)[intervening_start:intervening_end] if seq1.annotations["strand"] == -1: # Make sure it is on the correct strand intervening_sequence = intervening_sequence.reverse_complement() # Put the clipped-out, correctly-oriented unaligned sequence in the # list. intervening_sequences.append(intervening_sequence) # We'll tack these additional alignments onto msa1 to_return = msa1 for i in xrange(len(intervening_sequences)): # Now for each intervening sequence, I need an MSA consisting of that # sequence in its correct row and gaps in all the other rows. # Make all the rows for this bit of unaligned sequence, as SeqRecords. 
        alignment_rows = [SeqRecord(Seq("-" * len(intervening_sequences[i])))
                          if j != i else intervening_sequences[i]
                          for j in xrange(len(intervening_sequences))]

        # Make them into an alignment and stick it on
        to_return = to_return + Align.MultipleSeqAlignment(alignment_rows)

    # Now stick on msa2
    to_return = to_return + msa2

    for i in xrange(len(to_return)):
        # Do the annotations for each record in the alignment

        # Set the ID
        to_return[i].id = msa1[i].id

        # Start with the annotations from msa1, so start is correct
        to_return[i].annotations.update(msa1[i].annotations)

        # Compute the actual sequence length that ought to be used here.
        to_return[i].annotations["size"] = (msa2[i].annotations["start"] +
                                            msa2[i].annotations["size"] -
                                            msa1[i].annotations["start"])

        # Make sure the size is correct.
        assert (len(str(to_return[i].seq).replace("-", "")) ==
                to_return[i].annotations["size"])

    # Give back the final adjoined alignment
    return to_return
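# Added usage sketch for smart_adjoin (hedged): the sequences and coordinates
# are made up, both blocks sit on the + strand of the same 8bp sources, and
# the module's own imports (logging, itertools) are assumed to be in place.
from Bio import Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

full = {"first": SeqRecord(Seq("ATATGCGC"), id="first"),
        "second": SeqRecord(Seq("ATATGCGC"), id="second")}

def _row(name, text, start):
    # Build one aligned row with the MAF-style annotations smart_adjoin reads.
    row = SeqRecord(Seq(text), id=name)
    row.annotations = {"strand": 1, "start": start,
                       "size": len(text), "srcSize": 8}
    return row

msa1 = Align.MultipleSeqAlignment([_row("first", "AT", 0),
                                   _row("second", "AT", 0)])
msa2 = Align.MultipleSeqAlignment([_row("first", "GC", 4),
                                   _row("second", "GC", 4)])

# The 2bp between the blocks is pulled from `full` and left unaligned.
joined = smart_adjoin(msa1, msa2, lambda name: full[name])
print(joined)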