def load_ids(filename, filetype):
    """Yield the sequence identifiers found in *filename*.

    This is a generator: nothing is opened, imported or validated until the
    caller starts iterating, and that includes the ``sys.exit`` calls below.

    Args:
        filename: Path of the file to read.
        filetype: One of ``"tabular"``, ``"fasta"``, ``"sff"``, or any string
            starting with ``"fastq"``.

    Yields:
        str: One identifier per record.

        * tabular - the first tab-separated column of every line that is
          neither empty nor starts with ``#``.
        * fasta - the first whitespace-delimited word after ``>`` on each
          header line.
        * fastq - the first whitespace-delimited word of the record
          identifier, with its leading character removed.
        * sff - the keys of ``Bio.SeqIO.index(filename, "sff")``.

    Raises:
        SystemExit: If *filetype* is not recognised, or if Biopython cannot
            be imported for an SFF file.
    """
    if filetype == "tabular":
        # "with" closes the file even if the caller stops iterating early.
        with open(filename) as handle:
            for line in handle:
                line = line.rstrip("\n")
                if line and not line.startswith("#"):
                    yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        with open(filename) as handle:
            for line in handle:
                if line.startswith(">"):
                    yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader

        # The "U" flag was removed in Python 3.11 (open() raises ValueError);
        # Python 3 text mode already translates newlines, so only Python 2
        # still needs it.
        mode = "r" if sys.version_info[0] >= 3 else "rU"
        with open(filename, mode) as handle:
            for record in fastqReader(handle):
                # The [1:] is because the reader leaves the @ on the
                # identifier.
                yield record.identifier.split()[0][1:]
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
def main():
    """Write FASTA records to stdout in the leaf order of a ladderized tree.

    ``sys.argv[1]`` is read as a Newick tree and ``sys.argv[2]`` is indexed
    as a FASTA file.  A tree that is not rooted is rooted at its midpoint
    before being ladderized with ``reverse=True``.  Each terminal's ``name``
    is then used as the key into the FASTA index.
    """
    newick_path = sys.argv[1]
    phylogeny = read(newick_path, 'newick')
    fasta_path = sys.argv[2]
    records = index(fasta_path, 'fasta')

    # Only re-root when the tree does not already carry a root.
    if not phylogeny.rooted:
        phylogeny.root_at_midpoint()
    phylogeny.ladderize(reverse=True)

    # Emit one record per leaf, in the order the tree reports its terminals.
    for tip in phylogeny.get_terminals():
        record = records[tip.name]
        write(record, sys.stdout, 'fasta')
hmmResultsDir = "hmm/results"
# crisprFiles = load(open("pickles/CRISPRs.p","rb"))
# casOperons = CasOperons(gene)
# casOperons.hasCas9(hmmResultsDir,crisprFiles)

# Load the pickled operon collection.  The handle is closed as soon as the
# object has been read instead of being left to the garbage collector.
# NOTE(review): unpickling executes arbitrary code; only safe because this
# file is produced by this pipeline - confirm it is never user-supplied.
with open("pickles/Cas9_Operons_HMM.p", 'rb') as operonPickle:
    casOperons = load(operonPickle)

#Get unique chrs and the proteins they are associated with
allCasAsmFile = "assemblies/All_%s_Unique_Assemblies.fasta" % (gene)
allCasAAsFile = "proteins/All_%s-Like.faa" % (gene)
casOperons.uniqueNukeSeqs(allCasAsmFile, allCasAAsFile)  # Calls dump when it finishes

# Launch the domain search for the faa file created above
system(
    "sbatch /mnt/research/germs/shane/transActRNA/scripts/hpc/DomainSearch.sb")

# Protein IDs known to the operon sequence map but missing from the protein
# FASTA that was just written.
casAAs = dict(index(allCasAAsFile, "fasta"))
unUsed = set(casOperons.seqMap.protToAsm).difference(casAAs)

deletedOperons = {}
# Only absnt is updated below; pres and hasSeq are initialised here and left
# untouched by this section.
pres, absnt, hasSeq = 0, 0, 0
for protID in unUsed:
    try:
        operon = casOperons.operons[casOperons.seqMap[protID]]
        deletedOperons[protID] = operon
    except Exception:
        # Best-effort lookup: count the miss and carry on.  Catching
        # Exception rather than using a bare except lets Ctrl-C and
        # SystemExit propagate.
        absnt += 1

from pickle import dump

# Write inside a with-block so the pickle is flushed and closed before the
# script moves on.
with open("/mnt/research/germs/shane/transActRNA/data/pickles/DeletedOperons.p",
          "wb") as deletedPickle:
    dump(deletedOperons, deletedPickle)
return right_dist else: return -left_dist if __name__ == "__main__": data = pd.read_table(sys.argv[1], sep='\s+', comment='#', names=[ 'feature_id', '_1', '_2', '_3', 'position', '_5', '_6', '_7', 'score', '_9', '_10', '_11' ], index_col='feature_id', usecols=['feature_id', 'position', 'score']) seqindex = index(sys.argv[2], 'fasta') for feature_id, (_position, score) in data.iterrows(): position = int(_position) assert position == _position position -= 1 seq = seqindex[feature_id].seq c_pos = relative_pos_closest(str(seq), position, 'C') if c_pos is None: # Print an empty string if there is no 'C'. c_pos = '' print( feature_id, position, round(score, 3),
type=int, dest='min_length', default=1000, help="Minimum contig length to output [%(default)s]") p.add_argument('seq_path', type=str, metavar="FASTA", help="Sequences to be trimmed.") p.add_argument( 'corr_handle', type=argparse.FileType('r'), metavar="CORR", help="Correlation table from calculate_per_position_stats.py") args = p.parse_args() seqs = index(args.seq_path, 'fasta') data = pd.read_table( args.corr_handle, names=['contig_id', 'position', 'total_depth', 'cosine_similarity']) data.contig_id = data.contig_id.astype(str) # FIXME: Brittle data.position = data.position - 1 # Convert to zero-indexed. tally_seqs = 0 tally_nucs = 0 for contig_id in tqdm(list(seqs.keys())): seq = seqs[contig_id].seq if len(seq) < args.min_length: # print(f"Contig {contig_id} too short.", file=sys.stderr) continue d = data[data.contig_id == contig_id] if d.empty: print("\rWARNING: {} not found in corr data.".format(contig_id),
#!/usr/bin/env python3
# Print sub-sequences of FASTA records.
#
# argv[1] is the FASTA file to index; every further argument is either
# "SEQ_ID" (whole record) or "SEQ_ID:LEFT-RIGHT".  When LEFT is greater than
# RIGHT the slice [RIGHT:LEFT] is reverse-complemented.

from Bio.SeqIO import index
from copy import deepcopy
import sys

if __name__ == "__main__":
    fasta_index = index(sys.argv[1], 'fasta')
    for spec in sys.argv[2:]:
        # Split on the last ':' only, so the ID keeps any earlier colons.
        head, colon, span = spec.rpartition(':')
        record_id = head if colon else spec

        # Work on a copy so the record handed back by the index is not
        # modified.
        record = deepcopy(fasta_index[record_id])

        if colon:
            start_text, stop_text = span.split('-')
            start = int(start_text)
            stop = int(stop_text)
        else:
            start = 0
            stop = len(record)

        if start > stop:
            # Reversed coordinates: slice the forward range, then flip it.
            record.seq = record.seq[stop:start].reverse_complement()
        else:
            record.seq = record.seq[start:stop]

        print(f'>{record.id}\n{record.seq}')
for codon in TRANSLATION: POSITIONS[codon] = _count_positions(codon) def count_positions(codon): return POSITIONS[codon] if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument('align1', metavar='FASTA1') p.add_argument('align2', metavar='FASTA2', nargs='?') args = p.parse_args() if args.align2: rec_index1 = index(args.align1, 'fasta') rec_index2 = index(args.align2, 'fasta') comparisons = [] for idA, idB in zip(rec_index1.keys(), rec_index2.keys()): comparisons.append((idA, rec_index1[idA], idB, rec_index2[idB])) else: rec_index = index(args.align1, 'fasta') comparisons = [] ids = list(rec_index.keys()) for i, idA in enumerate(ids): for j, idB in enumerate(ids[i + 1:]): comparisons.append((idA, rec_index[idA], idB, rec_index[idB])) for idA, recA, idB, recB in comparisons: seqA = recA.seq seqB = recB.seq
gene = "Cas9" assemblyDir = "assemblies/assemblies_W_%s/" % (gene) cas9Assemblies = listdir(assemblyDir) goodDomIDS = load(open("pickles/%s_GoodDomainIDS.p" % (gene), "rb")) goodDomMap = load(open("pickles/%s_GoodDomMap.p" % (gene), "rb")) hmm_parser = load(open("pickles/%s_HMM_Parsing_Results.p" % (gene), "rb")) print("All loaded") #Copy unique nucleotide sequence from Bio.SeqIO import index nukSeqHash, protSeqHash = set(), set() alreadyGotIt, count = 0, 0 for assembly in cas9Assemblies: baseID = assembly[:-6] allAssemblySeqs = index(assemblyDir + assembly, "fasta") overlap = goodDomIDS.intersection(allAssemblySeqs.keys()) for recID in overlap: seq = str(allAssemblySeqs[recID].seq).upper() if seq in nukSeqHash and len(goodDomMap[recID]) == 1: alreadyGotIt += 1 continue nukSeqHash.add(seq) #There may be more than 1 protein on the pseudochromosome, save both as separate files if len(goodDomMap[recID]) > 1: print("%i Cas9s on %s %s" % (len(goodDomMap[recID]), recID, baseID)) for orfID in goodDomMap[recID]: # protSeq = str(hmm_parser.results[baseID].proteins[orfID].seq).upper() with open("assemblies/pseudoChromos/%s.fasta" % (orfID), "w") as fh: