def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align seq with hitSeq, trim both aligned sequences, and build a divergence test.

    seqId, seq: id and sequence written as the first fasta record given to the aligner.
    hitSeqId, hitSeq: id and sequence written as the second fasta record.
    workPath: handed to alignFastaClustalw; not used when kalign does the alignment.
    returns: (alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq, divergencePredicate).
      The first two items are (id, trimmed aligned sequence) pairs for the first and
      second records parsed back from the aligner output (presumably query, then hit).
      divergencePredicate(divergenceThreshold) returns True when the alignment counts
      as too diverged for that threshold.
    raises: exceptions raised while parsing the aligner output (including the unpacking
      error when it does not hold exactly two records) are re-raised with inputFasta
      and alignedFasta appended to their args.
    '''
    # Both sequences are sent to the aligner as one two-record fasta string, so
    # that per-site comparison of the aligned sequences is possible afterwards.
    inputFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    alignedFasta = (alignFastaClustalw(inputFasta, workPath) if USE_CLUSTALW
                    else alignFastaKalign(inputFasta))

    if not alignedFasta:
        # An empty result is treated as a rare, intermittent aligner failure:
        # log it, pause briefly, and retry once (the retry always uses kalign).
        logging.error('fasta alignment failed.\ninputFasta=%s\n'
                      'alignedFasta=%s\nSleep and retry alignment.',
                      inputFasta, alignedFasta)
        time.sleep(0.1)
        alignedFasta = alignFastaKalign(inputFasta)

    try:
        # Parse the aligner output back into (id, aligned sequence) pairs;
        # exactly two records are expected.
        parsedRecords = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        alignedPairs = [(fasta.idFromName(nameline), alignedSeq)
                        for nameline, alignedSeq in parsedRecords]
        alignedIdAndSeq, alignedHitIdAndSeq = alignedPairs
    except Exception as e:
        # Attach the aligner input and output to the exception for debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    def gapStats(idAndSeq):
        '''Return (dash count, dash fraction, id, sequence) for one aligned record.'''
        ident, alignedSeq = idAndSeq
        dashes = alignedSeq.count('-')
        return (dashes, dashes / float(len(alignedSeq)), ident, alignedSeq)

    # Order the two records by dash count (ties fall through to the remaining
    # tuple fields): the first is the least diverged, the second the most.
    leastDiverged, mostDiverged = sorted(
        gapStats(pair) for pair in (alignedIdAndSeq, alignedHitIdAndSeq))
    leastDivergedDiv, leastDivergedSeq = leastDiverged[1], leastDiverged[3]

    # Trim amounts and trim divergence are derived from the most diverged sequence.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDiverged[3])

    def divergencePredicate(divergenceThreshold):
        '''
        Return True if the alignment is too diverged for divergenceThreshold.

        True when the least diverged sequence is non-empty and its dash fraction
        is above the threshold, or when any trimming is applied and the trim
        divergence is at or above the threshold. Closes over the values computed
        by the enclosing call.
        '''
        if leastDivergedSeq and leastDivergedDiv > divergenceThreshold:
            return True
        return bool((startTrim or endTrim) and trimDivergence >= divergenceThreshold)

    def trimmed(idAndSeq):
        '''Cut startTrim characters off the front and endTrim off the back.'''
        ident, alignedSeq = idAndSeq
        return (ident, alignedSeq[startTrim:(len(alignedSeq) - endTrim)])

    return trimmed(alignedIdAndSeq), trimmed(alignedHitIdAndSeq), divergencePredicate
def makeGetSeqForId(genomeFastaPath):
    '''
    Load a genome fasta file into memory and return a lookup function for it.

    genomeFastaPath: location of the fasta file, passed to fasta.readFasta.
    returns: a function taking a sequence id and returning its sequence; the
      function raises KeyError for an id that was not in the file.
    '''
    # The whole file is held in a dict keyed by sequence id. An in-memory dict
    # performs much better than on-disk retrieval with xdget or fastacmd, and
    # genome fasta files do not take much space on a modern computer.
    # If an id occurs more than once, the last record wins.
    sequencesById = dict(
        (fasta.idFromName(nameline), sequence)
        for nameline, sequence in fasta.readFasta(genomeFastaPath))

    def getSeqForIdInMemory(seqId):
        return sequencesById[seqId]

    return getSeqForIdInMemory
def create_fv_files(processes=8):
    '''
    Build a feature vector for every protein in parallel and write them out.

    Reads the fasta file named by inputFile and the profiles named by profile,
    runs form_feature_vector once per protein id in a pool of worker processes,
    and passes the collected {protein id: feature vector} dict to
    write_feature_vector together with outputFile. inputFile, profile and
    outputFile are taken from the enclosing scope.

    processes: number of worker processes in the pool (default 8).
    returns: None.
    raises: any exception raised by a worker is re-raised here by get(); the
      pool is shut down before it propagates.
    '''
    fasta_dict = readFasta(inputFile)
    profile_dict = read_profiles(profile)

    pool = mp.Pool(processes=processes)
    try:
        # Submit every protein first so the workers run concurrently, then
        # collect the results in submission order.
        pending = [
            pool.apply_async(form_feature_vector,
                             args=(prot_id, fasta_dict[prot_id], profile_dict))
            for prot_id in fasta_dict
        ]
        fv_dict_raw = {}
        for job in pending:
            # Each worker hands back a (protein id, feature vector) pair.
            prot_id, fv = job.get()
            fv_dict_raw[prot_id] = fv
    finally:
        # Always release the worker processes. On the success path every result
        # has already been fetched, so terminating discards no work; on the
        # failure path it stops the remaining tasks instead of leaking workers.
        pool.terminate()
        pool.join()

    write_feature_vector(outputFile, fv_dict_raw)
count[posdict[c]] += 1 return ranks #Gather arguments from the user if (len(sys.argv) < 4): #If in the incorrect form, return an error message print("Arguments must be of the form : referencefile, readsfile, k, dmax.") exit(0) #referencefile and readsfile must be file names, k and dmax integers. referencefile, readsfile, kmerLength, dmax = sys.argv[1], sys.argv[2], int( sys.argv[3]), int(sys.argv[4]) #Initialization of the reference file #readFasta only take the sequence of bases, and the $ is for the BWT, to mark the end of the string. reference = (fasta.readFasta(referencefile)).lower() + "$" #Initialization of reads and readsInv, its reverse complementary reads, readsBioPalind = [], [] for line in open(readsfile, "r"): if line[0] != ">": #lines with > do not contain sequences, but merely comments about the sequences. reads.append( line[:-1].lower() ) #-1 to remove \n. To lower case for practical reasons when calling posdict. readsBioPalind.append(biologicalPalyndrome( line[:-1].lower())) #We also stock the biological palyndromes #We create SA, BWT, Rank and F from reference print("generating SA")
def getGoodDivergenceAlignedTrimmedSeqPair(seqId, seq, hitSeqId, hitSeq, workPath):
    '''
    Align seq with hitSeq, trim both aligned sequences, and build a divergence test.

    seqId, seq: id and sequence written as the first fasta record given to the aligner.
    hitSeqId, hitSeq: id and sequence written as the second fasta record.
    workPath: handed to alignFastaClustalw; not used when kalign does the alignment.
    returns: (alignedTrimmedIdAndSeq, alignedTrimmedHitIdAndSeq, divergencePredicate).
      The first two items are (id, trimmed aligned sequence) pairs for the first and
      second records parsed back from the aligner output (presumably query, then hit).
      divergencePredicate(divergenceThreshold) returns True when the alignment counts
      as too diverged for that threshold.
    raises: exceptions raised while parsing the aligner output (including the unpacking
      error when it does not hold exactly two records) are re-raised with inputFasta
      and alignedFasta appended to their args.
    '''
    # Both sequences are sent to the aligner as one two-record fasta string, so
    # that per-site comparison of the aligned sequences is possible afterwards.
    inputFasta = '>%s\n%s\n>%s\n%s\n' % (seqId, seq, hitSeqId, hitSeq)
    alignedFasta = (alignFastaClustalw(inputFasta, workPath) if USE_CLUSTALW
                    else alignFastaKalign(inputFasta))

    if not alignedFasta:
        # An empty result is treated as a rare, intermittent aligner failure:
        # log it, pause briefly, and retry once (the retry always uses kalign).
        logging.error('fasta alignment failed.\ninputFasta=%s\n'
                      'alignedFasta=%s\nSleep and retry alignment.',
                      inputFasta, alignedFasta)
        time.sleep(0.1)
        alignedFasta = alignFastaKalign(inputFasta)

    try:
        # Parse the aligner output back into (id, aligned sequence) pairs;
        # exactly two records are expected.
        parsedRecords = list(fasta.readFasta(cStringIO.StringIO(alignedFasta)))
        alignedPairs = [(fasta.idFromName(nameline), alignedSeq)
                        for nameline, alignedSeq in parsedRecords]
        alignedIdAndSeq, alignedHitIdAndSeq = alignedPairs
    except Exception as e:
        # Attach the aligner input and output to the exception for debugging.
        e.args += (inputFasta, alignedFasta)
        raise

    def gapStats(idAndSeq):
        '''Return (dash count, dash fraction, id, sequence) for one aligned record.'''
        ident, alignedSeq = idAndSeq
        dashes = alignedSeq.count('-')
        return (dashes, dashes / float(len(alignedSeq)), ident, alignedSeq)

    # Order the two records by dash count (ties fall through to the remaining
    # tuple fields): the first is the least diverged, the second the most.
    leastDiverged, mostDiverged = sorted(
        gapStats(pair) for pair in (alignedIdAndSeq, alignedHitIdAndSeq))
    leastDivergedDiv, leastDivergedSeq = leastDiverged[1], leastDiverged[3]

    # Trim amounts and trim divergence are derived from the most diverged sequence.
    startTrim, endTrim, trimDivergence = dashlen_check(mostDiverged[3])

    def divergencePredicate(divergenceThreshold):
        '''
        Return True if the alignment is too diverged for divergenceThreshold.

        True when the least diverged sequence is non-empty and its dash fraction
        is above the threshold, or when any trimming is applied and the trim
        divergence is at or above the threshold. Closes over the values computed
        by the enclosing call.
        '''
        if leastDivergedSeq and leastDivergedDiv > divergenceThreshold:
            return True
        return bool((startTrim or endTrim) and trimDivergence >= divergenceThreshold)

    def trimmed(idAndSeq):
        '''Cut startTrim characters off the front and endTrim off the back.'''
        ident, alignedSeq = idAndSeq
        return (ident, alignedSeq[startTrim:(len(alignedSeq) - endTrim)])

    return trimmed(alignedIdAndSeq), trimmed(alignedHitIdAndSeq), divergencePredicate