def write_insertion_repeat_dir(data,outRepeat):
    """Write the insertion sequence of one site to outRepeat as a FASTA-style record.

    data      -- dict; reads 'contigSeqGenomeDir', 'leftBpContigCoord',
                 'rightBpContigCoord', 'contigSeqFileNameRM' and 'siteID'
    outRepeat -- open, writable file-like object

    The insertion is the slice of data['contigSeqGenomeDir'] from
    data['leftBpContigCoord'] up to data['rightBpContigCoord'] - 1.  It is
    reverse complemented when figure_sine_strand() reports '-' for the lines
    read from data['contigSeqFileNameRM'].  The record name is data['siteID'].
    """
    sliceStart = data['leftBpContigCoord']
    sliceEnd = data['rightBpContigCoord'] - 1
    insertion = data['contigSeqGenomeDir'][sliceStart:sliceEnd]

    # figure if sine is on + or -, and flip the insertion for the - case
    strand = figure_sine_strand(read_rm_file(data['contigSeqFileNameRM']))
    if strand == '-':
        insertion = genutils.revcomp(insertion)

    wrapped = genutils.add_breaks_to_line(insertion)
    record = '>%s\n%s\n' % (data['siteID'], wrapped)
    outRepeat.write(record)
def get_contig_seq(data):
    """Load the contig for this site and save it in genome orientation.

    data -- dict; reads 'originalContigsDir', 'contigName', 'contigDir' and
            'alignOutDir'.

    The contig named data['contigName'] is looked up in
    <originalContigsDir>/combined.scaffolds.fa and reverse complemented when
    data['contigDir'] is '-'.  The result is written to
    <alignOutDir>/Contig.genomedir.fa under the record name 'contig'.

    Sets on data:
      'contigSeqFileName'  -- path of the file that was written
      'contigSeqGenomeDir' -- the (possibly reverse complemented) sequence
      'contigLen'          -- length of that sequence

    Returns None.
    """
    print_dictionary(data)

    contigsFileName = data['originalContigsDir'] + '/' + 'combined.scaffolds.fa'
    contigs = genutils.read_fasta_file_to_list(contigsFileName)
    cSeq = contigs[data['contigName']]['seq']
    if data['contigDir'] == '-':
        cSeq = genutils.revcomp(cSeq)

    data['contigSeqFileName'] = data['alignOutDir'] + '/' + 'Contig.genomedir.fa'
    data['contigSeqGenomeDir'] = cSeq

    contigSeqStr = genutils.add_breaks_to_line(cSeq)
    # 'with' makes sure the handle is closed even if the write fails
    with open(data['contigSeqFileName'],'w') as outFile:
        outFile.write('>contig\n%s\n' % contigSeqStr)

    data['contigLen'] = len(cSeq)
def _column_to_position(alignedSeq):
    """Map each alignment column to the 1 based count of residues seen so far.

    Gap columns ('-') repeat the count of the previous column, so a column
    that precedes the first residue maps to 0.
    """
    colToPos = []
    current = 0
    for symbol in alignedSeq:
        if symbol != '-':
            current += 1
        colToPos.append(current)
    return colToPos


def check_seq(fq,myData):
    """Find the linker in a read and split the read into its two flanks.

    fq     -- dict; reads 'seq' and 'qual33Str'
              (assumes both have the same length -- TODO confirm with caller)
    myData -- dict; reads 'linkerSeq' and 'linkerSeqRC'

    The read is globally aligned (end gaps not penalized) to the linker in
    both orientations and the higher scoring one is kept; ties go to
    'linkerSeq'.

    Returns a dict with
      'passChecks' -- True only if exactly one alignment was returned and
                      its score is at least 49.0
      'align'      -- the alignment list that was kept
    and, only when 'passChecks' is True,
      'seq1', 'seq1Qual' -- read left of the linker, reverse complemented,
                            with its quality string reversed to match
      'seq2', 'seq2Qual' -- read right of the linker and its quality string
    """
    minScore = 49.0
    result = {}
    result['passChecks'] = False

    # do not know the alignment orientation, so check both and take best score
    alignRes = pairwise2.align.globalms(myData['linkerSeq'], fq['seq'], 2, -1, -.5, -.2,penalize_end_gaps=False)
    alignResLinkerRC = pairwise2.align.globalms(myData['linkerSeqRC'], fq['seq'], 2, -1, -.5, -.2,penalize_end_gaps=False)

    # nothing to score if the aligner returned no alignment at all
    if not alignRes or not alignResLinkerRC:
        result['align'] = alignRes
        return result

    # compare scores
    if alignResLinkerRC[0][2] > alignRes[0][2]:
        alignRes = alignResLinkerRC
    result['align'] = alignRes

    # should only be one alignment, otherwise the linker placement is ambiguous
    if len(alignRes) != 1:
        return result

    # check score
    if alignRes[0][2] < minScore:
        return result

    # figure out coordinates
    # go through keeping track of pos to go from column number to bp number
    # do the sequence in 1 based coordinates
    linkerColToPos = _column_to_position(alignRes[0][0])
    readColToPos = _column_to_position(alignRes[0][1])

    # first column holding the first linker base / the last linker base
    linkerColStart = linkerColToPos.index(1)
    linkerColEnd = linkerColToPos.index(len(myData['linkerSeq']))

    # -1 because python is 0 based and the column map is 1 based.  If the
    # linker overhangs the start of the read the mapped position is 0, and a
    # slice end of -1 would mean "all but the last base"; clamp so the left
    # flank is empty in that case.
    leftEnd = max(readColToPos[linkerColStart] - 1, 0)
    rightStart = readColToPos[linkerColEnd]

    # passes, so take out the sequence and quals
    result['passChecks'] = True
    leftSeq = fq['seq'][:leftEnd]
    leftSeqQual = fq['qual33Str'][:leftEnd]

    # need to reverse comp R1 due to structure of the library
    result['seq1'] = genutils.revcomp(leftSeq)
    result['seq1Qual'] = leftSeqQual[::-1]
    result['seq2'] = fq['seq'][rightStart:]
    result['seq2Qual'] = fq['qual33Str'][rightStart:]
    return result
###############################################################################
# Driver: joins the read pairs with pear, then reports how many reads were not
# assembled, discarded, assembled, and how many assembled reads failed checks.

# output directory must end in a separator
if options.outDir[-1] != '/':
    options.outDir = options.outDir + '/'

# setup file location info
myData = {
    'filesToDelete': [],
    'filesToGzip': [],
    'r1fq': options.r1fq,
    'r2fq': options.r2fq,
    'sampleName': options.sampleName,
    'outDir': options.outDir,
    'linkerSeq': 'CTGCTGTACCGTTCTCCGTACAGCAG',
}
# rev of linker is also possible, since do not know orientation
myData['linkerSeqRC'] = genutils.revcomp(myData['linkerSeq'])

print('Processing %s' % myData['sampleName'])

# run pear to join together reads that overlap
run_pear(myData)

# each step below fills in the counts that are reported right after it
count_num_not_assembled(myData)
print('%i reads were not assembled' % myData['numNotAssem'])

count_num_discarded(myData)
print('%i reads were discarded' % myData['numDiscarded'])

process_assembled(myData)
print('%i reads were assembled' % myData['numAssembled'])
print('%i assembled reads failed the check' % myData['numFail'])
print('%i assembled reads that passed the check but failed the length test (< 22bp)' % myData['lenFail'])
print('%i total reads in original fastq' % myData['totReads'])
outReads.write(nl) inFile.close() print 'Writing formated output read sequences to',outSeqName inFile = open(samFile,'r') for line in inFile: line = line.rstrip() line = line.split('\t') samRec = genutils.parse_sam_line(line) seq = samRec['seq'] qual = samRec['qual'] if samRec['reverseStrand'] is True: qual = qual[::-1] seq = genutils.revcomp(seq) if samRec['isFirst'] is True: i = 0 else: i = 1 nl = [samRec['seqName'],sN,str(i),seq,qual] nl = '\t'.join(nl) + '\n' outSeq.write(nl) inFile.close() outSeq.close() outReads.close() print 'Now doing window bed to get reads associated with each call' print 'Output to',outReadsNameIntersect