Пример #1
0
def write_insertion_repeat_dir(data,outRepeat):
    """Write the inserted sequence of one site to outRepeat as a '>' record.

    The insertion is sliced out of data['contigSeqGenomeDir'] between
    data['leftBpContigCoord'] and data['rightBpContigCoord']-1, reverse
    complemented when the strand reported by figure_sine_strand() is '-',
    and written as '>siteID' followed by the sequence.

    data      -- dict with keys 'contigSeqGenomeDir', 'leftBpContigCoord',
                 'rightBpContigCoord', 'contigSeqFileNameRM' and 'siteID'
    outRepeat -- object with a write() method (presumably an open file)
    """
    contig = data['contigSeqGenomeDir']
    start = data['leftBpContigCoord']
    stop = data['rightBpContigCoord'] - 1
    inserted = contig[start:stop]

    # strand of the repeat, taken from the RM file for this contig
    # (presumably RepeatMasker output -- confirm against read_rm_file)
    strand = figure_sine_strand(read_rm_file(data['contigSeqFileNameRM']))
    if strand == '-':
        inserted = genutils.revcomp(inserted)

    # add_breaks_to_line presumably wraps the sequence into fixed-width lines
    wrapped = genutils.add_breaks_to_line(inserted)
    record = '>%s\n%s\n' % (data['siteID'],wrapped)
    outRepeat.write(record)
Пример #2
0
def get_contig_seq(data):
    """Load one contig, orient it, and write it to its own FASTA file.

    Reads 'combined.scaffolds.fa' from data['originalContigsDir'], picks the
    entry named data['contigName'], and reverse complements it when
    data['contigDir'] is '-'.  The oriented sequence is written to
    '<alignOutDir>/Contig.genomedir.fa' under the header '>contig'.

    Keys added to data:
        'contigSeqFileName'  -- path of the file written
        'contigSeqGenomeDir' -- the oriented contig sequence
        'contigLen'          -- length of that sequence

    Returns None; data is modified in place.
    """
    print_dictionary(data)
    contigsFileName = data['originalContigsDir'] + '/' + 'combined.scaffolds.fa'
    contigSeq = genutils.read_fasta_file_to_list(contigsFileName)

    cSeq = contigSeq[data['contigName']]['seq']
    if data['contigDir'] == '-':
        cSeq = genutils.revcomp(cSeq)

    data['contigSeqFileName'] = data['alignOutDir'] + '/' + 'Contig.genomedir.fa'
    data['contigSeqGenomeDir'] = cSeq
    contigSeqStr = genutils.add_breaks_to_line(cSeq)
    # 'with' closes the handle even if the write raises
    with open(data['contigSeqFileName'],'w') as outFile:
        outFile.write('>contig\n%s\n' % contigSeqStr)

    # NOTE(review): the file is read back but the result was never used;
    # the call is kept in case it serves as a sanity check -- confirm and
    # drop it if not needed.
    genutils.read_fasta_to_string(data['contigSeqFileName'])
    data['contigLen'] = len(cSeq)
Пример #3
0
def _count_bases_before(alignedSeq,col):
    """Return how many non-gap characters alignedSeq has in columns [0, col)."""
    prefix = alignedSeq[:col]
    return len(prefix) - prefix.count('-')


def check_seq(fq,myData):
    """Find the linker in a read and split the read into its two flanks.

    The linker and its reverse complement are each globally aligned to
    fq['seq'] (match 2, mismatch -1, gap open -.5, gap extend -.2, end gaps
    not penalized); the better-scoring orientation is kept, the forward
    linker winning ties.

    fq     -- dict with 'seq' (read sequence) and 'qual33Str' (quality
              string, assumed to be the same length as 'seq')
    myData -- dict with 'linkerSeq' and 'linkerSeqRC'

    Returns a dict.  'passChecks' is always present; 'align' holds the chosen
    alignment list.  'passChecks' is False when there is no alignment, more
    than one equally good alignment, or the score is below 49.0.  When
    'passChecks' is True the dict also has:
        'seq1' / 'seq1Qual' -- read bases left of the linker, reverse
                               complemented (qualities reversed)
        'seq2' / 'seq2Qual' -- read bases right of the linker, as is
    """
    minScore = 49.0
    result = {}
    result['passChecks'] = False

    # do not know the alignment orientation, so check both and take best score
    alignRes = pairwise2.align.globalms(myData['linkerSeq'], fq['seq'], 2, -1, -.5, -.2,penalize_end_gaps=False)
    alignResLinkerRC = pairwise2.align.globalms(myData['linkerSeqRC'], fq['seq'], 2, -1, -.5, -.2,penalize_end_gaps=False)

    # No alignment at all (presumably what pairwise2 returns for an empty
    # sequence -- confirm): report a failed check instead of IndexError.
    if not alignRes or not alignResLinkerRC:
        result['align'] = alignRes
        return result

    # each alignment is indexed as [0] aligned linker, [1] aligned read, [2] score
    if alignRes[0][2] < alignResLinkerRC[0][2]:
        alignRes = alignResLinkerRC
    result['align'] = alignRes

    # should only be one alignment, otherwise the linker position is ambiguous
    if len(alignRes) != 1:
        return result

    best = alignRes[0]
    if best[2] < minScore:
        return result

    alignedLinker = best[0]
    alignedRead = best[1]
    linkerLen = len(myData['linkerSeq'])

    # find the alignment columns holding the first and the last linker base
    linkerColStart = -1
    linkerColEnd = -1
    linkerPos = 0
    for col, ch in enumerate(alignedLinker):
        if ch != '-':
            linkerPos += 1
        if linkerPos == 1 and linkerColStart == -1:
            linkerColStart = col
        if linkerPos == linkerLen and linkerColEnd == -1:
            linkerColEnd = col
    if linkerColStart == -1 or linkerColEnd == -1:
        # aligned linker does not contain linkerLen bases; cannot place it
        return result

    # Go from column numbers to 0-based read offsets by counting read bases.
    # Counting the bases strictly before the linker's first column (rather
    # than taking a running 1-based position minus one) stays correct when
    # the read has a gap in that column, e.g. when the linker overhangs the
    # start of the read; the old form gave an end one too small, or -1.
    leftEnd = _count_bases_before(alignedRead,linkerColStart)
    rightStart = _count_bases_before(alignedRead,linkerColEnd + 1)

    leftSeq = fq['seq'][:leftEnd]
    rightSeq = fq['seq'][rightStart:]
    leftSeqQual = fq['qual33Str'][:leftEnd]
    rightSeqQual = fq['qual33Str'][rightStart:]

    result['passChecks'] = True
    # need to reverse comp R1 due to structure of the library
    result['seq1'] = genutils.revcomp(leftSeq)
    result['seq1Qual'] = leftSeqQual[::-1]
    result['seq2'] = rightSeq
    result['seq2Qual'] = rightSeqQual
    return result
Пример #4
0
###############################################################################
if options.outDir[-1] != '/':
    options.outDir += '/'

# setup file location info
myData = {}
myData['filesToDelete'] = []
myData['filesToGzip'] = []
myData['r1fq'] = options.r1fq
myData['r2fq'] = options.r2fq
myData['sampleName'] = options.sampleName
myData['outDir'] = options.outDir
myData['linkerSeq'] = 'CTGCTGTACCGTTCTCCGTACAGCAG'
# rev of linker is also possible, since do not know orietnation
myData['linkerSeqRC'] = genutils.revcomp(myData['linkerSeq'])


print 'Processing %s' % myData['sampleName']
#run pear to join together reads that overlap
run_pear(myData)
count_num_not_assembled(myData)
print '%i reads were not assembled' % myData['numNotAssem']
count_num_discarded(myData)
print '%i reads were discarded' % myData['numDiscarded']

process_assembled(myData)
print '%i reads were assembled' % myData['numAssembled']
print '%i assembled reads failed the check' % myData['numFail']
print '%i assembled reads that passed the check but failed the length test (< 22bp)' % myData['lenFail']
print '%i total reads in original fastq' % myData['totReads']
    outReads.write(nl)
inFile.close()

print 'Writing formated output read sequences to',outSeqName    

inFile = open(samFile,'r')
for line in inFile:
    line = line.rstrip()
    line = line.split('\t')
    samRec = genutils.parse_sam_line(line)

    seq = samRec['seq']
    qual = samRec['qual']
    if samRec['reverseStrand'] is True:
        qual = qual[::-1]
        seq = genutils.revcomp(seq)
    if samRec['isFirst'] is True:
        i = 0
    else:
        i = 1
    nl = [samRec['seqName'],sN,str(i),seq,qual]
    nl = '\t'.join(nl) + '\n'
    outSeq.write(nl)
inFile.close()

outSeq.close()
outReads.close()

print 'Now doing window bed to get reads associated with each call'
print 'Output to',outReadsNameIntersect