예제 #1
0
        # Count this sequence under its barcode. The per-barcode record and
        # its 's' table are created on first use only; rebinding
        # bead[barcode] on every KeyError would discard the counts already
        # collected for a barcode each time it shows a new sequence.
        seqTable = bead.setdefault(barcode, {}).setdefault('s', {})
        seqTable[record[1]] = seqTable.get(record[1], 0) + 1
        seqCount += 1
        # Every million sequences, rewrite the status file; mode 'w' replaces
        # the previous status line instead of appending to it.
        if seqCount % 1000000 == 0:
            with open('bead_dereplicate.log', 'w') as f:
                f.write('Processed {0} sequences. Currently found {1} beads.'\
                        .format(seqCount, len(bead)))
    else:
        pass

# Unassembled forward and reverse reads
# args.twin is split on ',' and its first two fields are handed to
# seqIO.sequence_twin (presumably the forward and reverse read files -- confirm
# against the argument parser).
twinFile = args.twin.split(',')
seqTwin = seqIO.sequence_twin(twinFile[0], twinFile[1], fastx='q')

for r1, r2 in seqTwin:
    # The barcode is the last '/'-separated field of r1[0].
    barcode = r1[0].split(
        '/'
    )[-1]  # There is no check up for the correspondence of barcode in R1 and R2!
    # A pair is identified by both r1[1] and r2[1], joined with '_'.
    seq = r1[1] + '_' + r2[1]
    if 'N' not in seq:  # Keep sequence without N
        try:
            bead[barcode]['t'][seq] += 1
        except KeyError:
            try:
                # Known barcode, first sighting of this pair: reuse the
                # existing 't' table (creating it only if absent). Assigning a
                # fresh {} here would erase every pair counted so far for
                # this barcode.
                bead[barcode].setdefault('t', {})[seq] = 1
            except KeyError:
                # Barcode not seen before: start its record with this pair
                # already counted once.
                # NOTE(review): the excerpt ends at this statement; if a
                # following line also sets the count to 1, that stays harmless.
                bead[barcode] = {'t': {seq: 1}}
예제 #2
0
print('Splited reads are writing under the folder {0} . . .'.format(outputFolder))

# Table of output paths keyed by a three-number tuple. Key (0, 0, 0) is the
# single extra file (the one "for error"); the other 6x6x6 keys, with every
# number in 1..6, each get their own '.gp.fq' file inside outputFolder.
binPathTemplate = '{3}/{0}_{1}_{2}.gp.fq'
outputFileDict = {}
outputFileDict[(0, 0, 0)] = outputFolder + '/0_0_0.fq'
for x in range(1, 7):
    for y in range(1, 7):
        for z in range(1, 7):
            binKey = (x, y, z)
            outputFileDict[binKey] = binPathTemplate.format(
                *binKey, outputFolder)

# Assign read into bins using the first barcode as key
# The log file stays open for the whole pass so progress lines can be written
# while the read pairs are streamed.
with open('stlfr_split_sm.log', 'w') as f1:
    count = 0
    error_count = 0
    seqs = seqIO.sequence_twin(r1File, r2File)
    for r1, r2 in seqs:
        count += 1
        # Report once per million pairs. The previous test
        # (count // 1000000 >= 1) was true for every pair after the first
        # million, and the field '{8.2f}' was read as positional index 8,
        # which raises IndexError; '{0:8.2f}' is the intended width/precision.
        if count % 1000000 == 0:
            f1.write('\tProcessed {0:8.2f} M reads.'.format(count / 1000000))
        bead = number_tuple(barcode_set(r2[1]), NoSnpDict, OneSnpDict)
        if bead:
            b = '_'.join([str(i) for i in bead])
            r1[0] = r1[0][:-2] + '/' + b + '/1'
            r2[0] = r2[0][:-2] + '/' + b + '/2'
            r2[1] = r2[1][:100]
            r2[3] = r2[3][:100]
            seqIO.write_seqs([r1, r2],
                             outputFileDict[number2ord(bead)],
                             fastx='q',
                             mode='a',
예제 #3
0
# Plus the original 21 barcodes, the total candidates will be 399
# Final lookup used for splitting: each string listed under a key of
# mutationPool[1] points back to that key.
eeBarcodesMut = {
    variant: owner
    for owner, variants in mutationPool[1].items()
    for variant in variants
}
# The original barcodes are added last, so on a clash their entry wins.
eeBarcodesMut.update(eeBarcodes.items())
print('The final lookin up dictionary has {0} barcodes.'.format(
    len(eeBarcodesMut)))

#%% Split
# Reads are consumed pair by pair. A pair is stored only when the first six
# characters of r1[1] and of r2[1] both resolve to the same barcode.
print('Start splitting {0} ...'.format(' '.join(inputFile)))
count = [0, 0]  # [pairs stored under a barcode, pairs rejected]
split = {i: [] for i in range(1, 22)}
seqHandle = seqIO.sequence_twin(inputFile[0], inputFile[1])
for r1, r2 in seqHandle:
    # The two fallbacks differ (999 vs 998) so that two unknown prefixes can
    # never compare equal.
    barcodeR1 = eeBarcodesMut.get(r1[1][:6], 999)
    barcodeR2 = eeBarcodesMut.get(r2[1][:6], 998)
    if barcodeR1 != barcodeR2:
        count[1] += 1
        continue
    split[barcodeR1].append((r1, r2))
    count[0] += 1

# s: number of barcodes holding at least `threshold` pairs;
# scount: number of pairs held by those barcodes.
binSizes = [len(pairs) for pairs in split.values()]
s = sum(1 for size in binSizes if size >= threshold)
scount = sum(size for size in binSizes if size >= threshold)

total = sum(count)
print('Finished splitting:')
print('\tTotal reads:\t\t{0}'.format(total))
print('\tEligible reads:\t\t{0}\t{1:3.2f}%.'.format(
    count[0], count[0] / total * 100))
print('\tReads passed threshold:\t{0}\t{1:3.2f}%'.format(