try: bead[barcode]['s'][record[1]] += 1 except KeyError: bead[barcode] = {'s': {}} bead[barcode]['s'][record[1]] = 1 seqCount += 1 if seqCount % 1000000 == 0: with open('bead_dereplicate.log', 'w') as f: f.write('Processed {0} sequences. Currently found {1} beads.'\ .format(seqCount, len(bead))) else: pass # Unasembled forward and reverse reads twinFile = args.twin.split(',') seqTwin = seqIO.sequence_twin(twinFile[0], twinFile[1], fastx='q') for r1, r2 in seqTwin: barcode = r1[0].split( '/' )[-1] # There is no check up for the correspondence of barcode in R1 and R2! seq = r1[1] + '_' + r2[1] if seq.find('N') == -1: # Keep sequence without N try: bead[barcode]['t'][seq] += 1 except KeyError: try: bead[barcode]['t'] = {} bead[barcode]['t'][seq] = 1 except KeyError: bead[barcode] = {'t': {}}
print('Splited reads are writing under the folder {0} . . .'.format( outputFolder)) # Create the list of output files. There are 6x6x6 files, plus one for error. outputFileDict = {(0, 0, 0): outputFolder + '/' + '0_0_0.fq'} for x in range(1, 7): for y in range(1, 7): for z in range(1, 7): outputFileDict[(x, y, z)] = '{3}/{0}_{1}_{2}.gp.fq'.format( x, y, z, outputFolder) # Assign read into bins using the first barcode as key with open('stlfr_split_sm.log', 'w') as f1: count = 0 error_count = 0 seqs = seqIO.sequence_twin(r1File, r2File) for r1, r2 in seqs: count += 1 if count // 1000000 >= 1: f1.write('\tProcessed {8.2f} M reads.'.format(count // 1000000)) bead = number_tuple(barcode_set(r2[1]), NoSnpDict, OneSnpDict) if bead: b = '_'.join([str(i) for i in bead]) r1[0] = r1[0][:-2] + '/' + b + '/1' r2[0] = r2[0][:-2] + '/' + b + '/2' r2[1] = r2[1][:100] r2[3] = r2[3][:100] seqIO.write_seqs([r1, r2], outputFileDict[number2ord(bead)], fastx='q', mode='a',
# Plus the original 21 barcodes, the total candidates will be 399 eeBarcodesMut = {} # This is the final index for splitting for key, value in mutationPool[1].items(): for string in value: eeBarcodesMut[string] = key for key, value in eeBarcodes.items(): # add the origin 21 barcodes eeBarcodesMut[key] = value print('The final lookin up dictionary has {0} barcodes.'.format( len(eeBarcodesMut))) #%% Split # Reads will be read in in pair, and assign to each barcode if eligible print('Start splitting {0} ...'.format(' '.join(inputFile))) count = [0, 0] split = {i + 1: [] for i in range(21)} seqHandle = seqIO.sequence_twin(inputFile[0], inputFile[1]) for r1, r2 in seqHandle: barcodeR1 = eeBarcodesMut.get(r1[1][:6], 999) barcodeR2 = eeBarcodesMut.get(r2[1][:6], 998) if barcodeR1 == barcodeR2: split[barcodeR1].append((r1, r2)) count[0] += 1 else: count[1] += 1 s = sum([len(i) >= threshold for i in split.values()]) scount = sum([len(i) for i in split.values() if len(i) >= threshold]) print('Finished splitting:') print('\tTotal reads:\t\t{0}'.format(sum(count))) print('\tEligible reads:\t\t{0}\t{1:3.2f}%.'.format( count[0], count[0] / sum(count) * 100)) print('\tReads passed threshold:\t{0}\t{1:3.2f}%'.format(