# Unpack the parsed command-line options into module-level settings.
args = parser.parse_args()
inputFile, outputRaw, outputThreshold = args.i, args.rawout, args.tout
threshold = args.threshold
kmerSize = args.k

# Load the JSON-lines bead file: one bead per line, one k-mer pool per bead.
with open(inputFile, 'r') as handle:
    kmerPool = [
        kmer.kmerCount(bead.beadSequence(json.loads(record)), kmerSize)
        for record in handle
    ]
beadCount = len(kmerPool)
print('Found {0} beads.'.format(beadCount))

# Score every unordered pair of beads.  ``edge`` keeps all pairs, while
# ``edgeThreshold`` keeps only those whose distance lies inside the inclusive
# [threshold[0], threshold[1]] window.
edge = []
edgeThreshold = []
for first, second in combinations(kmerPool, 2):
    distance = kmer.kmerDistance((first.set, second.set)).mashDistance()
    scored = (first.barcode, second.barcode, distance)
    edge.append(scored)
    if threshold[0] <= distance <= threshold[1]:
        edgeThreshold.append(scored)

# n1: number of pairs scored; n2: number of pairs inside the threshold window.
n1 = len(edge)
n2 = len(edgeThreshold)
default='kmerReport.tsv', help='Report file on Kmers.') args = parser.parse_args() inputFile = args.i outputFile = args.o kmerSize = args.k reportFile = args.report kmerPool = [] report = {} kmerFrag = [] with open(inputFile, 'r') as f: for line in f: b = bead.beadSequence(json.loads(line)) kmers = kmer.kmerCount(b, kmerSize) barcode = b.barcode kmerPool.append({barcode: kmers.kmers}) kmerNumber = len(kmers.kmers) fragNumber = len(b.fragments) kmerFrag.append((kmerNumber, fragNumber)) report[kmerNumber] = report.get(kmerNumber, 1) + 1 with open(outputFile, 'w') as f: for line in kmerPool: f.write('{0}\n'.format(json.dumps(line))) report = sorted([x for x in report.items()], key=lambda i: i[0]) with open(reportFile, 'w') as f: f.write('KmerNumber\tCount\n') for line in report:
def main():
    """Compute pairwise k-mer distances between beads using worker processes.

    Reads one JSON record per line from the module-level ``inputFile``, builds
    a k-mer pool for each bead with ``kmer.kmerCount(bead, kmerSize)``, splits
    all unordered pairs of pools into ``job`` contiguous slices and starts one
    ``kmerDistanceWorker`` process per slice.  While the workers run, a
    percentage is written to stderr.  Once they have finished, every entry
    found in the shared ``edge`` list is written to ``outputRaw`` as
    tab-separated ``Source``/``Target``/``Distance`` rows.

    Relies on the module-level names ``inputFile``, ``kmerSize``, ``job`` and
    ``outputRaw``.
    """
    # Read in the JSON-bead file and calculate the k-mer pool of every bead.
    kmerPool = []
    with open(inputFile, 'r') as f:
        for line in f:
            b = bead.beadSequence(json.loads(line))
            kmerPool.append(kmer.kmerCount(b, kmerSize))
    beadCount = len(kmerPool)
    print('Found {0} beads.'.format(beadCount))

    # Set up the parallel environment: shared lists with one slot per worker.
    # NOTE(review): presumably worker ``i`` stores its results in ``edge[i]``
    # and its progress in ``count[i]`` -- confirm against kmerDistanceWorker.
    manager = Manager()
    # Build a distinct list per slot instead of aliasing one list ``job`` times.
    edge = manager.list([[] for _ in range(job)])
    count = manager.list([0] * job)
    print('Starting mash distance ...')

    # Materialise every unordered pair so it can be sliced between workers.
    pairPool = list(combinations(kmerPool, 2))
    size = len(pairPool)
    print('Total is {0} pairs.'.format(size))
    step = size // job
    print('Step is {0}'.format(step))
    start = 0
    workers = []
    print(len(pairPool))
    for i in range(job):
        if i + 1 < job:
            # Not the last job: take exactly ``step`` pairs.
            chunk = pairPool[start:start + step]
            start += step
            print('Start change to {0}'.format(start))
        else:
            # The last job also takes the remainder of the integer division.
            chunk = pairPool[start:]
        workers.append(
            Process(target=kmerDistanceWorker, args=(chunk, edge, i, count)))

    print('Starting %i jobs ...' % job)
    for count_worker, j in enumerate(workers, 1):
        j.start()
        print('Starting thread No. %i ...' % count_worker)

    # Poll until every worker has exited, reporting progress on stderr.
    job_alive = True
    while job_alive:
        time.sleep(0.01)
        job_alive = any(j.is_alive() for j in workers)
        # With fewer than two beads there are no pairs; skip the percentage
        # rather than dividing by zero.
        if size:
            progress = str(sum(count) / size * 100) + "\r"
            sys.stderr.write(progress)
    for j in workers:
        j.join()
    print('Finished dereplicating.')

    # Dump every collected edge as a tab-separated row.
    with open(outputRaw, 'w') as f:
        f.write('Source\tTarget\tDistance\n')
        for item in edge:
            for line in item:
                f.write('{0}\t{1}\t{2}\n'.format(line[0], line[1], line[2]))
default='kmerReport.tsv', help='A report on kmer distribuion on Kmers.') args = parser.parse_args() inputFile = args.i outputFile = args.o k = args.k reportFile = args.report kmerPool = [] report = {} kmerFrag = [] with open(inputFile, 'r') as f: for line in f: b = bead.beadSequence(json.loads(line)) kmers = kmer.kmerCount(b, k) barcode = b.barcode kmerPool.append({barcode: kmers.kmers}) kmerNumber = len(kmers.kmers) fragNumber = len(b.fragments) kmerFrag.append((kmerNumber, fragNumber)) report[kmerNumber] = report.get(kmerNumber, 1) + 1 with open(outputFile, 'w') as f: for line in kmerPool: f.write('{0}\n'.format(json.dumps(line))) report = sorted([x for x in report.items()], key=lambda i: i[0]) with open(reportFile, 'w') as f: f.write('KmerNumber\tCount\n') for line in report: