def count_barcodes(dataset, VERBOSE=0, maxreads=None):
    '''Count the abundance of each barcode and plot the rank-abundance curve.

    The reads file is the one get_raw_read_files(dataset) lists under the
    'adapter' key; it is parsed as FASTQ and the full sequence of every read
    is used as the barcode.

    Parameters:
       - dataset: description of the sequencing run, passed as is to
         get_raw_read_files
       - VERBOSE: verbosity level (accepted for interface consistency, not
         used in this function)
       - maxreads: stop after this many reads. None (the default) falls back
         to a module-level `maxreads` if one is defined, else to no limit.
         Zero or negative values never match and hence mean no limit.

    Returns:
       None. The 20 most abundant (barcode, count) pairs are printed and a
       log-log plot of abundance versus rank is shown.
    '''
    # The function used to read `maxreads` straight from the module globals
    # (set by the script body); keep honouring it, but do not crash with a
    # NameError when it is missing (e.g. when called after an import).
    if maxreads is None:
        maxreads = globals().get('maxreads', -1)

    # Get the read filenames
    data_filenames = get_raw_read_files(dataset)
    datafile = data_filenames['adapter']

    # Count the abundance of each barcode
    bc_counts = defaultdict(int)
    with open(datafile, 'r') as infile:
        for rc, read in enumerate(SeqIO.parse(infile, 'fastq'), 1):
            # str() works across Biopython versions, Seq.tostring() does not
            bc_counts[str(read.seq)] += 1
            if rc == maxreads:
                break

    # Sort once by decreasing abundance, reuse for both printout and plot
    ranked = sorted(bc_counts.items(), key=lambda x: x[1], reverse=True)

    # Single-argument parenthesised form prints the same on Python 2 and 3
    print(ranked[:20])

    # Plot results
    plt.figure()
    ax = plt.subplot(111)
    plt.plot(range(1, len(ranked) + 1),
             [abundance for (_, abundance) in ranked])
    ax.set_yscale('log')
    ax.set_xscale('log')
    plt.xlabel('barcode rank')
    plt.ylabel('abundance')

    plt.ion()
    plt.show()
if summary: with open(get_demultiplex_summary_filename(data_folder), 'w') as f: f.write('Call: python demultiplex.py --run ' + seq_run + ' --verbose ' + str(VERBOSE) + '\n') adapters_designed = get_adapters_designed(dataset, VERBOSE=VERBOSE, summary=summary) make_output_folders(data_folder, adapters_designed, VERBOSE=VERBOSE, summary=summary) data_filenames = get_raw_read_files(dataset) # Is it a dual index library? if '-' not in adapters_designed[0][0]: demultiplex_reads_single_index(data_folder, data_filenames, adapters_designed, maxreads=maxreads, VERBOSE=VERBOSE, summary=summary) else: demultiplex_reads_dual_index(data_folder, data_filenames, adapters_designed, maxreads=maxreads, VERBOSE=VERBOSE,
# Hand the whole job over to the cluster when asked to, then stop here
if submit:
    fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary)
    sys.exit()

# Look up the requested sequencing run and its data folder
dataset = MiSeq_runs[seq_run]
data_folder = dataset['folder']

# Start the summary file over ('w' mode) with a record of the invocation
if summary:
    with open(get_demultiplex_summary_filename(data_folder), 'w') as summary_file:
        summary_file.write(
            'Call: python demultiplex.py --run ' + seq_run +
            ' --verbose ' + str(VERBOSE) + '\n'
        )

# Collect the designed adapters, prepare the output folders, find the reads
adapters_designed = get_adapters_designed(dataset,
                                          VERBOSE=VERBOSE,
                                          summary=summary)
make_output_folders(data_folder,
                    adapters_designed,
                    VERBOSE=VERBOSE,
                    summary=summary)
data_filenames = get_raw_read_files(dataset)

# A '-' in the first designed adapter entry selects the dual index routine,
# anything else is demultiplexed as a single index library
if '-' in adapters_designed[0][0]:
    demultiplex_reads_dual_index(data_folder,
                                 data_filenames,
                                 adapters_designed,
                                 maxreads=maxreads,
                                 VERBOSE=VERBOSE,
                                 summary=summary)
else:
    demultiplex_reads_single_index(data_folder,
                                   data_filenames,
                                   adapters_designed,
                                   maxreads=maxreads,
                                   VERBOSE=VERBOSE,
                                   summary=summary)