import csv
import os
import sys
from operator import itemgetter

import numpy
import pysam
from Bio import SeqIO

# project-local modules used below
import sys_ops
import clustering_ops
import post_align_ops_v204

# SAMPLE_CLUSTER_READS_THRESHOLD, write_cluster_output, ALIGN_NN_THRESHOLD,
# and OVERLAP_THRESHOLD are expected as module-level configuration.


def precompile_clusters_sam_nobcs(rawdata_file, write_output):
    rawdata = []
    try:
        with open(rawdata_file, 'rb') as csvfile:
            print 'opened csv'
            csvreader = csv.reader(csvfile, delimiter=',')
            rawdata = [row for row in csvreader]
    except IOError:
        sys_ops.throw_exception("Could not open file " + rawdata_file)
        sys.exit(1)
    rawdata_sort = sorted(rawdata)
    chrids = map(itemgetter(0), rawdata_sort)
    # record the first and last sorted index of each chromosome id
    ct = 0
    unique_chrids = {}
    for chrid in chrids:
        if chrid not in unique_chrids:
            unique_chrids[chrid] = [ct, ct]
        else:
            unique_chrids[chrid][1] = ct
        ct += 1
    if write_output:
        print 'writing clusters'
        # write one csv per chromosome id, spanning its full (inclusive) index range
        for chrid in unique_chrids:
            out_file = rawdata_file.replace('.csv', '_' + str(chrid) + '.csv')
            with open(out_file, 'w') as csvfile:
                mywriter = csv.writer(csvfile, delimiter=',')
                bounds = unique_chrids[chrid]
                for row in rawdata_sort[bounds[0]:bounds[1] + 1]:
                    mywriter.writerow(row + [ct])
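# Minimal usage sketch (hypothetical path): split a merged cluster csv into
# one file per chromosome id, e.g. all_clusters.csv -> all_clusters_<chrid>.csv,
# ready for parallel per-chromosome clustering jobs.
def example_precompile():
    precompile_clusters_sam_nobcs('all_clusters.csv', True)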
def fastq_get_unique_bcs_inds(bc_fastqfile, BC_READS_THRESHOLD):
    try:
        print "fastq_get_unique_bcs, opening " + bc_fastqfile
        bc_handle = open(bc_fastqfile, "rU")
    except IOError:
        sys_ops.throw_exception("Could not find file " + bc_fastqfile)
        return
    # read all unique bcs into a dict keyed by sequence hash and record the
    # read indices at which each bc occurs
    ct = 0
    BCSEQUNIQUE = {}
    for record in SeqIO.parse(bc_handle, "fastq"):
        bc = str(record.seq)
        if hash(bc) not in BCSEQUNIQUE:
            BCSEQUNIQUE[hash(bc)] = [[], bc]
        BCSEQUNIQUE[hash(bc)][0].append(ct)
        ct += 1
    bc_handle.close()
    # keep only barcodes observed in at least BC_READS_THRESHOLD reads
    BCSEQ = []
    for bchash in BCSEQUNIQUE:
        bc = BCSEQUNIQUE[bchash]
        if len(bc[0]) >= BC_READS_THRESHOLD:
            BCSEQ.append(bc)
    print "number of reads containing bc for clustering:"
    print ct
    return BCSEQ
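# Minimal usage sketch (hypothetical file name and threshold): each entry
# returned by fastq_get_unique_bcs_inds is [read_indices, bc], where
# read_indices lists the read positions at which that barcode was observed.
def example_count_unique_bcs():
    BCSEQ = fastq_get_unique_bcs_inds('sample_bcs.fastq', 2)
    for read_indices, bc in BCSEQ:
        print bc, len(read_indices)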
def write_sams_location_orientation(sam_filename, sampid, outfile_clust_ts, outfile_clust_bs, CLUSTER_READS_THRESHOLD, write_output):
    print "reading input files"
    try:
        samfile = pysam.Samfile(sam_filename)
    except IOError:
        sys_ops.throw_exception("Could not find file " + sam_filename)
        return
    with open(outfile_clust_bs, 'w') as csvfile_bs:
        mywriter_bs = csv.writer(csvfile_bs, delimiter=',')
        with open(outfile_clust_ts, 'w') as csvfile_ts:
            mywriter_ts = csv.writer(csvfile_ts, delimiter=',')
            for read in samfile.fetch():
                # check that the read is mapped (mapq>0), then check read orientation
                if read.mapq > 0:
                    if read.is_reverse:
                        # using len(read.seq) is not entirely accurate (should make a
                        # function to parse the cigar string and output mapping length)
                        mywriter_bs.writerow([read.rname, (read.aend - 1), (read.aend - 1), (read.aend - 1), 1, 1, 0, 0, 1, hash(sampid)])
                    else:
                        mywriter_ts.writerow([read.rname, read.pos, read.pos, read.pos, 1, 1, 0, 0, 0, hash(sampid)])
    samfile.close()
    return 1
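# The rows written above feed compile_clusters_sam_nobcs below. Column layout
# (as written by write_sams_location_orientation, one row per mapped read):
#   0: reference id (read.rname)     5: unique-location count (1 per read)
#   1: position lower bound          6: unused (0)
#   2: position upper bound          7: unused (0)
#   3: position median               8: orientation (0 = forward, 1 = reverse)
#   4: read count (1 per read)       9: hash of the sample id
# precompile_clusters_sam_nobcs later appends a count that the compile step
# reads back as column 10 (repreadct).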
def compile_clusters_sam_nobcs_job(rawdata_filename):
    # assemble output for parallel clustering jobs
    outfile = rawdata_filename.replace('.csv', 'c.csv')
    finished = post_align_ops_v204.compile_clusters_sam_nobcs(
        rawdata_filename, outfile, SAMPLE_CLUSTER_READS_THRESHOLD,
        write_cluster_output)
    if not os.path.isfile(outfile):
        sys_ops.throw_exception("Failed at compile_clusters_sam on processing " + outfile)
    return
def fastq_get_seq_dict(fastqfile):
    try:
        fq_handle = open(fastqfile, "rU")
    except IOError:
        sys_ops.throw_exception("Could not find file " + fastqfile)
        return
    # map hashed read names to their sequences
    READS = {}
    for record in SeqIO.parse(fq_handle, "fastq"):
        READS[hash(record.name)] = str(record.seq)
    fq_handle.close()
    return READS
def run_sample_write_sam_location_orientation(sam_filename):
    # cluster sams, then ibcs for sam clusts, then lbcs for ibc clusts
    outfile_ts = sam_filename.replace('.sam', '_sam_clusters_ts.csv')
    outfile_bs = sam_filename.replace('.sam', '_sam_clusters_bs.csv')
    ############################################################
    sampid = sam_filename.split('.')[0]
    print sampid
    ############################################################
    finished = post_align_ops_v204.write_sams_location_orientation(
        sam_filename, sampid, outfile_ts, outfile_bs,
        SAMPLE_CLUSTER_READS_THRESHOLD, write_cluster_output)
    if not (os.path.isfile(outfile_ts) and os.path.isfile(outfile_bs)):
        sys_ops.throw_exception("Failed at cluster_sams on processing " + sam_filename)
    return
def cluster_bcs(BCSEQ, BC_THRESHOLD, BC_READS_THRESHOLD, BC_EXACT_MATCH, UMI_CLUSTER_METHOD):
    # aggregate all NSWMT sequences by sequence, keeping just those that
    # cluster in UMI space
    if 'explicit' in UMI_CLUSTER_METHOD:
        BCSEQC = threshold_cluster_uid_explicit(BCSEQ, BC_THRESHOLD)
    elif 'prelinked' in UMI_CLUSTER_METHOD:
        TEMP = threshold_cluster_uid_prelinked_setup(BCSEQ, BC_THRESHOLD)
        BCSEQC = threshold_cluster_uid_prelinked(TEMP, BC_THRESHOLD)
    else:
        sys_ops.throw_exception("Options for cluster_bcs must be either 'explicit' or 'prelinked'. Exiting...")
        return
    #print 'number of unique BC clusters before filtering:'
    #print len(BCSEQC)
    # filter out all clusters whose total read count is below the read threshold
    BCSEQC2 = []
    for bcs in BCSEQC:
        # if there is more than 1 barcode in the cluster, or more than 1 read in a
        # single-bc cluster, the cluster is not junk
        if BC_EXACT_MATCH == 1:
            bcs2 = [bcs[0]]
        else:
            bcs2 = bcs
        ct = 0
        for bc in bcs2:
            ct = ct + bc[1]
            if ct >= BC_READS_THRESHOLD:
                BCSEQC2.append(bcs2)
                break
    #print 'number of unique BC clusters after filtering:'
    #print len(BCSEQC2)
    return BCSEQC2
def get_params(readparam_fileName):
    readparam_file = []
    try:
        with open(readparam_fileName, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            readparam_file = [row for row in csvreader]
    except IOError:
        sys_ops.throw_exception("Could not open read-params " + readparam_fileName)
        sys.exit(1)

    def clean_row(row):
        # truncate at the first empty cell and coerce entries to str
        if '' in row:
            row = row[0:row.index('')]
        return [str(el) for el in row]

    f_barcodes = clean_row(readparam_file[0])
    r_barcodes = clean_row(readparam_file[1])
    groups = clean_row(readparam_file[2])
    return [f_barcodes, r_barcodes, groups]
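# Sketch of the read-params csv layout get_params expects (sequences are
# hypothetical): row 0 holds the forward barcodes, row 1 the reverse
# barcodes, row 2 the group labels; trailing empty cells are dropped.
#
#   ACGT,TTAG,GCCA,,
#   TGCA,AATC,,,
#   groupA,groupB,groupC,,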
def generate_sample_jobfile(param_filename, process_filename, jobfile_filename):
    [f_barcodes, r_barcodes, groups] = get_params(param_filename)
    process_file = []
    try:
        with open(process_filename, 'rb') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',')
            process_file = [row for row in csvreader]
    except IOError:
        sys_ops.throw_exception("Could not open process-list " + process_filename)
        sys.exit(1)
    job_list = []
    for row in process_file:
        print row
        # columns in the process-list must be, in order: forward barcode, reverse barcode
        f_barcode_checkList = [f_barcode in str(row[0]) for f_barcode in f_barcodes]
        r_barcode_checkList = [r_barcode in str(row[1]) for r_barcode in r_barcodes]
        # NOTE: the reader assumes the stored sequences in get_sequences() must exist
        # in their entirety as substrings of the sequences in the process-list
        if (True in f_barcode_checkList) and (True in r_barcode_checkList):
            print row
            # include 0 as the first element to indicate that the currently-written
            # job has NOT been completed
            if (len(row[0]) == 0) and (len(row[1]) == 0):
                job_list.append([str(0), str(-1) + '_' + str(-1)])
            elif len(row[0]) == 0:
                job_list.append([str(0), str(-1) + '_' + str(r_barcode_checkList.index(True))])
            elif len(row[1]) == 0:
                job_list.append([str(0), str(f_barcode_checkList.index(True)) + '_' + str(-1)])
            else:
                job_list.append([str(0), str(f_barcode_checkList.index(True)) + '_' + str(r_barcode_checkList.index(True))])
        elif (True in f_barcode_checkList) or (True in r_barcode_checkList):
            sys_ops.throw_exception("Process list row " + str(row) + " contains a mixture of identifiable and unidentifiable sequences. Exiting.")
            sys.exit(1)
    print job_list
    with open(jobfile_filename, 'w') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        for job in job_list:
            mywriter.writerow(job)
    return
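# Sketch of the jobfile rows generate_sample_jobfile emits (hypothetical
# barcodes): for a process-list row whose columns contain the 2nd forward
# barcode and the 1st reverse barcode, the job row is ['0', '1_0'] -- the
# indices of the matched barcodes joined by '_', with -1 standing in for a
# missing barcode, and the leading '0' marking the job as not yet completed.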
def compile_clusters_sam_nobcs(rawdata_file, outfile_clust, CLUSTER_READS_THRESHOLD, write_output):
    rawdata = []
    try:
        with open(rawdata_file, 'rb') as csvfile:
            print 'opened csv'
            csvreader = csv.reader(csvfile, delimiter=',')
            rawdata = [row for row in csvreader]
    except IOError:
        sys_ops.throw_exception("Could not open file " + rawdata_file)
        sys.exit(1)
    # unpack the columns written by write_sams_location_orientation, plus the
    # count appended by precompile_clusters_sam_nobcs (column 10)
    rawdata_chrid = map(itemgetter(0), rawdata)
    rawdata_chrlb = map(itemgetter(1), rawdata)
    rawdata_chrub = map(itemgetter(2), rawdata)
    rawdata_chrmed = map(itemgetter(3), rawdata)
    rawdata_readct = map(itemgetter(4), rawdata)
    rawdata_uniquect = map(itemgetter(5), rawdata)
    rawdata_empty1 = map(itemgetter(6), rawdata)
    rawdata_empty2 = map(itemgetter(7), rawdata)
    rawdata_orientid = map(itemgetter(8), rawdata)
    rawdata_sampid = map(itemgetter(9), rawdata)
    rawdata_repreadct = map(itemgetter(10), rawdata)
    ct = 0
    repreadcts = {}
    repreadct0 = 0
    sam_alignments = []
    for el in rawdata_chrid:
        repreadct = rawdata_repreadct[ct]
        if str(repreadct) not in repreadcts:
            repreadcts[str(repreadct)] = 1
            repreadct0 += int(rawdata_repreadct[ct])
        sam_alignments.append([int(rawdata_chrid[ct]), int(rawdata_chrlb[ct]), int(rawdata_chrub[ct]), ct])
        ct += 1
    # cluster all reads for an individual ibc by position
    clusters_loc = clustering_ops.nncluster_chr_positions(sam_alignments, 'cbb', CLUSTER_READS_THRESHOLD, ALIGN_NN_THRESHOLD)
    name_clusters = clusters_loc[0]
    alignment_clusters = clusters_loc[1]
    ct = 0
    # find lbcs corresponding to names in name_clusters
    with open(outfile_clust, 'w') as csvfile:
        mywriter = csv.writer(csvfile, delimiter=',')
        for name_cluster in name_clusters:
            chrid = alignment_clusters[ct][0][0]
            chrlb = min([int(rawdata_chrlb[c]) for c in name_cluster])
            chrub = max([int(rawdata_chrub[c]) for c in name_cluster])
            chrmed = numpy.median(numpy.array([float(rawdata_chrmed[c]) for c in name_cluster]))
            chrlocs = [str([rawdata_chrlb[c], rawdata_chrub[c], rawdata_chrmed[c]]) for c in name_cluster]
            readct = len(name_cluster)
            uniquect = {}
            for loc in chrlocs:
                if hash(str(loc)) not in uniquect:
                    uniquect[hash(str(loc))] = loc
            reps = {}
            repstr = ''
            locs_ts = []
            locs_bs = []
            for c in name_cluster:
                sampid = int(rawdata_sampid[c])
                if sampid not in reps:
                    reps[sampid] = 1
                    repstr += (':' + str(sampid))
                if int(rawdata_orientid[c]) == 0:
                    locs_ts.append(int(rawdata_chrlb[c]))
                elif int(rawdata_orientid[c]) == 1:
                    locs_bs.append(int(rawdata_chrlb[c]))
            # remove bias due to PCR amplification: take only unique mappings
            locs_tsu = numpy.unique(numpy.array(locs_ts)).tolist()
            locs_bsu = numpy.unique(numpy.array(locs_bs)).tolist()
            if len(locs_ts) > 0 and len(locs_bs) > 0:
                # pairwise differences between top- and bottom-strand positions
                locs_ts2 = numpy.matrix([locs_ts] * len(locs_bs))
                locs_bs2 = numpy.matrix([locs_bs] * len(locs_ts))
                locs_diff = locs_ts2 - numpy.transpose(locs_bs2)
                locs_g0 = float((locs_diff >= OVERLAP_THRESHOLD).sum()) / float(locs_diff.shape[0] * locs_diff.shape[1])
                locs_l0 = float((locs_diff < OVERLAP_THRESHOLD).sum()) / float(locs_diff.shape[0] * locs_diff.shape[1])
                locs_tsu2 = numpy.matrix([locs_tsu] * len(locs_bsu))
                locs_bsu2 = numpy.matrix([locs_bsu] * len(locs_tsu))
                locs_diffu = locs_tsu2 - numpy.transpose(locs_bsu2)
                locs_g0u = float((locs_diffu >= OVERLAP_THRESHOLD).sum()) / float(locs_diffu.shape[0] * locs_diffu.shape[1])
                locs_l0u = float((locs_diffu < OVERLAP_THRESHOLD).sum()) / float(locs_diffu.shape[0] * locs_diffu.shape[1])
                with open(str.replace(outfile_clust, '.csv', '') + '_' + str(chrlb) + '_' + str(chrub) + '.csv', 'w') as csvfilec:
                    mywriterc = csv.writer(csvfilec, delimiter=',')
                    for row in locs_diff:
                        mywriterc.writerow(row.tolist()[0])
            else:
                locs_g0 = -1
                locs_l0 = -1
                locs_g0u = -1
                locs_l0u = -1
            orientid = rawdata_orientid[name_cluster[0]]
            if locs_g0u != -1:
                # if one is -1 (no paired orientations), all the others are -1 as well
                mywriter.writerow([chrid, chrlb, chrub, chrmed, readct, len(uniquect), 0, 0, len(reps), repreadct0, 0, 0, locs_g0u, locs_l0u])
            ct += 1
    return 1