def _cf_call_widen(sample_data, limit, state, start, stop):
    """Widen one breakpoint pair to where the signal is back inside ``limit``.

    Args:
        sample_data: 1-D array of one sample's values along the chromosome.
        limit: 1-D array, aligned with ``sample_data``, holding the bound a
            value must be back inside to count as "not part of the call"
            (upper bound for ``"dup"``, lower bound for ``"del"``).
        state: ``"dup"`` or ``"del"``; selects the comparison direction.
        start: index of the first exon of the raw call.
        stop: index of the last exon of the raw call.

    Returns:
        ``(new_start, new_stop)`` exon indices of the widened call.
    """
    def inside(values, bounds):
        # Duplications are scanned for values below the upper bound,
        # deletions for values above the lower bound.
        if state == "dup":
            return values < bounds
        return values > bounds

    try:
        # Closest exon to the left of the call that is back inside the bound.
        new_start = np.max(
            np.where(inside(sample_data[:start], limit[:start])))
    except ValueError:
        # np.max raises ValueError on an empty selection, i.e. nothing to the
        # left is back inside the bound: extend to the first exon.
        new_start = 0
    try:
        # Closest exon at/after `stop` that is back inside the bound.
        new_stop = stop + np.min(
            np.where(inside(sample_data[stop:], limit[stop:])))
    except ValueError:
        # Nothing to the right is back inside the bound: extend to the last
        # exon.  The last index is taken from the vector being scanned
        # instead of `data.shape[1] - 1`.
        # NOTE(review): data.shape's axis order is not visible from here;
        # confirm this matches the intended "last exon" fallback.
        new_stop = len(sample_data) - 1
    return new_start, new_stop


def _cf_call_sample_calls(data, chrom, sample, sample_data, mask, limit,
                          state):
    """Build the merged calls of one kind for one sample on one chromosome.

    Args:
        data: per-chromosome container from ``rpkm_reader``; only its
            ``exons`` records (``"start"`` / ``"stop"`` fields) are read.
        chrom: chromosome identifier as passed to ``cf.chrInt2Str``.
        sample: sample identifier stored in each call.
        sample_data: 1-D array of this sample's values.
        mask: boolean array marking exons beyond the calling threshold.
        limit: bound array forwarded to ``_cf_call_widen``.
        state: ``"dup"`` or ``"del"``.

    Returns:
        A list of call dicts after ``cf.mergeCalls``.
    """
    calls = []
    for start, stop in cf.getbkpoints(mask):
        new_start, new_stop = _cf_call_widen(
            sample_data, limit, state, start, stop)
        calls.append({
            "sampleID": sample,
            "chromosome": cf.chrInt2Str(chrom),
            "start": data.exons[new_start]["start"],
            "stop": data.exons[new_stop]["stop"],
            "state": state,
        })
    # mergeCalls merges overlapping calls.
    return list(cf.mergeCalls(calls))


def CF_call(args):
    """Call duplications/deletions from a CoNIFER file and write them as TSV.

    For every chromosome node in the input HDF5 file (all root nodes except
    those titled ``probes`` and ``samples``) the values are smoothed, and for
    every sample the exons at or beyond ``+/- args.threshold`` are turned
    into calls.  Each call is widened until the sample is back within
    mean +/- 3 SD (computed across ``data.rpkm`` along axis 1), merged, then
    echoed to stdout and written to the output file.

    Args:
        args: parsed command-line namespace providing ``input`` (HDF5 path),
            ``output`` (TSV path) and ``threshold`` (numeric cut-off).

    Raises:
        SystemExit: always.  Status 1 if either file cannot be opened,
            status 0 after a successful run.
    """
    h5file_in_fn = str(args.input)
    try:
        h5file_in = openFile(h5file_in_fn, mode='r')
    except IOError:
        # Two spaces after the colon reproduce the historical message.
        print('[ERROR] Cannot open CoNIFER input file for reading:  %s'
              % h5file_in_fn)
        # A non-zero status lets calling scripts detect the failure.
        sys.exit(1)

    callfile_fn = str(args.output)
    try:
        callfile_f = open(callfile_fn, mode='w')
    except IOError:
        h5file_in.close()
        print('[ERROR] Cannot open output file for writing:  %s'
              % callfile_fn)
        sys.exit(1)

    # From here on the output file is closed on every exit path.
    with callfile_f:
        try:
            chrs_to_process = [
                node._v_title.replace("chr", "")
                for node in h5file_in.root
                if node._v_title not in ('probes', 'samples')
            ]
        finally:
            # The handle is only needed to list the chromosome nodes; the
            # data itself is read through cf.rpkm_reader below.
            h5file_in.close()

        threshold = args.threshold
        print('[INIT] Initializing caller at threshold = %f' % (threshold))

        r = cf.rpkm_reader(h5file_in_fn)
        all_calls = []

        for chrom in chrs_to_process:
            print('[RUNNING] Now processing chr%s' % chrom)
            data = r.getExonValuesByRegion(chrom)
            data.smooth()

            mean = np.mean(data.rpkm, axis=1)
            sd = np.std(data.rpkm, axis=1)
            # Bounds a widened call must return inside of.
            upper = mean + 3 * sd
            lower = -1 * mean - 3 * sd

            for sample in r.getSampleList():
                sample_data = data.getSample([sample]).flatten()
                dels = _cf_call_sample_calls(
                    data, chrom, sample, sample_data,
                    sample_data <= -1 * threshold, lower, "del")
                dups = _cf_call_sample_calls(
                    data, chrom, sample, sample_data,
                    sample_data >= threshold, upper, "dup")
                # Deletions are reported before duplications for each sample.
                all_calls.extend(dels)
                all_calls.extend(dups)

        header = ['sampleID', 'chromosome', 'start', 'stop', 'state']
        callfile_f.write('\t'.join(header) + "\n")
        for call in all_calls:
            line = "%s\t%s\t%d\t%d\t%s" % (
                call["sampleID"], call["chromosome"], call["start"],
                call["stop"], call["state"])
            print(line)
            callfile_f.write(line + "\n")

    sys.exit(0)
def CF_call(args):
    """Call duplications/deletions from a CoNIFER file and write them as TSV.

    Root nodes of the input HDF5 file other than ``probes`` and ``samples``
    are treated as chromosomes.  Per chromosome the values are smoothed; per
    sample, exons at or beyond ``+/- args.threshold`` become raw calls.  Raw
    calls are widened until the sample is back within mean +/- 3 SD (taken
    over ``data.rpkm`` along axis 1), merged with ``cf.mergeCalls``, echoed
    to stdout and written to the tab-separated output file.

    Args:
        args: parsed command-line namespace providing ``input`` (HDF5 path),
            ``output`` (TSV path) and ``threshold`` (numeric cut-off).

    Raises:
        SystemExit: always.  Status 1 if either file cannot be opened,
            status 0 after a successful run.
    """
    def widen(sample_data, limit, state, start, stop):
        # Grow [start, stop] outwards to the nearest exons whose value is
        # back inside `limit` (below it for dups, above it for dels).
        def inside(values, bounds):
            if state == "dup":
                return values < bounds
            return values > bounds

        try:
            new_start = np.max(
                np.where(inside(sample_data[:start], limit[:start])))
        except ValueError:
            # Empty selection: nothing to the left is back inside the bound.
            new_start = 0
        try:
            new_stop = stop + np.min(
                np.where(inside(sample_data[stop:], limit[stop:])))
        except ValueError:
            # Empty selection: fall back to the last exon.  The index comes
            # from the scanned vector instead of `data.shape[1] - 1`.
            # NOTE(review): data.shape's axis order is not visible from
            # here; confirm this matches the intended fallback.
            new_stop = len(sample_data) - 1
        return new_start, new_stop

    def calls_for(data, chrom, sample, sample_data, mask, limit, state):
        # One call dict per breakpoint pair, then merge overlapping calls.
        found = []
        for start, stop in cf.getbkpoints(mask):
            new_start, new_stop = widen(
                sample_data, limit, state, start, stop)
            found.append({
                "sampleID": sample,
                "chromosome": cf.chrInt2Str(chrom),
                "start": data.exons[new_start]["start"],
                "stop": data.exons[new_stop]["stop"],
                "state": state,
            })
        return list(cf.mergeCalls(found))

    h5file_in_fn = str(args.input)
    try:
        h5file_in = openFile(h5file_in_fn, mode='r')
    except IOError:
        # Two spaces after the colon reproduce the historical message.
        print('[ERROR] Cannot open CoNIFER input file for reading:  %s'
              % h5file_in_fn)
        # Report the failure through the exit status.
        sys.exit(1)

    callfile_fn = str(args.output)
    try:
        callfile_f = open(callfile_fn, mode='w')
    except IOError:
        h5file_in.close()
        print('[ERROR] Cannot open output file for writing:  %s'
              % callfile_fn)
        sys.exit(1)

    try:
        try:
            chrs_to_process = []
            for node in h5file_in.root:
                title = node._v_title
                if title not in ('probes', 'samples'):
                    chrs_to_process.append(title.replace("chr", ""))
        finally:
            # Only needed for listing chromosomes; cf.rpkm_reader reopens
            # the file by name below.
            h5file_in.close()

        threshold = args.threshold
        print('[INIT] Initializing caller at threshold = %f' % (threshold))

        r = cf.rpkm_reader(h5file_in_fn)
        all_calls = []

        for chrom in chrs_to_process:
            print('[RUNNING] Now processing chr%s' % chrom)
            data = r.getExonValuesByRegion(chrom)
            data.smooth()

            mean = np.mean(data.rpkm, axis=1)
            sd = np.std(data.rpkm, axis=1)
            dup_limit = mean + 3 * sd
            del_limit = -1 * mean - 3 * sd

            for sample in r.getSampleList():
                sample_data = data.getSample([sample]).flatten()
                dup_mask = sample_data >= threshold
                del_mask = sample_data <= -1 * threshold

                # Deletions first, then duplications, for each sample.
                all_calls.extend(calls_for(
                    data, chrom, sample, sample_data,
                    del_mask, del_limit, "del"))
                all_calls.extend(calls_for(
                    data, chrom, sample, sample_data,
                    dup_mask, dup_limit, "dup"))

        header = ['sampleID', 'chromosome', 'start', 'stop', 'state']
        callfile_f.write('\t'.join(header) + "\n")
        for call in all_calls:
            line = "%s\t%s\t%d\t%d\t%s" % (
                call["sampleID"], call["chromosome"], call["start"],
                call["stop"], call["state"])
            print(line)
            callfile_f.write(line + "\n")
    finally:
        # Close (and flush) the output file on every exit path.
        callfile_f.close()

    sys.exit(0)