def fetchIntensityInGWAWithinRBDictGivenArrayIDFromTilingIntensity(tilingIntensityData, array_id, rbDict, gwr_name=None,
        min_reciprocal_overlap=0.6):
    """
    2010-3-18
        Collect the intensities of one array for every probe whose segment key is found in rbDict.

        tilingIntensityData is of type SNPData (per the original note). The attributes read here are
            col_id2col_index (array id -> column index), row_id_ls (one (chr, pos) pair per row)
            and data_matrix (indexed as [row][column]).
        array_id selects the column of data_matrix to read.
        rbDict is only used for membership tests ("key in rbDict") against a
            CNVSegmentBinarySearchTreeKey built from each probe's chromosome and position.
        gwr_name becomes the name of the returned GenomeWideResult and is copied onto every data object.
        min_reciprocal_overlap is handed to CNVSegmentBinarySearchTreeKey unchanged.

        Returns the populated GenomeWideResult, or None if array_id has no column in tilingIntensityData.
    """
    sys.stderr.write("Getting intensity data within the chosen segments for array %s ..."%array_id)
    array_col_index = tilingIntensityData.col_id2col_index.get(array_id)
    if array_col_index is None:
        sys.stderr.write("Error: No tiling intensity.\n")
        return None

    # imported only once we know there is something to fetch
    from SNP import GenomeWideResult, DataObject
    gwr = GenomeWideResult(name=gwr_name)
    gwr.array_id = array_id    # custom attribute attached to the result
    gwr_identity = id(gwr)

    for row_index, chr_pos in enumerate(tilingIntensityData.row_id_ls):
        chromosome, position = [int(field) for field in chr_pos]
        probe_key = CNVSegmentBinarySearchTreeKey(chromosome=chromosome, span_ls=[position],
            min_reciprocal_overlap=min_reciprocal_overlap)
        if probe_key not in rbDict:
            continue    # probe lies outside every chosen segment

        intensity = tilingIntensityData.data_matrix[row_index][array_col_index]
        data_obj = DataObject(chromosome=chromosome, position=position, value=intensity)
        data_obj.comment = ''
        data_obj.genome_wide_result_name = gwr_name
        data_obj.genome_wide_result_id = gwr_identity
        gwr.add_one_data_obj(data_obj)

    sys.stderr.write(" %s probes. Done.\n"%(len(gwr.data_obj_ls)))
    return gwr
def getCNVDataFromFileInGWA(input_fname_ls, array_id, max_amp=-0.33, min_amp=-0.33, min_size=50, min_no_of_probes=None,
        report=False):
    """
    2009-10-31
        get deletion (below max_amp) or duplication (above min_amp) from files (output by RunGADA.py)

        input_fname_ls: file name(s) handed to fileinput.input(). The first line read is taken as the
            tab-delimited header; any later line containing "array_id" is skipped as a repeated header.
            Columns read: array_id, start_probe, end_probe, length, amplitude, and optionally
            ecotype_id, start_probe_id, end_probe_id (each falls back to the column without the suffix,
            ecotype_id falls back to array_id).
        array_id: only rows whose integer array_id column equals this value are used.
        max_amp, min_amp: a segment is kept if amplitude<=max_amp or amplitude>=min_amp. With the two
            defaults being equal, every (non-NaN) amplitude passes.
        min_size: segments shorter than this (stop-start after the 12-unit padding below) are skipped;
            None disables the filter.
        min_no_of_probes: rows whose "length" column is below this are skipped; None disables the filter.
        report: if true, write running counters to stderr every 10000 data lines.

        Returns a GenomeWideResult holding one DataObject per retained segment. Its name is prefixed with
            the ecotype id of the first matching row, which is also stored as gwr.ecotype_id (None if no
            row matched). max_value is raised to at least 3 and min_value lowered to at most -1.
    """
    sys.stderr.write("Getting CNV calls for array %s, min_size %s, min_no_of_probes %s from %s ..."%\
        (array_id, min_size, min_no_of_probes, repr(input_fname_ls)))
    gwr_name = "(a-id %s)"%(array_id)
    gwr = GenomeWideResult(name=gwr_name)
    # Fresh containers for this result. Original note: "list and dictionary are crazy references" --
    # presumably guarding against mutable state shared between GenomeWideResult instances; confirm.
    gwr.data_obj_ls = []
    gwr.data_obj_id2index = {}
    genome_wide_result_id = id(gwr)

    counter = 0    # data lines read (all arrays)
    real_counter = 0    # segments that passed every filter
    no_of_segments = 0    # rows belonging to array_id
    ecotype_id = None

    input_handler = fileinput.input(input_fname_ls)
    try:
        header = input_handler.readline().strip().split('\t')
        col_name2index = getColName2IndexFromHeader(header)
        for line in input_handler:
            if line.find("array_id")!=-1:    # header line of a subsequent file
                continue
            row = line.strip().split('\t')
            cnv_array_id = int(row[col_name2index['array_id']])
            cnv_ecotype_id = int(row[col_name2index.get('ecotype_id', col_name2index['array_id'])])
            counter += 1
            if cnv_array_id==array_id:
                no_of_segments += 1
                if ecotype_id is None:
                    ecotype_id = cnv_ecotype_id
                # probe ids look like "chr_pos"; build real lists (not map objects) because they are
                # indexed below, which fails on Python 3's lazy map().
                start_probe = [int(field) for field in row[col_name2index['start_probe']].split('_')]
                start_probe_id = row[col_name2index.get('start_probe_id', col_name2index['start_probe'])]
                stop_probe = [int(field) for field in row[col_name2index['end_probe']].split('_')]
                end_probe_id = row[col_name2index.get('end_probe_id', col_name2index['end_probe'])]

                no_of_probes = int(row[col_name2index['length']])
                if min_no_of_probes is not None and no_of_probes<min_no_of_probes:
                    continue
                amplitude = float(row[col_name2index['amplitude']])

                segment_chromosome = start_probe[0]
                # NOTE(review): the 12 padding is presumably half a probe length -- confirm.
                segment_start_pos = start_probe[1]-12
                segment_stop_pos = stop_probe[1]+12
                segment_length = abs(segment_stop_pos-segment_start_pos)
                if min_size is not None and segment_length<min_size:
                    continue
                if amplitude<=max_amp or amplitude>=min_amp:
                    real_counter += 1
                    data_obj = DataObject(chromosome=segment_chromosome, position=segment_start_pos,
                        stop_position=segment_stop_pos, value=amplitude)
                    data_obj.comment = 'start probe-id %s, end probe-id %s, no of probes %s'%\
                        (start_probe_id, end_probe_id, no_of_probes)
                    data_obj.genome_wide_result_id = genome_wide_result_id
                    gwr.add_one_data_obj(data_obj)
            if report and counter%10000==0:
                # 80 backspaces so the counters overwrite the previous progress output
                sys.stderr.write('%s%s\t%s\t%s'%('\x08'*80, counter, no_of_segments, real_counter))
    finally:
        # fileinput keeps module-global state; always release it so a failure here cannot
        # make the next fileinput.input() call raise "input() already active".
        input_handler.close()
    sys.stderr.write("\n")

    if gwr.max_value<3:    # insertion at y=3
        gwr.max_value = 3
    if gwr.min_value>-1:    # deletion at y = -1
        gwr.min_value = -1
    gwr.name = '%s '%ecotype_id + gwr.name
    gwr.ecotype_id = ecotype_id
    sys.stderr.write(" %s segments. Done.\n"%(len(gwr.data_obj_ls)))
    return gwr