def main(): """Parse command line args, and call appropriate functions.""" usage = """\nusage: %prog [options]\n""" parser = optparse.OptionParser(usage=usage) #Other option types are int and float, string is default. #Note there is also a default parameter. parser.add_option('-d', '--dir', dest="hmm_fit_dir", type="string") parser.add_option('-o', '--out', dest="out_path", type="string") parser.add_option('-t', '--thresh', dest="pnathresh", type="float", default=.03) opts, args = parser.parse_args( ) #Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax. if not opts.hmm_fit_dir and opts.out_path: parser.error( "A directory for locating hmm_fit data and output file path is required." ) print "Starting hmmprob_to_est.py with parameters:", str(opts) print "Free memory is %s MB" % get_free_memory() all_files = grab_files(opts.hmm_fit_dir) print "Found %s files" % len(all_files) d_ests = transform(all_files, opts.pnathresh) print "Free memory now is %s MB" % get_free_memory() write_csv(d_ests, opts.out_path)
def write_csv(d_ests, out_path): """ Takes a (filtered) dict (See transform function for explanation of what d_ests is and an example of what it could contain.) Writes it out into a CSV file, putting everything together in one matrix. """ #Write to CSV outfile = open(out_path, 'wb') outcsv = csv.writer(outfile) #Set up data to fill in, and be written to file #Header rows: header_row, chrom_row, gen_map_pos_row = ['individual'],[''],[''] #seed with ind names (Make first column of each data row hold the ind name.) csv_data = [] for d_inds in d_ests.values(): for ind_name in d_inds: csv_data.append((ind_name,)) #Sort and make sure there are no duplicates csv_data = sorted(list(set(csv_data))) #change rows from tuples to lists csv_data = [list(row) for row in csv_data] #print csv_data #build an index of our csv_data so we can quickly put an indivudals data in the right place r = row_by_ind_name = {} for i,row in enumerate(csv_data): r[row[0]] = i #fill in data for chrom, d_inds in d_ests.items(): #Make a sorted list of all positions in this chromosome all_positions = set() for d_ests in d_inds.values(): all_positions |= set(d_ests.keys()) all_positions = sorted(list(all_positions)) if all_positions: #only include chroms with data #Update header rows with these positions / chomosomes header_row += ['%s-%s' % (chrom, v) for v in all_positions] chrom_row += ([chrom] * len(all_positions)) gen_map_pos_row += [i+1 for i in range(len(all_positions))] #Store actual data to be written for ind_name, ests_by_pos in d_inds.items(): outrow = csv_data[r[ind_name]] for pos in all_positions: outrow.append(ests_by_pos.get(pos,'-')) outcsv.writerow(header_row) outcsv.writerow(chrom_row) outcsv.writerow(gen_map_pos_row) outcsv.writerows(csv_data) print "Free memory after writing CSV is %s MB" % get_free_memory() outfile.close()
def main(): """Parse command line args, and call appropriate functions.""" usage="""\nusage: %prog [options]\n""" parser = optparse.OptionParser(usage=usage) #Other option types are int and float, string is default. #Note there is also a default parameter. parser.add_option('-d','--dir',dest="hmm_fit_dir",type="string") parser.add_option('-o','--out',dest="out_path",type="string") parser.add_option('-t','--thresh',dest="pnathresh",type="float",default=.03) opts,args=parser.parse_args() #Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax. if not opts.hmm_fit_dir and opts.out_path: parser.error("A directory for locating hmm_fit data and output file path is required.") print "Starting hmmprob_to_est.py with parameters:", str(opts) print "Free memory is %s MB" % get_free_memory() all_files = grab_files(opts.hmm_fit_dir) print "Found %s files" % len(all_files) d_ests = transform(all_files, opts.pnathresh) print "Free memory now is %s MB" % get_free_memory() write_csv(d_ests, opts.out_path)
def write_csv(d_ests, out_path): """ Takes a (filtered) dict (See transform function for explanation of what d_ests is and an example of what it could contain.) Writes it out into a CSV file, putting everything together in one matrix. """ #Write to CSV outfile = open(out_path, 'wb') outcsv = csv.writer(outfile) #Set up data to fill in, and be written to file #Header rows: header_row, chrom_row, gen_map_pos_row = ['individual'], [''], [''] #seed with ind names (Make first column of each data row hold the ind name.) csv_data = [] for d_inds in d_ests.values(): for ind_name in d_inds: csv_data.append((ind_name, )) #Sort and make sure there are no duplicates csv_data = sorted(list(set(csv_data))) #change rows from tuples to lists csv_data = [list(row) for row in csv_data] #print csv_data #build an index of our csv_data so we can quickly put an indivudals data in the right place r = row_by_ind_name = {} for i, row in enumerate(csv_data): r[row[0]] = i #fill in data for chrom, d_inds in d_ests.items(): #Make a sorted list of all positions in this chromosome all_positions = set() for d_ests in d_inds.values(): all_positions |= set(d_ests.keys()) all_positions = sorted(list(all_positions)) if all_positions: #only include chroms with data #Update header rows with these positions / chomosomes header_row += ['%s-%s' % (chrom, v) for v in all_positions] chrom_row += ([chrom] * len(all_positions)) gen_map_pos_row += [i + 1 for i in range(len(all_positions))] #Store actual data to be written for ind_name, ests_by_pos in d_inds.items(): outrow = csv_data[r[ind_name]] for pos in all_positions: outrow.append(ests_by_pos.get(pos, '-')) outcsv.writerow(header_row) outcsv.writerow(chrom_row) outcsv.writerow(gen_map_pos_row) outcsv.writerows(csv_data) print "Free memory after writing CSV is %s MB" % get_free_memory() outfile.close()
def transform(file_list, pnathresh): """ Groups position ests by individual and by chromosome and filters out positions with less than pnathresh % coverage. """ #d_ests stores estimates by position by individual by chromosome. #example: # {'2R': {'indivA12_AATAAG': {'1000992': '1', # '10065531': '3', # '9987712': '1'}, # 'indivE12_GTATCG': {'10002269': '3', # '10022498': '3', # '10079005': '3'}, # }, # '3R': ... # } d_ests = {} chrom_pos_count = {} #count of individuals with a given (chrom,position) #Fill up data structure from all files for path in file_list: ind_name, chrom = parse_path(path) if not chrom in d_ests: d_ests[chrom] = {} if not ind_name in d_ests[chrom]: d_ests[chrom][ind_name] = {} csv_reader = csv.reader(open(path, 'rb')) csv_reader.next() #skip header row for row in csv_reader: pos, count, est = row[COL_POS], row[COL_COUNT], row[COL_EST] d_ests[chrom][ind_name][pos] = est chrom_pos_count[(chrom, pos)] = chrom_pos_count.get( (chrom, pos), 0) + 1 print "(mid transform function) Free memory now is %s MB" % get_free_memory( ) #Remove positions with less individuals than pna thresh % #(example: If pna thresh is .1, that means for a given chromosome location #we'd throw out the whole position if it exists for less than 10% of individuals) num_inds = max([len(d_inds) for d_inds in d_ests.values()]) print "There are %s individuals" % num_inds count_thresh = int(round(pnathresh * num_inds)) print "Will throw out chrom/positions with less than %s individuals." % count_thresh print "(that's int(round(pna_thresh %s * %s individuals)) = %s )" % ( pnathresh, num_inds, count_thresh) for chrom, d_inds in d_ests.items(): for ind_name, ests_by_pos in d_inds.items(): for pos in ests_by_pos.keys(): if chrom_pos_count[(chrom, pos)] < count_thresh: del d_ests[chrom][ind_name][pos] return d_ests
def transform(file_list, pnathresh): """ Groups position ests by individual and by chromosome and filters out positions with less than pnathresh % coverage. """ #d_ests stores estimates by position by individual by chromosome. #example: # {'2R': {'indivA12_AATAAG': {'1000992': '1', # '10065531': '3', # '9987712': '1'}, # 'indivE12_GTATCG': {'10002269': '3', # '10022498': '3', # '10079005': '3'}, # }, # '3R': ... # } d_ests = {} chrom_pos_count = {} #count of individuals with a given (chrom,position) #Fill up data structure from all files for path in file_list: ind_name, chrom = parse_path(path) if not chrom in d_ests: d_ests[chrom] = {} if not ind_name in d_ests[chrom]: d_ests[chrom][ind_name] = {} csv_reader = csv.reader(open(path, 'rb')) csv_reader.next() #skip header row for row in csv_reader: pos, count, est = row[COL_POS], row[COL_COUNT], row[COL_EST] d_ests[chrom][ind_name][pos] = est chrom_pos_count[(chrom,pos)] = chrom_pos_count.get((chrom,pos),0) + 1 print "(mid transform function) Free memory now is %s MB" % get_free_memory() #Remove positions with less individuals than pna thresh % #(example: If pna thresh is .1, that means for a given chromosome location #we'd throw out the whole position if it exists for less than 10% of individuals) num_inds = max([len(d_inds) for d_inds in d_ests.values()]) print "There are %s individuals" % num_inds count_thresh = int(round(pnathresh * num_inds)) print "Will throw out chrom/positions with less than %s individuals." % count_thresh print "(that's int(round(pna_thresh %s * %s individuals)) = %s )" % (pnathresh, num_inds, count_thresh) for chrom, d_inds in d_ests.items(): for ind_name, ests_by_pos in d_inds.items(): for pos in ests_by_pos.keys(): if chrom_pos_count[(chrom,pos)] < count_thresh: del d_ests[chrom][ind_name][pos] return d_ests
def main(): """Parse command line args, and call appropriate functions.""" # disable garbage collection for a 10% speed boost gc.disable() usage = """\nusage: %prog [options]\n""" parser = optparse.OptionParser(usage=usage) # Other option types are int and float, string is default. # Note there is also a default parameter. parser.add_option("-d", "--dir", dest="hmm_fit_dir", type="string") # ?? Need these ?? -c $params{'chroms'} -p $params{'chroms2plot'} -d hmm_fit -t $params{'thinfac'} -f $params{'difffac'} -b $params{'barcodes'} -n $params{'pnathresh'} # parser.add_option('-o','--out',dest="out_path",type="string") # parser.add_option('-t','--thresh',dest="pnathresh",type="float",default=.03) opts, args = parser.parse_args() # Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax. if not opts.hmm_fit_dir: parser.error("A directory for locating hmm_fit data is required.") print "Starting combine.py with parameters:", str(opts) print "Free memory is %s MB" % get_free_memory() merge(opts.hmm_fit_dir)
def main(): """Parse command line args, and call appropriate functions.""" #disable garbage collection for a 10% speed boost gc.disable() usage = """\nusage: %prog [options]\n""" parser = optparse.OptionParser(usage=usage) #Other option types are int and float, string is default. #Note there is also a default parameter. parser.add_option('-d', '--dir', dest="hmm_fit_dir", type="string") #?? Need these ?? -c $params{'chroms'} -p $params{'chroms2plot'} -d hmm_fit -t $params{'thinfac'} -f $params{'difffac'} -b $params{'barcodes'} -n $params{'pnathresh'} #parser.add_option('-o','--out',dest="out_path",type="string") #parser.add_option('-t','--thresh',dest="pnathresh",type="float",default=.03) opts, args = parser.parse_args( ) #Args taken from sys.argv[1:] by default, parsed using GNU/POSIX syntax. if not opts.hmm_fit_dir: parser.error("A directory for locating hmm_fit data is required.") print "Starting combine.py with parameters:", str(opts) print "Free memory is %s MB" % get_free_memory() merge(opts.hmm_fit_dir)
def merge(dir):  # NOTE(review): `dir` shadows the builtin; kept for interface compatibility
    """
    Combine all individuals and datapoints with one row per individual, with
    columns being chrom:position.  Interpolate missing values in some cases.
    (The R code that we're trying to replicate was funny with this so there
    are a few special cases, see code)
    Write out one tsv file for each parent (ancestry-probs-par1.tsv and
    ancestry-probs-par2.tsv, prefixed with "test." when DEBUG is set).
    """
    # Combine all individuals/positions into a big dictionary (think of it
    # like a sparse table), one per parent: key is (ind, chrom, pos).
    # input_data_sets(dir) is a project helper yielding (array, ind, chrom);
    # each record x is assumed to carry 'pos', 'par1', 'par2' fields.
    dp1, dp2 = {}, {}
    for (array, ind, chrom) in input_data_sets(dir):
        print ind, chrom, len(array), "records"
        for x in array:
            key = (ind, chrom, int(x["pos"]))
            dp1[key] = x["par1"]
            dp2[key] = x["par2"]
    # GC was disabled in main() for speed; collect explicitly after the load.
    gc.collect()
    print "Done loading rdata files."
    print "Free memory is %s MB" % get_free_memory()
    # write out to files and interpolate as we go. The R code we're replacing
    # had some weird special cases so look out for those.
    for (fname, dp) in (("ancestry-probs-par1.tsv", dp1), ("ancestry-probs-par2.tsv", dp2)):
        if DEBUG:
            fname = "test." + fname
        print "Compiling data for file", fname
        # Get all positions (chrom,pos) sorted by chrom, then by position
        positions = sorted(set([(k[1], k[2]) for k in dp.keys()]))
        header = [""] + ["".join((p[0], ":", str(p[1]))) for p in positions]
        # Get all individuals, sorted
        inds = sorted(set([k[0] for k in dp.keys()]))
        # Build up each row to be written to the file (all individuals x all positions)
        outrows = []
        for ind in inds:
            print " ", ind
            # initialize/clear out bookkeeping variables:
            #   last_pos_w_val/last_val - position and value of the last
            #     datapoint seen on this chromosome
            #   to_interpolate - queue of (pos, outrow index) cells waiting
            #     for a later value to interpolate against
            last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []
            outrow = [ind]  # first column is individual name
            for (chrom, pos) in positions:
                # Handle switching to new chromosome
                if chrom != last_chrom:
                    # set any positions waiting for interpolation to 0 since
                    # we've reached the end of the chrom; however we want to
                    # leave as NA (not interpolate) anything between
                    # last_pos_w_val and end of chrom, because that's what R
                    # did.  (When last_pos_w_val is None the py2 comparison
                    # `int < None` is False, so everything stays NA.)
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            outrow[insert_loc] = "0"
                    # clear out bookkeeping vars on new chrom
                    last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []
                key = (ind, chrom, pos)
                if (key in dp) and ((dp[key] > 0.0000005) or (last_val and last_val > 0.0000005)):
                    # This condition is checking if A. data exists for this position and it's non-zero OR B. data exists and the last value seen was non-zero.
                    # These are cases were we want to use this value and last seen value to interpolate positions in the interpolation queue.
                    # Store value in outrow to be written to file
                    outrow.append("%.6f" % round(dp[key], 6))
                    # interpolate any positions waiting for a new value
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            # zero out any pending positions before the last
                            # value we saw since this is what R did.
                            outrow[insert_loc] = "0"
                        else:
                            # linear interpolation between last_val and dp[key]
                            insert_val = last_val + (
                                (dp[key] - last_val)
                                * (float(update_pos - last_pos_w_val) / (pos - last_pos_w_val))
                            )
                            outrow[insert_loc] = "%.6f" % round(insert_val, 6)
                        # since all pending positions have been interpolated, clear this out
                    to_interpolate = []
                    last_pos_w_val, last_val = pos, dp[key]
                elif last_val and not (key in dp):
                    # If a value has been seen for this chrom, we'll want to start interpolating
                    # Add a placeholder to outrow
                    outrow.append("NA")
                    # Mark position for later interpolation
                    to_interpolate.append((pos, len(outrow) - 1))
                else:  # don't interpolate
                    if key in dp:
                        # data exists for key but it's 0. Store value in outrow, but update bookkeeping vars
                        outrow.append("%.6f" % round(dp[key], 6))  # should be 0
                        # still count 0 as a last value for interpolation
                        last_pos_w_val, last_val = pos, dp[key]
                    else:
                        outrow.append("NA")
                last_chrom = chrom
            # set any positions waiting for interpolation to 0 since we've
            # reached the end of the individual; however we want to leave as
            # NA and not interpolate between last_pos_w_val and end, because
            # that's what R did.
            for (update_pos, insert_loc) in to_interpolate:
                if update_pos < last_pos_w_val:
                    outrow[insert_loc] = "0"
            outrows.append(outrow)
        # fix_values is a project helper; presumably post-processes rows
        # in place before writing — confirm against its definition.
        fix_values(outrows)
        print "Writing file", fname
        csvout = csv.writer(open(fname, "wb"), delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        csvout.writerow(header)
        csvout.writerows(outrows)
        gc.collect()
def merge(dir):  # NOTE(review): `dir` shadows the builtin; kept for interface compatibility
    """
    Combine all individuals and datapoints with one row per individual, with
    columns being chrom:position.  Interpolate missing values in some cases.
    (The R code that we're trying to replicate was funny with this so there
    are a few special cases, see code)
    Write out one tsv file for each parent (ancestry-probs-par1.tsv and
    ancestry-probs-par2.tsv, prefixed with 'test.' when DEBUG is set).
    """
    #Combine all individuals/positions into a big dictionary (think of it
    #like a sparse table), one per parent: key is (ind, chrom, pos).
    #input_data_sets(dir) is a project helper yielding (array, ind, chrom);
    #each record x is assumed to carry 'pos', 'par1', 'par2' fields.
    dp1, dp2 = {}, {}
    for (array, ind, chrom) in input_data_sets(dir):
        print ind, chrom, len(array), "records"
        for x in array:
            key = (ind, chrom, int(x['pos']))
            dp1[key] = x['par1']
            dp2[key] = x['par2']
    #GC was disabled in main() for speed; collect explicitly after the load.
    gc.collect()
    print "Done loading rdata files."
    print "Free memory is %s MB" % get_free_memory()
    #write out to files and interpolate as we go. The R code we're replacing
    #had some weird special cases so look out for those.
    for (fname, dp) in (('ancestry-probs-par1.tsv', dp1), ('ancestry-probs-par2.tsv', dp2)):
        if DEBUG:
            fname = 'test.' + fname
        print "Compiling data for file", fname
        #Get all positions (chrom,pos) sorted by chrom, then by position
        positions = sorted(set([(k[1], k[2]) for k in dp.keys()]))
        header = [''] + [''.join((p[0], ':', str(p[1]))) for p in positions]
        #Get all individuals, sorted
        inds = sorted(set([k[0] for k in dp.keys()]))
        #Build up each row to be written to the file (all individuals x all positions)
        outrows = []
        for ind in inds:
            print "  ", ind
            #initialize/clear out bookkeeping variables:
            #  last_pos_w_val/last_val - position and value of the last
            #    datapoint seen on this chromosome
            #  to_interpolate - queue of (pos, outrow index) cells waiting
            #    for a later value to interpolate against
            last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []
            outrow = [ind]  #first column is individual name
            for (chrom, pos) in positions:
                # Handle switching to new chromosome
                if chrom != last_chrom:
                    #set any positions waiting for interpolation to 0 since
                    #we've reached the end of the chrom; however we want to
                    #leave as NA (not interpolate) anything between
                    #last_pos_w_val and end of chrom, because that's what R
                    #did.  (When last_pos_w_val is None the py2 comparison
                    #`int < None` is False, so everything stays NA.)
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            outrow[insert_loc] = "0"
                    #clear out bookkeeping vars on new chrom
                    last_pos_w_val, last_val, last_chrom, to_interpolate = None, None, None, []
                key = (ind, chrom, pos)
                if (key in dp) and ((dp[key] > .0000005) or (last_val and last_val > .0000005)):
                    # This condition is checking if A. data exists for this position and it's non-zero OR B. data exists and the last value seen was non-zero.
                    # These are cases were we want to use this value and last seen value to interpolate positions in the interpolation queue.
                    # Store value in outrow to be written to file
                    outrow.append("%.6f" % round(dp[key], 6))
                    #interpolate any positions waiting for a new value
                    for (update_pos, insert_loc) in to_interpolate:
                        if update_pos < last_pos_w_val:
                            #zero out any pending positions before the last
                            #value we saw since this is what R did.
                            outrow[insert_loc] = "0"
                        else:
                            #linear interpolation between last_val and dp[key]
                            insert_val = last_val + (
                                (dp[key] - last_val)
                                * (float(update_pos - last_pos_w_val) / (pos - last_pos_w_val)))
                            outrow[insert_loc] = "%.6f" % round(insert_val, 6)
                    #since all pending positions have been interpolated, clear this out
                    to_interpolate = []
                    last_pos_w_val, last_val = pos, dp[key]
                elif last_val and not (key in dp):
                    #If a value has been seen for this chrom, we'll want to start interpolating
                    #Add a placeholder to outrow
                    outrow.append('NA')
                    #Mark position for later interpolation
                    to_interpolate.append((pos, len(outrow) - 1))
                else:  #don't interpolate
                    if key in dp:
                        #data exists for key but it's 0. Store value in outrow, but update bookkeeping vars
                        outrow.append("%.6f" % round(dp[key], 6))  #should be 0
                        #still count 0 as a last value for interpolation
                        last_pos_w_val, last_val = pos, dp[key]
                    else:
                        outrow.append('NA')
                last_chrom = chrom
            #set any positions waiting for interpolation to 0 since we've
            #reached the end of the individual; however we want to leave as
            #NA and not interpolate between last_pos_w_val and end, because
            #that's what R did.
            for (update_pos, insert_loc) in to_interpolate:
                if update_pos < last_pos_w_val:
                    outrow[insert_loc] = "0"
            outrows.append(outrow)
        #fix_values is a project helper; presumably post-processes rows
        #in place before writing — confirm against its definition.
        fix_values(outrows)
        print "Writing file", fname
        csvout = csv.writer(open(fname, 'wb'), delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        csvout.writerow(header)
        csvout.writerows(outrows)
        gc.collect()