def set_seed(seed_option):
    """
    Seed the module-level ``random`` generator and print the seed in use.

    NOTE(review): another ``set_seed`` is defined further down in this file;
    if both live in the same module the later definition replaces this one.

    :param seed_option: value accepted by ``int()``. ``0`` asks for a seed
        drawn with ``random.randint(1, 32000)``; any positive value is used
        as the seed itself.
    :return: None
    :raises ValueError: if ``seed_option`` is negative, or cannot be
        converted by ``int()``.
    """
    seed_option = int(seed_option)
    if seed_option < 0:
        # A negative option used to skip both branches below and crash with
        # UnboundLocalError on tmp_seed; fail with a clear message instead.
        raise ValueError(
            "seed_option must be 0 (random seed) or a positive integer, got {}".format(seed_option))

    if seed_option == 0:
        tmp_seed = random.randint(1, 32000)
    else:
        tmp_seed = seed_option

    print("Current Seed: {}".format(tmp_seed))
    random.seed(tmp_seed)
def populate_sizes(flags):
    """
    Build the list of per-population sizes from the ``-I`` flag.

    The sizes are every entry after the first one in ``flags["-I"][0]``,
    converted to ``int``. When a ``-discovery`` flag is present, each
    population it lists (1-based) is enlarged: by a random amount between 2
    and its current size if ``"True"`` appears in
    ``flags['-random_discovery'][0]``, otherwise by its current size
    (i.e. it is doubled).

    :param flags: mapping of flag name to a list of argument lists
    :return: list of int sizes
    """
    sizes = [int(count) for count in flags["-I"][0][1:]]
    if '-discovery' not in flags:
        return sizes

    for pop_label in flags["-discovery"][0]:
        pop_index = int(pop_label) - 1  # flag values are 1-based
        if "True" in flags['-random_discovery'][0]:
            extra = random.randint(2, sizes[pop_index])
        else:
            extra = sizes[pop_index]
        sizes[pop_index] += extra
    return sizes
def prior_to_param_value(input_param_str):
    """
    Draw a random value from a prior written as ``(low:high)``.

    The text before the first ``:`` (minus its first character) is the lower
    bound and the text after it (minus its last character) is the upper
    bound. Each bound is passed through ``sci_to_float`` and ``float`` before
    a value is drawn with ``random.uniform``.

    :param input_param_str: prior string of the form ``(###:###)``
    :return: the drawn value, as a string
    """
    assert isinstance(input_param_str, str), "priorToParamValue called without a string"

    # Lower bound: drop the leading bracket character.
    lower_text = input_param_str.split(":")[0][1:]
    lower = float(sci_to_float(lower_text))

    # Upper bound: drop the trailing bracket character.
    upper_text = input_param_str.split(":")[1][:-1]
    upper = float(sci_to_float(upper_text))

    return str(random.uniform(lower, upper))
def getUnscaledValue(variables, tempNum, tempLow=False):
    """
    Resolve a parameter to a concrete value, returned as a string.

    If ``tempNum`` is a key of ``variables`` its stored definition is used
    and, once resolved, the result is written back into ``variables`` under
    that key. A definition without ``:`` is passed through ``sci_to_float``.
    A definition with ``:`` is treated as a ``(low:high)`` range and a value
    is drawn with ``random.uniform``; the effective lower bound is the larger
    of the range's own lower bound and ``float(tempLow)`` (0.0 for the
    default ``False``).

    :param variables: dict of variable name -> definition string; may be
        updated in place
    :param tempNum: a variable name, or a literal/range string
    :param tempLow: extra lower bound applied to ranges
    :return: the resolved value as a string
    """
    resolved_key = ""
    if tempNum in variables.keys():
        resolved_key = tempNum
        tempNum = variables[tempNum]

    definition = tempNum.strip()
    if ":" in definition:
        # This means you want range
        pieces = definition.split(":")
        lower = max(float(sci_to_float(pieces[0][1:])), float(tempLow))
        upper = sci_to_float(pieces[1][:-1])
        result = str(random.uniform(float(lower), float(upper)))
    else:
        result = str(sci_to_float(definition))

    if resolved_key in variables.keys():
        variables[resolved_key] = result
    return result
def get_param_value_bounded(temp_num, temp_low):
    """
    Resolve a literal or ``(low:high)`` range string to a value string.

    A string without ``:`` is passed through ``sci_to_float``. A string with
    ``:`` is treated as a range: a value is drawn with ``random.uniform``
    between the larger of the range's lower bound and ``float(temp_low)``,
    and the range's upper bound.

    :param temp_num: literal or range string; surrounding whitespace is ignored
    :param temp_low: extra lower bound applied to ranges
    :return: the resolved value as a string
    """
    definition = temp_num.strip()
    if ":" not in definition:
        return str(sci_to_float(definition))

    # This means you want range
    pieces = definition.split(":")
    lower = max(float(sci_to_float(pieces[0][1:])), float(temp_low))
    upper = sci_to_float(pieces[1][:-1])
    return str(random.uniform(float(lower), float(upper)))
def pseudo_array(asc_panel, daf, pos, snps):
    """
    Choose which simulated sites of the ascertainment panel form the array.

    ``asc_panel`` is transposed so that each entry holds one site across all
    chromosomes. A site is "available" when the fraction of ``'1'`` entries
    lies in ``[daf, 1 - daf]``. For every array SNP position in ``snps`` an
    available site is then picked via ``find2``; duplicates are removed and
    the selection is topped up (``add_snps``, then random draws) until it has
    ``len(snps)`` entries.

    :param asc_panel: sequence of chromosomes, each a sequence of ``'0'``/``'1'``
    :param daf: frequency cut-off
    :param pos: position of every simulated site, indexed like the sites
    :param snps: positions of the array SNPs
    :return: tuple ``(pos_asc, nbss_asc, index_avail_sites, avail_sites)``:
        the chosen indices, their count, the site indices that passed the
        cut-off, and the positions of those sites. ``pos_asc`` indexes into
        ``avail_sites``, except when the two counts match exactly, in which
        case it is ``index_avail_sites`` itself (same list object).
    :raises SystemExit: (status 1) when fewer sites pass the cut-off than
        there are array SNPs.
    """
    import sys  # function-scope import: only the fatal-error path needs it

    # zip() is lazy on Python 3; materialise it so len() and indexing work.
    Tasc_panel = list(zip(*asc_panel))
    print( 'number of sites in Tasc_panel:', len(Tasc_panel))
    print( 'number of chromosomes in Tasc_panel:', len(Tasc_panel[0]))

    nb_chromosomes = len(asc_panel)
    ##positions of the sites that pass the frequency cut-off
    avail_sites = []
    ##indexes of the sites that pass the frequency cut-off
    index_avail_sites = []
    for n, site_column in enumerate(Tasc_panel):
        freq_site = float(site_column.count('1')) / float(nb_chromosomes)
        if daf <= freq_site <= 1 - daf:
            avail_sites.append(pos[n])
            index_avail_sites.append(n)

    nb_avail_sites = len(avail_sites)
    # Was referenced below without ever being assigned (NameError).
    nb_array_snps = len(snps)

    if nb_avail_sites < nb_array_snps:
        # Previously neither branch ran and the return statement crashed on
        # unbound names; report the problem the way pseudo_array_bits does.
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        sys.exit(1)

    if nb_avail_sites == nb_array_snps:
        debugPrint(3,"number of avail_sites is equal to the number of Array snps")
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
    else:
        debugPrint(3,"number of avail_sites greater than number of Array snps")
        ##indexes (into avail_sites) chosen for each array SNP
        pos_asc = [None] * nb_array_snps
        for i in range(nb_array_snps):
            ##find the closest available site for this array SNP position
            closestleft = find2(avail_sites, snps[i])
            if i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < nb_avail_sites:
                ##avoid duplicates: move one position to the right
                closestleft = closestleft + 1
            elif i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < nb_avail_sites:
                ##keep the selection increasing
                closestleft = pos_asc[i - 1] + 1
            ##a duplicate left here means there was no site further right
            pos_asc[i] = closestleft

        #####smoothing
        # NOTE(review): the original loop ran j over range(0, last) but its
        # condition required j == last - 1, so only the second-to-last entry
        # is ever examined. That behaviour is kept as-is.
        j = len(pos_asc) - 2
        if (j >= 0 and pos_asc[j] + 1 < pos_asc[j + 1]
                and pos_asc[j] < (nb_avail_sites - 1) and (j + 1) < nb_avail_sites):
            d1 = abs(snps[j] - avail_sites[pos_asc[j]])
            d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
            if d2 < d1:
                pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = sorted(set(pos_asc))
        nbss_asc = len(pos_asc)

        if nbss_asc == nb_array_snps:
            flag_nb_asc_snps = 1
            debugPrint(3,'Number of asc snps equal to nb array snps')
        else:
            flag_nb_asc_snps = 0
            debugPrint(3,'Number of asc snps not equal to nb array snps')
            diff = nb_array_snps - nbss_asc
            for m in range(1, diff + 1):
                pos_asc = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                nbss_asc = len(pos_asc)
                if nbss_asc == nb_array_snps:
                    flag_nb_asc_snps = 1
                    break

        if flag_nb_asc_snps == 0:
            ##add_snps could not fill the gap: draw unused indexes at random
            while len(pos_asc) != nb_array_snps:
                rand_numb = random.randint(0, nb_avail_sites - 1)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)

    print( 'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
def pseudo_array_bits(asc_panel_bits, daf, pos, snps):
    '''
    Choose which simulated sites of the ascertainment panel form the array.

    ``asc_panel_bits`` is read as ``len(pos)`` consecutive chunks of equal
    size, one chunk per site. A site is "available" when the fraction of
    ``1`` bits in its chunk lies in ``[daf, 1 - daf]``. For every array SNP
    position in ``snps`` an available site is then picked via ``find2``;
    duplicates are removed and the selection is topped up (``add_snps``,
    then random draws) until it has ``len(snps)`` entries.

    Parameters:
        asc_panel_bits: bitarray (any sequence supporting ``len()``, slicing
            and ``.count(1)``)
        daf: float frequency cut-off
        pos: list of floats in ascending order, one per site
        snps: list of array SNP positions

    Returns:
        pos_asc: list of ints; indexes into avail_sites, except when the two
            counts match exactly, in which case it is index_avail_sites
            itself (same list object)
        nbss_asc: len(pos_asc)
        index_avail_sites: indexes of the sites that pass the cut-off
        avail_sites: positions of the sites that pass the cut-off

    Errors:
        - the length of asc_panel_bits needs to be divisible by len(pos)
        - daf cannot be negative or greater than 1
        - exits (status 1) when fewer sites pass the cut-off than there are
          array SNPs
    '''
    # len() replaces bitarray's deprecated .length() method.
    total_bits = len(asc_panel_bits)
    n = total_bits // len(pos)  ##number of bits (chromosomes) per site

    ##positions of the sites that pass the frequency cut-off
    avail_sites = []
    ##indexes of the sites that pass the frequency cut-off
    index_avail_sites = []
    for i, site in enumerate(range(0, total_bits, n)):
        freq_site = float(asc_panel_bits[site:site + n].count(1) / float(n))
        if daf <= freq_site <= 1 - daf:
            avail_sites.append(pos[i])
            index_avail_sites.append(i)

    nb_avail_sites = len(avail_sites)
    nb_array_snps = len(snps)

    if nb_avail_sites < nb_array_snps:
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        # Fatal error: report a non-zero status (bare sys.exit() reported 0).
        sys.exit(1)

    if nb_avail_sites == nb_array_snps:
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
    else:
        ##indexes (into avail_sites) chosen for each array SNP
        pos_asc = [None] * nb_array_snps
        for i in range(nb_array_snps):
            ##find the closest available site for this array SNP position
            closestleft = find2(avail_sites, snps[i])
            if i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < nb_avail_sites:
                ##avoid duplicates: move one position to the right
                closestleft = closestleft + 1
            elif i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < nb_avail_sites:
                ##keep the selection increasing
                closestleft = pos_asc[i - 1] + 1
            ##a duplicate left here means there was no site further right
            pos_asc[i] = closestleft

        #####smoothing
        # NOTE(review): the original loop ran j over range(0, last) but its
        # condition required j == last - 1, so only the second-to-last entry
        # is ever examined. That behaviour is kept as-is.
        j = len(pos_asc) - 2
        if (j >= 0 and pos_asc[j] + 1 < pos_asc[j + 1]
                and pos_asc[j] < (nb_avail_sites - 1) and (j + 1) < nb_avail_sites):
            d1 = abs(snps[j] - avail_sites[pos_asc[j]])
            d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
            if d2 < d1:
                pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = sorted(set(pos_asc))
        nbss_asc = len(pos_asc)

        if nbss_asc == nb_array_snps:
            flag_nb_asc_snps = 1
            debugPrint(3,'nb of asc snps equal to nb array snps')
        else:
            flag_nb_asc_snps = 0
            diff = nb_array_snps - nbss_asc
            for m in range(1, diff + 1):
                pos_asc = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                nbss_asc = len(pos_asc)
                if nbss_asc == nb_array_snps:
                    flag_nb_asc_snps = 1
                    break

        if flag_nb_asc_snps == 0:
            ##add_snps could not fill the gap: draw unused indexes at random
            while len(pos_asc) != nb_array_snps:
                rand_numb = random.randint(0, nb_avail_sites - 1)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)

    return pos_asc, nbss_asc, index_avail_sites, avail_sites
def processModelData(variables, modelData):
    """
    Turn the parsed model description into run settings and an argument list.

    ``populateFlags(variables, modelData)`` supplies ``flags``, a mapping of
    flag name -> list of argument lists. From it this function builds:

    * ``macs_args``: starts as ``[program entry, length, "-I", n_pops]``,
      gets the total sample size inserted at index 1, then the
      per-population sizes, then every pass-through flag with its arguments,
      and finally the time-based ``-e*`` events sorted by their time.
    * bookkeeping entries copied out of the flags (see Returns).

    Parameter arguments are resolved with ``getUnscaledValue`` and rescaled
    with ``Ne = findScaleValue(flags, variables)``; rates are multiplied by
    ``4 * Ne``, sizes divided by ``Ne``, event times rounded and divided by
    ``4 * Ne``. The argument lists inside ``flags`` are modified in place.

    :param variables: dict of variable name -> definition, passed on to
        ``populateFlags``, ``findScaleValue`` and ``getUnscaledValue``
    :param modelData: model description passed on to ``populateFlags``
    :return: dict that always has ``"macs_args"`` and ``"name"`` and, when
        the matching flag is present, ``"discovery"``, ``"sample"``,
        ``"seed"``, ``"daf"``, ``"length"``, ``"macs"``, ``"I"`` and
        ``"macsswig"``
    :raises SystemExit: (status 1) on a malformed flag (``IndexError`` while
        reading its arguments) or when only some of discovery/sample/daf
        are given.
    """
    debugPrint(2, "Starting: processModelData")
    processedData = {}

    flags = populateFlags(variables, modelData)

    if '-macs_file' in flags:
        macs_args = [flags['-macs_file'][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macsswig' in flags:
        macs_args = [flags['-macsswig'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macs' in flags:
        macs_args = [flags['-macs'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]

    # Per-population sample sizes; discovery populations are enlarged.
    sizes = [int(size) for size in flags["-I"][0][1:]]
    if '-discovery' in flags:
        for discovery_pop_str in flags["-discovery"][0]:
            discovery_pop = int(discovery_pop_str) - 1  # flag values are 1-based
            if "True" in flags['-random_discovery'][0]:
                sizes[discovery_pop] += random.randint(2, sizes[discovery_pop])
            else:
                sizes[discovery_pop] += sizes[discovery_pop]

    total = float(sum(sizes))
    macs_args.insert(1, str(total))
    macs_args.extend([str(size) for size in sizes])

    # seasons is all the time based events
    seasons = []

    Ne = findScaleValue(flags, variables)

    def resolve(line, index):
        # Replace one argument by its concrete (unscaled) value.
        line[index] = getUnscaledValue(variables, line[index])

    def scale_rate(line, index):
        # Resolve one argument, then multiply it by 4*Ne.
        resolve(line, index)
        line[index] = str(float(4*(float(line[index])*Ne)))

    def scale_size(line, index):
        # Resolve one argument, then divide it by Ne.
        resolve(line, index)
        line[index] = str(float((float(line[index])/Ne)))

    #----------------------- For Added Arguments from Model_CSV
    ignoredFlags = ["-germline", "-array", "-nonrandom_discovery", "-random_discovery", "-pedmap"]

    debugPrint(3,"Processing flags in for macs_args")
    for flag in flags.keys():
        debugPrint(3," {}: {}".format(flag,flags[flag]))

        for tempLine in flags[flag]:
            try:
                # Flags that only feed processedData and never reach macs_args.
                if flag == "-discovery":
                    processedData['discovery'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-sample":
                    processedData['sample'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-s":
                    # no 'continue': the seed is also forwarded in macs_args
                    processedData['seed'] = tempLine[0]
                if flag == "-daf":
                    processedData['daf'] = float(getUnscaledValue(variables, tempLine[0]))
                    continue
                if flag == "-length":
                    processedData['length'] = tempLine[0]
                    continue
                if flag == "-macs":
                    processedData['macs'] = tempLine[0]
                    continue
                if flag == "-I":
                    processedData["I"] = [int(s.strip()) for s in tempLine[1:] if s]
                    continue
                if flag == "-macsswig":
                    processedData['macsswig'] = tempLine[0]
                    continue
                if flag == "-n":
                    # record the raw second argument before it is rescaled below
                    tmp = processedData.get('name', [])
                    tmp.append(tempLine[1])
                    processedData['name'] = tmp

                if flag in ignoredFlags:
                    continue

                if flag == "-Ne":
                    resolve(tempLine, 0)

                if flag == "-em":
                    scale_rate(tempLine, 3)
                elif flag == "-eM" or flag == "-g":
                    scale_rate(tempLine, 1)
                elif flag == "-ema":
                    for i in range(2, len(tempLine)):
                        scale_rate(tempLine, i)
                elif flag == "-eN" or flag == "-n":
                    scale_size(tempLine, 1)
                elif flag == "-en":
                    scale_size(tempLine, 2)
                elif flag == "-eg":
                    scale_rate(tempLine, 2)
                elif flag == "-es":
                    resolve(tempLine, 2)
                elif flag == "-m":
                    scale_rate(tempLine, 2)
                elif flag == "-ma":
                    for i in range(len(tempLine)):
                        scale_rate(tempLine, i)
                elif flag == "-t" or flag == "-r" or flag == "-G":
                    # both <m> <r> <alpha> have same scaling factor
                    scale_rate(tempLine, 0)

                if flag.startswith('-e'):
                    # all <t>'s are scaled
                    resolve(tempLine, 0)
                    tempLine[0] = str(round(float(tempLine[0]))/(4*Ne))
                    seasons.append([flag] + tempLine)
                else:
                    macs_args.append(flag.strip())
                    for subLine in tempLine:
                        macs_args.append(subLine.strip())
            except IndexError:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                # Fatal error: report a non-zero status (bare sys.exit() reported 0).
                sys.exit(1)

    if '-n' not in flags:
        # No explicit names: number the populations 1..n_pops.
        processedData['name'] = list(range(1, int(flags['-I'][0][0]) + 1))

    have_discovery = bool(processedData.get('discovery'))
    have_sample = bool(processedData.get('sample'))
    have_daf = bool(processedData.get('daf'))
    if not (have_discovery and have_sample and have_daf):
        if not have_discovery and not have_sample and not have_daf:
            debugPrint(2, "discovery, sample, and daf are all missing")
        else:
            print("discovery, sample, or daf is missing")
            # quit() is a site-module helper meant for interactive use and
            # exits with status 0; use sys.exit with an error status.
            sys.exit(1)

    debugPrint(2, "Adding events data back to flag pool")
    # Sort the events numerically by time, then turn the times back into text.
    for season in seasons:
        season[1] = float(season[1])
    seasons = sorted(seasons, key=itemgetter(1))
    for season in seasons:
        season[1] = str(season[1])
    for season in seasons:
        macs_args.extend(season)

    processedData["macs_args"] = macs_args
    return processedData
def set_seed(seed_option):
    """
    Seed the module-level ``random`` generator.

    :param seed_option: value accepted by ``int()``. A positive value is used
        as the seed; ``0`` reseeds via ``random.seed()`` with no argument;
        a negative value leaves the generator untouched.
    :return: None
    """
    seed_value = int(seed_option)
    if seed_value > 0:
        random.seed(seed_value)
    elif seed_value == 0:
        random.seed()