예제 #1
0
def set_seed(seed_option):
    """
    Seed the module-level ``random`` generator and print the seed that was used.

    :param seed_option: anything ``int()`` accepts. ``0`` means "pick a seed
        for me" (drawn with ``random.randint(1, 32000)``); a positive value
        is used as the seed itself.
    :return: None
    :raises ValueError: if ``seed_option`` is negative or cannot be converted
        to an integer.
    """
    seed_option = int(seed_option)

    # A negative value used to leave the seed variable unbound and crash with
    # UnboundLocalError at the print below; fail with a clear message instead.
    if seed_option < 0:
        raise ValueError(
            "seed must be 0 (random seed) or a positive integer, got {}".format(seed_option))

    if seed_option == 0:
        tmp_seed = random.randint(1, 32000)
    else:
        tmp_seed = seed_option

    # The seed is printed so that a run can be reproduced later.
    print("Current Seed: {}".format(tmp_seed))
    random.seed(tmp_seed)
예제 #2
0
def populate_sizes(flags):
    """
    Return the per-population sizes from the ``-I`` flag, enlarged for every
    discovery population.

    The sizes are every entry after the first one in ``flags["-I"][0]``,
    converted to int. For each 1-based population number listed in
    ``flags["-discovery"][0]`` the size is increased: by a random integer
    between 2 and the current size when ``"True"`` is found in
    ``flags['-random_discovery'][0]``, otherwise by the current size itself
    (i.e. it is doubled).

    :param flags: mapping of flag name to a list of argument lists
    :return: list of ints, one per population
    """
    pop_sizes = [int(count) for count in flags["-I"][0][1:]]

    if '-discovery' not in flags:
        return pop_sizes

    for pop_label in flags["-discovery"][0]:
        idx = int(pop_label) - 1  # population numbers are 1-based
        if "True" in flags['-random_discovery'][0]:
            extra = random.randint(2, pop_sizes[idx])
        else:
            extra = pop_sizes[idx]
        pop_sizes[idx] += extra

    return pop_sizes
예제 #3
0
def prior_to_param_value(input_param_str):
    """
    Draw one uniformly distributed value from a prior written as "(low:high)".

    The text before the first colon, minus its first character, is the lower
    bound; the text after it, minus its last character, is the upper bound.
    Both bounds are passed through ``sci_to_float`` (presumably so that
    scientific notation is accepted as well as plain decimals).

    :param input_param_str: the prior, as a string
    :return: the sampled value, as a string
    """

    assert isinstance(input_param_str, str), "priorToParamValue called without a string"

    bounds = input_param_str.split(":")
    lower = float(sci_to_float(bounds[0][1:]))   # strip the leading "("
    upper = float(sci_to_float(bounds[1][:-1]))  # strip the trailing ")"

    return str(random.uniform(lower, upper))
예제 #4
0
def getUnscaledValue(variables, tempNum, tempLow=False):
    """
    Resolve a model argument to a concrete value, returned as a string.

    ``tempNum`` is either a literal or the name of an entry in ``variables``.
    After resolving a variable name and stripping whitespace, the text is
    either

    * a single number, which is normalised through ``sci_to_float``, or
    * a range containing ":" (first and last character are dropped, so a form
      such as "(low:high)" is expected), from which a value is drawn uniformly.
      The lower bound of the draw is raised to ``tempLow`` when that is larger.

    When ``tempNum`` named a variable, the result is written back into
    ``variables`` so that later look-ups of the same name get the same value
    rather than a fresh draw.

    :param variables: mapping of variable name to its (string) definition;
        modified in place as described above
    :param tempNum: literal value, range, or variable name
    :param tempLow: optional floor for the lower bound of a range; the default
        ``False`` converts to 0.0
    :return: the resolved value as a string
    """
    # Track explicitly whether the argument was a variable name. The previous
    # "" sentinel caused a write to variables[""] for plain literals whenever
    # the mapping happened to contain an empty-string key.
    isVariable = tempNum in variables
    varName = tempNum
    if isVariable:
        tempNum = variables[varName]

    tempNum = tempNum.strip()
    if ":" not in tempNum:
        returnValue = str(sci_to_float(tempNum))
    else:
        # This means you want range
        parts = tempNum.split(":")
        tempLow = max(float(sci_to_float(parts[0][1:])), float(tempLow))
        tempHigh = sci_to_float(parts[1][:-1])
        returnValue = str(random.uniform(float(tempLow), float(tempHigh)))

    if isVariable:
        variables[varName] = returnValue
    return returnValue
예제 #5
0
def get_param_value_bounded(temp_num, temp_low):
    """
    Turn a parameter definition into a concrete value, returned as a string.

    A definition without ":" is a single number and is normalised through
    ``sci_to_float``. A definition with ":" is a range whose first and last
    characters are dropped; a value is drawn uniformly from it, with the lower
    bound raised to ``temp_low`` when that is larger.

    :param temp_num: the definition string (surrounding whitespace is ignored)
    :param temp_low: floor applied to the lower bound of a range
    :return: the value as a string
    """

    definition = temp_num.strip()

    # Single value: nothing to sample.
    if ":" not in definition:
        return str(str(sci_to_float(definition)))

    # Range: clamp the lower bound, then sample.
    pieces = definition.split(":")
    lower = max(float(sci_to_float(pieces[0][1:])), float(temp_low))
    upper = sci_to_float(pieces[1][:-1])
    return str(random.uniform(float(lower), float(upper)))
예제 #6
0
def pseudo_array(asc_panel, daf, pos, snps):
    """
    Choose simulated sites from the ascertainment panel to act as the SNP array.

    Parameters:
    asc_panel: sequence of chromosomes; each chromosome is a sequence with one
        entry per simulated site, and the entry '1' is the allele that is
        counted for the frequency
    daf: frequency cut-off; a site is available when daf <= freq <= 1 - daf
    pos: positions of the simulated sites, indexed like the sites
    snps: positions of the array SNPs that should be approximated

    Returns: (pos_asc, nbss_asc, index_avail_sites, avail_sites)
    pos_asc: the chosen indexes. When the number of available sites equals
        len(snps) this is index_avail_sites itself (indexes into the simulated
        sites); otherwise it is a sorted list of indexes into avail_sites.
        NOTE(review): the two cases index different lists - confirm callers
        expect that.
    nbss_asc: len(pos_asc)
    index_avail_sites: indexes of the simulated sites that pass the cut-off
    avail_sites: positions of those sites

    Errors:
    - exits through sys.exit() when fewer sites pass the cut-off than there
      are array SNPs
    """
    # zip() is an iterator on Python 3; materialise it because the transposed
    # panel is measured with len() and indexed below.
    Tasc_panel = list(zip(*asc_panel))
    print( 'number of sites in Tasc_panel:', len(Tasc_panel))
    print( 'number of chromosomes in Tasc_panel:', len(Tasc_panel[0]) if Tasc_panel else 0)

    #######Array with the available sites given the frequency cut off
    nb_chromosomes = float(len(asc_panel))
    avail_sites = []  ##this one has the positions of the snps
    index_avail_sites = []  ##this one has the indexes of the snps
    for n, site in enumerate(Tasc_panel):
        freq_site = float(site.count('1')) / nb_chromosomes
        if daf <= freq_site <= 1 - daf:
            avail_sites.append(pos[n])
            index_avail_sites.append(n)
    nb_avail_sites = len(avail_sites)
    nb_array_snps = len(snps)

    if nb_avail_sites < nb_array_snps:
        # Without this guard pos_asc was never bound and the return statement
        # failed with UnboundLocalError. Same handling as pseudo_array_bits.
        import sys
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        sys.exit()

    if nb_avail_sites == nb_array_snps:
        debugPrint(3,"number of avail_sites is equal to the number of Array snps")
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
    else:
        debugPrint(3,"number of avail_sites greater than number of Array snps")
        pos_asc = [None] * nb_array_snps  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(nb_array_snps):  # each snp on the snp array on a chromosome
            ## y is the position of the SNPs in the array
            y = snps[i]
            ##find the closest SNP in the array
            ## (presumably find2 returns an index into avail_sites - verify against find2)
            closestleft = find2(avail_sites, y)
            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < nb_avail_sites):  ##avoid duplicates
                closestleft = closestleft + 1  ##move one position to the right
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < nb_avail_sites):
                ##keep the chosen indexes increasing
                closestleft = pos_asc[i - 1] + 1
            ###if there are duplicates at this point, it means that there were not any more snps to choose from
            ###closestleft+1 or pos_asc[i-1]+1 == len(avail_sites)
            pos_asc[i] = closestleft

        #####smoothing
        ##last index of the pos_asc
        i = len(pos_asc) - 1
        ##check if there is another position that might work better
        ## NOTE(review): because of the j == i - 1 test only the second-to-last
        ## entry is ever reconsidered - confirm that this is intended.
        for j in range(0, i):
            if (j == i - 1 and pos_asc[j] + 1 < pos_asc[j + 1] and pos_asc[j] < (nb_avail_sites - 1) and (
                        j + 1) < nb_avail_sites):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = (list(set(pos_asc)))
        pos_asc.sort()

        nbss_asc = len(pos_asc)

        if nb_array_snps == nbss_asc:
            flag_nb_asc_snps = 1
            debugPrint(3,'Number of asc snps equal to nb array snps')
        else:
            flag_nb_asc_snps = 0
            debugPrint(3,'Number of asc snps not equal to nb array snps')
            diff = nb_array_snps - nbss_asc
            ##ask add_snps for more sites, at most once per missing snp
            for m in range(1, diff + 1):
                pos_asc = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                nbss_asc = len(pos_asc)
                if nbss_asc == nb_array_snps:
                    flag_nb_asc_snps = 1
                    break

        if (flag_nb_asc_snps == 0):  ##it means that the 1st index in pos_asc is 0; and the last is len(avail_sites)-1
            ##fill what is still missing with random, not yet used, indexes
            while (len(pos_asc) != nb_array_snps):
                rand_numb = random.randint(0, nb_avail_sites - 1)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)
    print( 'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
예제 #7
0
def pseudo_array_bits(asc_panel_bits, daf, pos, snps):
    '''
    Choose simulated sites from the ascertainment panel to act as the SNP array.

    Parameters:
    asc_panel_bits: the panel as one flat bit sequence (a bitarray, or any
        sequence that supports len(), slicing and .count(1)); consecutive
        groups of len(asc_panel_bits) // len(pos) bits belong to one site
    daf: float; a site is available when daf <= freq <= 1 - daf
    pos: list of floats in ascending order, one position per site
    snps: list of array SNP positions that should be approximated

    Returns: (pos_asc, nbss_asc, index_avail_sites, avail_sites)
    pos_asc: list of ints. When the number of available sites equals
        len(snps) this is index_avail_sites itself (indexes into the simulated
        sites); otherwise it is a sorted list of indexes into avail_sites.
        NOTE(review): the two cases index different lists - confirm callers
        expect that.
    nbss_asc: len(pos_asc)
    index_avail_sites: indexes of the simulated sites that pass the cut-off
    avail_sites: list of floats, the positions of those sites

    Errors:
    - the length of asc_panel_bits needs to be divisible by len(pos)
    - daf cannot be negative or greater than 1
    - exits through sys.exit() when fewer sites pass the cut-off than there
      are array SNPs
    '''
    # len() replaces the bitarray-only .length() method, so plain sequences
    # work too. n is the number of bits (chromosomes) per site.
    total_bits = len(asc_panel_bits)
    n = total_bits // len(pos)

    #######Array with the available sites given the frequency cut off
    avail_sites = []  ##this one has the positions of the snps
    index_avail_sites = []  ##this one has the indexes of the snps
    for i, site in enumerate(range(0, total_bits, n)):
        freq_site = float(asc_panel_bits[site:site + n].count(1) / float(n))
        if daf <= freq_site <= 1 - daf:
            avail_sites.append(pos[i])
            index_avail_sites.append(i)
    nb_avail_sites = len(avail_sites)
    nb_array_snps = len(snps)

    if nb_avail_sites < nb_array_snps:
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        sys.exit()

    if nb_avail_sites == nb_array_snps:
        #debugPrint(3,"Number of avail_sites is equal to the number of Array snps")
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)

    else:
        #debugPrint(3,"Number of avail_sites greater than number of Array snps")
        pos_asc = [None] * nb_array_snps  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(nb_array_snps):  # each snp on the snp array on a chromosome
            ## y is the position of the SNPs in the array
            y = snps[i]
            ##find the closest SNP in the array
            ## (presumably find2 returns an index into avail_sites - verify against find2)
            closestleft = find2(avail_sites, y)
            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < nb_avail_sites):  ##avoid duplicates
                closestleft = closestleft + 1  ##move one position to the right
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < nb_avail_sites):
                ##keep the chosen indexes increasing
                closestleft = pos_asc[i - 1] + 1
            ###if there are duplicates at this point, it means that there were not any more snps to choose from
            ###closestleft+1 or pos_asc[i-1]+1 == len(avail_sites)
            pos_asc[i] = closestleft

        #####smoothing
        ##last index of the pos_asc
        i = len(pos_asc) - 1

        ##check if there is another position that might work better
        ## NOTE(review): because of the j == i - 1 test only the second-to-last
        ## entry is ever reconsidered - confirm that this is intended.
        for j in range(0, i):
            if (j == i - 1 and pos_asc[j] + 1 < pos_asc[j + 1] and pos_asc[j] < (nb_avail_sites - 1) and (
                        j + 1) < nb_avail_sites):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = (list(set(pos_asc)))
        pos_asc.sort()

        nbss_asc = len(pos_asc)

        if nb_array_snps == nbss_asc:
            flag_nb_asc_snps = 1
            debugPrint(3,'nb of asc snps equal to nb array snps')
        else:
            flag_nb_asc_snps = 0
            #debugPrint(3,'nb of asc snps not equal to nb array snps')
            diff = nb_array_snps - nbss_asc
            ##ask add_snps for more sites, at most once per missing snp
            for m in range(1, diff + 1):
                pos_asc = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                nbss_asc = len(pos_asc)
                if nbss_asc == nb_array_snps:
                    flag_nb_asc_snps = 1
                    break

        if (flag_nb_asc_snps == 0):  ##it means that the 1st index in pos_asc is 0; and the last is len(avail_sites)-1
            ##fill what is still missing with random, not yet used, indexes
            while (len(pos_asc) != nb_array_snps):
                rand_numb = random.randint(0, nb_avail_sites - 1)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)
    #debugPrint(2,'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
예제 #8
0
def processModelData(variables, modelData):
    """
    Build the macs argument list and the run metadata from the model data.

    The flags come from ``populateFlags(variables, modelData)``; each flag maps
    to a list of argument lists. The function

    * starts ``macs_args`` from the program entry (``-macs_file``,
      ``-macsswig`` or ``-macs``), the total sample size, ``-length``, ``-I``,
      the population count and the per-population sizes (discovery
      populations are enlarged first),
    * resolves every flag argument with ``getUnscaledValue`` and rescales it
      with the value returned by ``findScaleValue``,
    * collects all flags starting with ``-e`` separately, sorts them by their
      first (time) argument and appends them after all other flags.

    :param variables: mapping handed to ``populateFlags``, ``findScaleValue``
        and ``getUnscaledValue``
    :param modelData: model description handed to ``populateFlags``
    :return: dict with ``macs_args`` and ``name`` plus, depending on the flags
        present, ``discovery``, ``sample``, ``seed``, ``daf``, ``length``,
        ``macs``, ``I`` and ``macsswig``
    :raises ValueError: if none of ``-macs_file``, ``-macsswig`` or ``-macs``
        is present in the flags

    The process exits (SystemExit) when a flag has too few arguments, or when
    only some of discovery / sample / daf are given.
    """
    debugPrint(2, "Starting: processModelData")
    processedData = {}

    flags = populateFlags(variables, modelData)

    if '-macs_file' in flags:
        # NOTE(review): unlike the two branches below this takes the whole
        # first entry, not its first element - confirm that is intended.
        macs_args = [flags['-macs_file'][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macsswig' in flags:
        macs_args = [flags['-macsswig'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macs' in flags:
        macs_args = [flags['-macs'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    else:
        # macs_args used to stay unbound here, which surfaced later as an
        # UnboundLocalError far away from the actual cause.
        raise ValueError("model data must contain one of -macs_file, -macsswig or -macs")

    # Per-population sizes follow the population count in the -I arguments.
    sizes = [int(size) for size in flags["-I"][0][1:]]
    if '-discovery' in flags:
        for discovery_pop_str in flags["-discovery"][0]:
            discovery_pop = int(discovery_pop_str)-1  # population numbers are 1-based
            if "True" in flags['-random_discovery'][0]:
                sizes[discovery_pop] += random.randint(2, sizes[discovery_pop])
            else:
                sizes[discovery_pop] += sizes[discovery_pop]
    total = float(sum(sizes))
    macs_args.insert(1,str(total))
    macs_args.extend([str(size) for size in sizes])

    # seasons is all the time based events
    seasons = []

    #----------------------- For Added Arguments from Model_CSV
    # These flags are not passed on to macs.
    ignoredFlags = ["-germline",
                    "-array",
                    "-nonrandom_discovery",
                    "-random_discovery",
                    "-pedmap"]

    Ne = findScaleValue(flags, variables)
    # processOrderedSeasons(flags, variables)
    debugPrint(3,"Processing flags in for macs_args")
    for flag in flags.keys():
        debugPrint(3,"  {}: {}".format(flag,flags[flag]))

        for tempLine in flags[flag]:
            try:
                # Flags that only feed processedData and are not forwarded.
                if flag == "-discovery":
                    processedData['discovery'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-sample":
                    processedData['sample'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-s":
                    # no continue: the seed is recorded and still forwarded below
                    processedData['seed'] = tempLine[0]
                if flag == "-daf":
                    processedData['daf'] = float(getUnscaledValue(variables, tempLine[0]))
                    continue
                if flag == "-length":
                    processedData['length'] = tempLine[0]
                    continue
                if flag == "-macs":
                    processedData['macs'] = tempLine[0]
                    continue
                if flag == "-I":
                    processedData["I"] = [int(s.strip()) for s in tempLine[1:] if s]
                    continue
                if flag == "-macsswig":
                    processedData['macsswig'] = tempLine[0]
                    continue
                if flag == "-n":
                    # no continue: the entry is recorded and still scaled below
                    tmp = processedData.get('name', [])
                    tmp.append(tempLine[1])
                    processedData['name'] = tmp

                if flag in ignoredFlags:
                    continue

                # Resolve the flag's value argument(s) and rescale with Ne.
                if flag == "-Ne":
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                if flag == "-em":
                    tempLine[3] = getUnscaledValue(variables, tempLine[3])
                    tempLine[3] = str(float(4*(float(tempLine[3])*Ne)))

                elif flag == "-eM" or flag == "-g":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float(4*(float(tempLine[1])*Ne)))

                elif flag == "-ema":
                    for i in range(2,len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i] = str(float(4*(float(tempLine[i])*Ne)))

                elif flag == "-eN" or flag == "-n":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float((float(tempLine[1])/Ne)))

                elif flag == "-en":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float((float(tempLine[2])/Ne)))

                elif flag == "-eg":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))

                elif flag == "-es":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])

                elif flag == "-m":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))

                elif flag == "-ma":
                    for i in range(len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i]=str(float(4*(float(tempLine[i])*Ne)))

                elif flag == "-t" or flag == "-r" or flag == "-G":
                    # both <m> <r> <alpha> have same scaling factor
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0] = str(float(4*(float(tempLine[0])*Ne)))

                if flag.startswith('-e'):
                    # all <t>'s are scaled; the event is held back so that all
                    # events can be appended in time order
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0]=str(round(float(tempLine[0]))/(4*Ne))
                    seasons.append([flag] + tempLine)
                else:
                    macs_args.append(flag.strip())
                    for subLine in tempLine:
                        macs_args.append(subLine.strip())
            except IndexError:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                sys.exit()

    if '-n' not in flags:
        # default names are the population numbers 1..npop
        tmp = list(range(1,int(flags['-I'][0][0])+1))
        processedData['name'] = tmp

    # discovery, sample and daf must be given together or not at all.
    if not processedData.get('discovery') or not processedData.get('sample') or not processedData.get('daf'):
        if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
            debugPrint(2, "discovery, sample, and daf are all missing")
        else:
            print("discovery, sample, or daf is missing")
            # sys.exit() instead of quit(): quit is only injected by the site
            # module and is not guaranteed to exist.
            sys.exit()

    debugPrint(2, "Adding events data back to flag pool")
    # Sort the events numerically by time, then turn the times back into text.
    for i in range(len(seasons)):
        seasons[i][1] = float(seasons[i][1])
    seasons = sorted(seasons, key=itemgetter(1))
    for i in range(len(seasons)):
        seasons[i][1] = str(seasons[i][1])
    for season in seasons:
        macs_args.extend(season)

    processedData["macs_args"] = macs_args
    return processedData
예제 #9
0
def set_seed(seed_option):
    """
    Seed the module-level ``random`` generator.

    ``seed_option`` is converted with ``int()``. Zero re-seeds from the
    default source (``random.seed()`` without an argument), a positive value
    is used as the seed, and a negative value leaves the generator untouched.
    """
    seed_value = int(seed_option)
    if seed_value < 0:
        return
    if seed_value:
        random.seed(seed_value)
    else:
        random.seed()