Example #1
def populateFlags(variables, modelData):
    '''
    This fills a dictionary whose keys are the flags and whose values are a
    list of the arguments from every time (in order) the flag is used
    '''
    debugPrint(2, "Starting: populateFlags ")
    flags = OrderedDict()
    orderedEvents = []
    lowTime = False
    # loops through all items in data
    for i, line in enumerate(modelData):
        lineSplit = line.split(',')

        flag = lineSplit[0]
        # if flag starts with -e it will be an event flag, thus, the order must be preserved
        if flag.startswith("-e") and "_" in flag:
            if len(lineSplit)>1:
                # stripping any stray whitespace
                lineSplit[1] = lineSplit[1].strip()
                if int(flag.split("_")[1]) > 1:
                    lowTime = modelData[i-1].split(',')[1]
                    if lineSplit[1] in variables:
                        lineSplit[1] = getUnscaledValue(variables, lineSplit[1], lowTime)
                else:
                    if lineSplit[1] in variables:
                        lineSplit[1] = getUnscaledValue(variables, lineSplit[1])
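                # `times` is assumed to be a list defined elsewhere in this module
                # (not shown in this excerpt); it records event times already used.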
                if lineSplit[1] not in times and "inst" not in lineSplit[1]:
                    if lowTime:
                        lineSplit[1] = getUnscaledValue(variables, lineSplit[1], lowTime)
                    else:
                        lineSplit[1] = getUnscaledValue(variables, lineSplit[1])
                else:
                    Ne = findScaleValue(flags, variables)
                    lastTime = getUnscaledValue(variables, modelData[i-1].split(',')[1])
                    tempTime = str(float(lastTime) + 1)
                    while tempTime in times:
                        tempTime = str(float(tempTime) + 1)
                    lineSplit[1] = tempTime
                times.append(lineSplit[1])
                flag = lineSplit[0].split("_")[0]
        if flag == "-t":
            flag.replace("Nachman","2.5e-8").replace("Other",'1.65e-8')

        if flag == "-F":
            my_file = Path(lineSplit[1].strip())
            if not my_file.is_file():
                raise ValueError("The file for -F is not in your file path.")

        if flag not in flags.keys():
            flags[flag] = [[x.strip() for x in lineSplit[1:] if x]]
        else:
            flags[flag].append([x.strip() for x in lineSplit[1:] if x])

        modelData[i] = ",".join(lineSplit)

    return flags
Example #2
def find_scale_value(flags):
    # used for scaling
    debugPrint(2, "Finding scaling value")
    ne = 10000
    if "-Ne" not in flags.keys():
        if "-n" in flags.keys():
            ne = float(flags["-n"][0][1])
    else:
        ne = float(flags['-Ne'][0][0])
    debugPrint(2, "Scaling factor found: {0}".format(ne))
    return ne
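
A minimal sketch of the lookup order above, using hand-written flag dictionaries (not from the project): "-Ne" wins, otherwise the second field of the first "-n" entry, otherwise the 10000 default.

# Hypothetical flag dicts shaped like the populate_flags output
# (each flag maps to a list of argument lists); values are illustrative only.
flags_a = {"-Ne": [["20000"]]}                  # -> ne = 20000.0   (from -Ne)
flags_b = {"-n": [["1", "1.0"], ["2", "0.9"]]}  # -> ne = 1.0       (flags["-n"][0][1])
flags_c = {"-t": [["0.001"]]}                   # -> ne = 10000     (default)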
Example #3
def findScaleValue(flags = {}, variables = {}):
    # used for scaling
    debugPrint(2, "Finding scaling value")
    Ne = 10000
    if "-Ne" not in flags.keys():
        if "-n" in flags.keys():
            Ne = float(getUnscaledValue(variables, flags["-n"][0][1]))
    else:
        Ne = float(getUnscaledValue(variables, flags['-Ne'][0][0]))
    debugPrint(2, "Scaling factor found: {0}".format(Ne))
    return Ne
Example #4
def populate_macs_args(macs_args, scaled_flags):
    for flag in scaled_flags.keys():
        # Looping through every key
        debugPrint(3, "FLAG:  {}: {}".format(flag, scaled_flags[flag]))
        for argument_raw in scaled_flags[flag]:
            # Looping through every raw argument
            try:
                debugPrint(3, flag + ": " + str(argument_raw))

                macs_args.append(flag.strip())
                for sub_line in argument_raw:
                    macs_args.append(sub_line.strip())
            except IndexError:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                sys.exit()
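
A small sketch of the flattening performed above, assuming the nested flag dictionary produced upstream; each flag is re-emitted once per stored occurrence, followed by its stripped arguments.

# Hypothetical input; the values mirror the macs_args layout shown in run_macs below.
scaled_flags = {"-n": [["1", "1.0"], ["2", "0.899072251249"]],
                "-t": [["0.00444997180488"]]}
macs_args = ["./bin/macs", "166.0", "1000000"]
# populate_macs_args(macs_args, scaled_flags) appends, in flag order:
#   '-n', '1', '1.0', '-n', '2', '0.899072251249', '-t', '0.00444997180488'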
Example #5
def scale_flags(flags_raw):
    # find scale value
    ne = find_scale_value(flags_raw)

    flags = {}

    for flag in flags_raw.keys():
        # Looping through every key
        debugPrint(3, "FLAG:  {}: {}".format(flag, flags_raw[flag]))
        arguments = []
        for argument_raw in flags_raw[flag]:
            argument = generate_argument(argument_raw, flag, ne)

            arguments.append(argument)
        flags[flag] = arguments

    return flags
Example #6
File: seqInfo.py  Project: ko43/SimPrily
def create_sequences(processedData, args):

    '''
    Parameters:
    processedData: dictionary of processed model data; uses the 'discovery',
        'sample', 'daf', 'macs_args', 'I', and 'name' entries
    args: dictionary of command-line arguments (e.g. maps the SNP file to
        array_template)

    Returns: a list of SeqInfo instances, e.g. [d1, s1]
    '''
    
    debugPrint(2,"Running create_sequences:")
    sequences = []
    if 'discovery' in processedData and 'sample' in processedData and 'daf' in processedData:
        ### Initialize all discovery type sequence data
        for i, ind in enumerate(processedData.get('discovery')):
            tot_index = processedData['macs_args'].index("-I") + 1 + ind
            tot = int(processedData['macs_args'][tot_index]) # total number of individuals used in simulation
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'discovery')

            seq.genotyped = processedData['I'][ind - 1]
            seq.panel  = seq.tot - seq.genotyped
            sequences.append(seq)

        ### Initialize all sample type sequence data
        for i, ind in enumerate(processedData.get('sample')):
            tot = processedData['I'][ind-1]
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'sample')

            seq.panel = seq.tot
            seq.genotyped = seq.tot
            sequences.append(seq)
    else:
        for ind in range(int(processedData['macs_args'][4])):
            tot = processedData['I'][ind-1]
            name = processedData.get('name').pop(0)
            seq = SeqInfo(name, tot, seq_type = 'discovery')

          #  seq.panel = seq.tot #pretty sure it can be deleted
            seq.genotyped = seq.tot
            sequences.append(seq)
    return sequences
Example #7
File: run_sim.py  Project: ko43/SimPrily
def run_macs(macs_args, sequences):
    '''
    Parameters: sequences and macs_args
    macs_args:
    ['./bin/macs', '166.0', '1000000', '-I', '2', '26', '140',
     '-t', '0.00444997180488', '-s', '1231', '-r', '0.00177998872195', 
     '-h', '1e5', '-n', '1', '1.0', '-n', '2', '0.899072251249', '-en',
    '0.0118708617304', '1', '0.224720524949', '-ej', '0.0143090794261',
     '2', '1', '-R', 'genetic_map_b37/genetic_map_GRCh37_chr1.txt.macshs']

    sequences: [A, B], a list of SeqInfo instances

    Returns: [sequences, position]; sequences is the same list of instances,
    position is a list of site positions as strings (length 10752 in one run),
    increasing from '0.000178136752' to '0.99995896'
    '''
    debugPrint(2, "running macs simulation:")
    position = []
    null = open(os.devnull, 'w')
    proc = subprocess.Popen(macs_args, stdout=subprocess.PIPE, stderr=null)
    #debugPrint(3,"macs command: {}".format(" ".join(macs_args)))
    while True:
        line = proc.stdout.readline()
        line = line.rstrip()
        # line = line.decode("utf-8")
        if line != b'':
            if line.startswith(b"SITE:"):
                columns = line.split(b'\t')
                site_alleles = columns[4].strip()
                position.append(columns[2])
                seq_loc = 0
                for seq in sequences:
                    seq.bits.extend(site_alleles[seq_loc:seq_loc + seq.tot])
                    seq_loc += seq.tot
            # elif not line.isnum():
            #     debugPrint(3,line)
        else:
            break
    #print("THIS IS SEQUENCES zero: " + str(sequences[0].__dict__))
    #print("THIS IS SEQUENCES one: " + str(sequences[1].__dict__))
#   print("THIS IS position:  " + str(position))
#   debugPrint(2,"Finished macs simulation")
    return [sequences, position]
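
A minimal sketch of how one MaCS "SITE:" output line is consumed by the loop above: the line is tab-separated, the position sits in column 2 and the allele string in column 4 (the sample line here is fabricated).

line = b"SITE:\t7\t0.000178136752\t0.0\t0110"  # fabricated example line
columns = line.split(b'\t')
position_value = columns[2]          # b'0.000178136752' -> appended to position
site_alleles = columns[4].strip()    # b'0110' -> sliced per sequence by seq.tot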
Example #8
def processArgs(arguments):
    parser = argparse.ArgumentParser()
    parser.add_argument("-p",
                        "--param",
                        help="REQUIRED!: The location of the parameter file",
                        required=True)
    parser.add_argument("-m",
                        "--model",
                        help="REQUIRED!: The location of the model file",
                        required=True)
    parser.add_argument("-o",
                        "--out",
                        help="REQUIRED!: The location of the output dir",
                        required=True)
    parser.add_argument("-g",
                        "--genome",
                        help="The location of the genome file",
                        required=True)
    parser.add_argument("-a",
                        "--array",
                        help="The location of the array file",
                        required=True)
    parser.add_argument("-v",
                        help="increase output verbosity",
                        action="count",
                        default=0)
    tmpArgs = parser.parse_args()

    args = {
        'param file': tmpArgs.param,
        'model file': tmpArgs.model,
        'genome file': tmpArgs.genome,
        'array file': tmpArgs.array,
        'output': tmpArgs.out
    }

    global_vars.init()
    global_vars.verbos = tmpArgs.v
    debugPrint(1, "Debug on: Level " + str(global_vars.verbos))

    return args
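
A hypothetical invocation matching the parser above (the script name and file paths are placeholders, not from the project):

# python simprily.py -p params.txt -m model.csv -g genome_prefix -a array_prefix -o out_dir -vv
# "-v" uses action="count", so "-vv" sets global_vars.verbos to 2.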
Example #9
def populate_flags(model_data_raw):
    """
    This fills a dictionary whose keys are the flags and whose values are a
    list of the arguments from every time (in order) the flag is used.

    :param model_data_raw:
    :return:
    """
    debugPrint(2, "Starting: populateFlags ")
    flags = OrderedDict()
    # loops through all items in model_data_raw
    for i, argument in enumerate(model_data_raw):
        arg_split = argument.split(',')

        flag = arg_split[0]
        if flag in flags.keys():
            flags[flag].append([x.strip() for x in arg_split[1:] if x])
        else:
            flags[flag] = [[x.strip() for x in arg_split[1:] if x]]

    return flags
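
A small sketch of the mapping this produces, with a hand-written list of model lines (values are illustrative only):

model_data_raw = ["-macs,./bin/macs", "-I,2,26,140", "-n,1,1.0", "-n,2,0.9"]
# populate_flags(model_data_raw) returns an OrderedDict shaped like:
#   OrderedDict([('-macs', [['./bin/macs']]),
#                ('-I',    [['2', '26', '140']]),
#                ('-n',    [['1', '1.0'], ['2', '0.9']])])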
Example #10
def main(args):

    chr_number = 1
    # Use dictionary keys instead of index keys for args
    args = process_args(args)
    job = str(args['job'])  # must be a number
    print('JOB {}'.format(job))

    prof_option = args['profile']

    sim_option = args['sim option']

    path = args['path']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = process_input_files(args['param file'], args['model file'], args)

    using_pseudo_array = True
    if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
        using_pseudo_array = False

    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)


    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData)
    names = [seq.name for seq in sequences]

    n_d = sum([1 for seq in sequences if seq.type == 'discovery'])

    debugPrint(1,'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(1,'{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))

    total = sum([seq.tot for seq in sequences])
    debugPrint(1, 'total samples: {}'.format(sum([seq.genotyped for seq in sequences if seq.type=='discovery'] + [seq.tot for seq in sequences if seq.type=='sample'])))

    ### Define simulation size
    length = processedData['length']
    debugPrint(1, 'Perform simulation and get sequences')
    pedmap = args['pedmap']
    germline = args['germline']

    ##########################################################################
    ################## Perform simulation and get sequences ##################
    ##########################################################################

    ### Flag to check if the simulation works
    SNPs_exceed_available_sites = True
    while SNPs_exceed_available_sites:

        # add genetic map to macs_args list
        macs_args = []
        macs_args = processedData['macs_args']

        if sim_option == 'macs':
            ### Run macs and make bitarray
            profile(prof_option, path, job, "start_run_macs")
            [sequences,position] = run_macs(macs_args, sequences)
            profile(prof_option, path, job, "end_run_macs")
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions(position, nbss, length)

        elif sim_option == 'macs_file':
            ### Using a static sim output rather than generating from seed
            seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
            set_seq_bits(sequences, seq_alleles)
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions_old(seq_alleles, nbss, length)

        profile(prof_option, path, job, "start_set_discovery_bits")
        set_discovery_bits(sequences)
        profile(prof_option, path, job, "end_set_discovery_bits")

        debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))

        assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)

        ##########################################################################
        ### Create pseudo array according to ascertainment scheme and template ###
        ##########################################################################

        if using_pseudo_array:
            SNPs = get_SNP_sites(args['SNP file'])
            debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))

            profile(prof_option, path, job, "start_set_panel_bits")
            asc_panel_bits = set_panel_bits(nbss, sequences)

            profile(prof_option, path, job, "end_set_panel_bits")
            debugPrint(1,'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))

            ### Get pseudo array sites
            debugPrint(2,'Making pseudo array')
            profile(prof_option, path, job, "start_pseudo_array_bits")

            [pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(asc_panel_bits, processedData['daf'], sim_positions, SNPs)
            profile(prof_option, path, job, "end_pseudo_array_bits")
            nb_avail_sites = len(avail_sites)
            SNPs_exceed_available_sites = ( len(SNPs) >= nb_avail_sites )
        else:
            SNPs = []
            SNPs_exceed_available_sites = False

    if using_pseudo_array:
        profile(prof_option, path, job, "start_set_asc_bits")
        set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
        profile(prof_option, path, job, "end_set_asc_bits")

    debugPrint(1, 'Calculating summary statistics')
    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################
    res, head = [], []

    ### Calculate summary stats from genomes
    if nbss > 0:   # Simulations must contain at least one segregating site
        profile(prof_option, path, job, "start_store_segregating_site_stats")
        stat_tools.store_segregating_site_stats(sequences, res, head)
        profile(prof_option, path, job, "end_store_segregating_site_stats")
        profile(prof_option, path, job, "start_store_pairwise_FSTs")
        stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
        profile(prof_option, path, job, "end_store_pairwise_FSTs")

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        if nbss_asc > 0:
            profile(prof_option, path, job, "start_store_array_segregating_site_stats")
            stat_tools.store_array_segregating_site_stats(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_segregating_site_stats")
            profile(prof_option, path, job, "start_store_array_FSTs")
            stat_tools.store_array_FSTs(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_FSTs")

        debugPrint(2,'Making ped and map files')
        ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
        map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
        out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))

        if os.path.isfile(out_file_name + '.match'):  # Maybe remove if statement
            os.remove(ped_file_name)
            os.remove(map_file_name)

        if pedmap or germline:  # using_pseudo_array is already True in this branch
            profile(prof_option, path, job, "start_make_ped_file")
            make_ped_file(ped_file_name, sequences)
            profile(prof_option, path, job, "end_make_ped_file")
            profile(prof_option, path, job, "start_make_map_file")
            make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
            profile(prof_option, path, job, "end_make_map_file")

        ### Use Germline to find IBD on pseudo array ped and map files
        do_i_run_germline = int(args['germline'])

        debugPrint(1,'run germline? {}'.format("True" if do_i_run_germline else "False"))

        if do_i_run_germline:
            ########################### <CHANGE THIS LATER> ###########################
            ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
            profile(prof_option, path, job, "start_run_germline")
            # germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m = 3000000)
            profile(prof_option, path, job, "end_run_germline")
            germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
            ########################### </CHANGE THIS LATER> ##########################

        ### Get IBD stats from Germline output
        if os.path.isfile(out_file_name + '.match'):
            print('Reading Germline IBD output')
            profile(prof_option, path, job, "start_process_germline_file")
            [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
            profile(prof_option, path, job, "end_process_germline_file")

            print('Calculating summary stats')
            stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
            profile(prof_option, path, job, "start_store_IBD_stats")
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
            profile(prof_option, path, job, "end_store_IBD_stats")

        debugPrint(1,'finished calculating ss')

    write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)

    print('')
    print('#########################')
    print('### PROGRAM COMPLETED ###')
    print('#########################')
    print('')

    profile(prof_option, path, job, "COMPLETE")
Example #11
def process_input_files(param_file, model_file, args):
    """

    :param param_file:
    :param model_file:
    :param args:
    :return:
    """
    """
    This is the function that takes links to two files and outputs a dictionary (processedData)
    With all the (useful) data in the two files
    """
    debugPrint(2, "Starting processInputFiles")

    model_data_raw = read_model_file(model_file)
    debugPrint(2, "Finished reading " + str(model_file))
    debugPrint(3, "Raw input data into make_args", model_data_raw)

    model_params_dict_raw = read_params_file(param_file)
    debugPrint(2, "Finished reading " + str(param_file))
    debugPrint(3, "Raw Output for modelParamsDict", model_params_dict_raw)

    # defining and replacing the variables from the param file
    model_params_variables = define_priors(model_params_dict_raw,
                                           model_data_raw)
    model_data = substitute_variables(model_params_variables, model_data_raw)

    flags = populate_flags(model_data)

    macs_args = generate_macs_args(flags)

    # find and add sizes to macs_args
    sizes = populate_sizes(flags)

    total = float(sum(sizes))
    macs_args.insert(1, str(total))
    sizes_str = map(str, sizes)
    if sys.version_info > (3, 0):
        sizes_str = list(sizes_str)
    macs_args.extend(sizes_str)

    debugPrint(3, "Processing flags in for macs_args")

    # take out ignored flags
    flags = remove_ignored_flags(flags)

    # pull out processed data (type 1 flags)
    processed_data = process_type1_flags(flags)
    flags = filter_out_type1(flags)

    # scale values if needed
    scaled_flags = scale_flags(flags)

    # pull out seed
    seed = scaled_flags.get("-s", None)
    if seed:
        processed_data['seed'] = seed

    # seasons is all the time based events
    seasons = add_events_to_seasons(scaled_flags)
    macs_args_flags = filter_out_events(scaled_flags)

    # add to macs_args
    # TODO: This needs to be done explicitly
    populate_macs_args(macs_args, macs_args_flags)

    pop_names = gather_pop_names(model_data_raw)
    processed_data['name'] = pop_names

    if not processed_data.get('discovery') or not processed_data.get(
            'sample') or not processed_data.get('daf'):
        if not processed_data.get('discovery') and not processed_data.get(
                'sample') and not processed_data.get('daf'):
            debugPrint(2, "discovery, sample, and daf are all missing")
        else:
            print("discovery, sample, or daf is missing")
            quit()

    debugPrint(2, "Adding events data back to flag pool")
    for i in range(len(seasons)):
        seasons[i][1] = float(seasons[i][1])
    seasons = sorted(seasons, key=itemgetter(1))
    for i in range(len(seasons)):
        seasons[i][1] = str(seasons[i][1])
    for season in seasons:
        macs_args.extend(season)

    processed_data["macs_args"] = macs_args

    debugPrint(3, "printing model_params_variables:", model_params_variables)

    processed_data['param_dict'] = model_params_variables

    if 'genetic map' in args and args['genetic map']:
        processed_data['macs_args'].extend(['-R', args['genetic map']])

    return processed_data
Example #12
def pseudo_array(asc_panel, daf, pos, snps):
    Tasc_panel = list(zip(*asc_panel))  # materialize so len() and indexing work on Python 3
    print('number of sites in Tasc_panel:', len(Tasc_panel))
    print('number of chromosomes in Tasc_panel:', len(Tasc_panel[0]))

    #######Array with the available sites given the frequency cut off
    ##array with the frequency of all the simulated snps
    sites_freq = []
    ##array with the available sites, that pass the frequency cut-off
    avail_sites = []  ##this one has the positions of the snps
    index_avail_sites = []  ##this one has the indexes of the snps
    for n in range(len(Tasc_panel)):
        freq_site = float(Tasc_panel[n][0:len(asc_panel)].count('1')) / float(len(asc_panel))
        if freq_site >= daf and freq_site <= 1 - daf:
            sites_freq.append(freq_site)
            avail_sites.append(pos[n])
            index_avail_sites.append(n)
    nb_avail_sites = len(avail_sites)
    if (len(avail_sites) == len(snps)):
        debugPrint(3,"number of avail_sites is equal to the number of Array snps")
        pos_asc = []
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
        flag_nb_asc_snps = 1
    elif (len(avail_sites) > len(snps)):
        debugPrint(3,"number of avail_sites greater than number of Array snps")
        pos_asc = [None] * int(len(snps))  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(len(snps)):  # each snp on the snp array on a chromosome
            ## y is the position of the SNPs in the array
            y = snps[i]
            ##find the closest SNP in the array
            closestleft = find2(avail_sites, y)
            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < len(avail_sites)):  ##avoid duplicates
                closestleft = closestleft + 1  ##move one position to the right
                pos_asc[i] = closestleft
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < len(avail_sites)):
                closestleft = pos_asc[i - 1] + 1
                pos_asc[i] = closestleft
            else:
                pos_asc[i] = closestleft
                ###if I have duplicates at this point, it means that there were not any more snps to choose from
                ###closestleft+1 or pos_asc[i-1]+1 == len(avail_sites)
        #####smoothing
        ##last index of the pos_asc
        i = len(pos_asc) - 1
        ##check if there is another position that might work better
        for j in range(0, i):
            if (j == i - 1 and pos_asc[j] + 1 < pos_asc[j + 1] and pos_asc[j] < (len(avail_sites) - 1) and (
                        j + 1) < len(avail_sites)):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = (list(set(pos_asc)))
        pos_asc.sort()

        nbss_asc = len(pos_asc)

        if (len(snps) == nbss_asc):
            flag_nb_asc_snps = 1
            debugPrint(3,'Number of asc snps equal to nb array snps')

        if (len(snps) != len(pos_asc)):
            flag_nb_asc_snps = 0
            debugPrint(3,'Number of asc snps not equal to nb array snps')
            diff = int(len(snps) - len(pos_asc))
            for m in range(1, diff + 1):
                pos_asc2 = []
                pos_asc2 = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, len(snps))  # nb_array_snps == len(snps)
                pos_asc = pos_asc2
                nbss_asc = len(pos_asc)
                if nbss_asc == len(snps):
                    flag_nb_asc_snps = 1
                    break
                else:
                    flag_nb_asc_snps = 0

        if (flag_nb_asc_snps == 0):  ##it means that the 1st index in pos_asc is 0; and the last is len(avail_sites)-1
            diff = int(len(snps) - len(pos_asc))
            while (len(pos_asc) != len(snps)):
                rand_numb = random.randint(0, len(avail_sites) - 1)
                # print( 'random',rand_numb)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)
    print( 'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
Example #13
def pseudo_array_bits(asc_panel_bits, daf, pos, snps):
    '''
    Parameters: 
    asc_panel_bits: bitarray
    daf: float (0.0264139586625)
    pos: list of floats in ascending order
    snps: list of ints

    Returns: pos_asc: list of ints (2481-2679)
    nbss_asc: 200
    index_avail_sites: 
    avail_sites: list of floats

    Errors: 
    - asc_panel_bits.length() must be evenly divisible by len(pos)
    - daf cannot be negative or greater than 1
    '''
    n = asc_panel_bits.length()/len(pos)
    n = int(n)
    #######Array with the available sites given the frequency cut off
    ##array with the frequency of all the simulated snps
    sites_freq = []
    ##array with the available sites, that pass the frequency cut-off
    avail_sites = []  ##this one has the positions of the snps
    index_avail_sites = []  ##this one has the indexes of the snps

    i = 0
    for site in range(0, asc_panel_bits.length(), int(n)):
        freq_site = float(asc_panel_bits[site:site + n].count(1) / float(n))
        if freq_site >= daf and freq_site <= 1 - daf:
            sites_freq.append(freq_site)
            avail_sites.append(pos[i])
            index_avail_sites.append(i)
        i=i+1
    nb_avail_sites = len(avail_sites)
    if (len(avail_sites) < len(snps)):
        print( "Error: There are not enough simulated sites in the discovery panel with allele frequency >=",daf,"and <=",1 - daf)
        sys.exit()

    if (len(avail_sites) == len(snps)):
        #debugPrint(3,"Number of avail_sites is equal to the number of Array snps")
        pos_asc = []
        pos_asc = index_avail_sites
        nbss_asc = len(pos_asc)
        flag_nb_asc_snps = 1

    elif (len(avail_sites) > len(snps)):
        #debugPrint(3,"Number of avail_sites greater than number of Array snps")
        pos_asc = [None] * int(len(snps))  ##indexes of the SNPs that pass the frequency cut-off and position
        for i in range(len(snps)):  # each snp on the snp array on a chromosome
            ## y is the position of the SNPs in the array
            y = snps[i]
            ##find the closest SNP in the array
            closestleft = find2(avail_sites, y)
            if (i > 0 and pos_asc[i - 1] == closestleft and closestleft + 1 < len(avail_sites)):  ##avoid duplicates
                closestleft = closestleft + 1  ##move one position to the right
                pos_asc[i] = closestleft
            elif (i > 0 and pos_asc[i - 1] > closestleft and pos_asc[i - 1] + 1 < len(avail_sites)):
                closestleft = pos_asc[i - 1] + 1
                pos_asc[i] = closestleft
            else:
                pos_asc[i] = closestleft
                ###if I have duplicates at this point, it means that there were not any more snps to choose from
                ###closestleft+1 or pos_asc[i-1]+1 == len(avail_sites)

        #####smoothing
        ##last index of the pos_asc
        i = len(pos_asc) - 1

        ##check if there is another position that might work better
        for j in range(0, i):
            if (j == i - 1 and pos_asc[j] + 1 < pos_asc[j + 1] and pos_asc[j] < (len(avail_sites) - 1) and (
                        j + 1) < len(avail_sites)):
                d1 = abs(snps[j] - avail_sites[pos_asc[j]])
                d2 = abs(snps[j] - avail_sites[pos_asc[j] + 1])
                if (d2 < d1):
                    pos_asc[j] = pos_asc[j] + 1

        ##removes duplicates
        pos_asc = (list(set(pos_asc)))
        pos_asc.sort()

        nbss_asc = len(pos_asc)
        nb_array_snps = len(snps)

        if (len(snps) == nbss_asc):
            flag_nb_asc_snps = 1
            debugPrint(3,'nb of asc snps equal to nb array snps')

        if (len(snps) != len(pos_asc)):
            flag_nb_asc_snps = 0
           #debugPrint(3,'nb of asc snps not equal to nb array snps')
            diff = int(len(snps) - len(pos_asc))
            for m in range(1, diff + 1):
                pos_asc2 = []
                pos_asc2 = add_snps(avail_sites, nb_avail_sites, pos_asc, nbss_asc, nb_array_snps)
                pos_asc = pos_asc2
                nbss_asc = len(pos_asc)
                if nbss_asc == len(snps):
                    flag_nb_asc_snps = 1
                    break
                else:
                    flag_nb_asc_snps = 0

        if (flag_nb_asc_snps == 0):  ##it means that the 1st index in pos_asc is 0; and the last is len(avail_sites)-1
            diff = int(len(snps) - len(pos_asc))
            while (len(pos_asc) != len(snps)):
                rand_numb = random.randint(0, len(avail_sites) - 1)
                # print( 'random',rand_numb)
                if rand_numb not in pos_asc:
                    pos_asc.append(rand_numb)
            pos_asc.sort()
            nbss_asc = len(pos_asc)
    #debugPrint(2,'finished making pseudo array')
    return pos_asc, nbss_asc, index_avail_sites, avail_sites
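
A minimal sketch of the per-site frequency cut-off applied above: with n panel chromosomes per site, a site is kept only when its derived-allele frequency lies in [daf, 1 - daf]. The numbers are illustrative, not project values.

daf = 0.05          # illustrative cut-off
n = 20              # panel chromosomes per site
derived_count = 3   # '1' bits counted at one site
freq_site = derived_count / float(n)   # 0.15
keep = daf <= freq_site <= 1 - daf     # True -> site enters avail_sites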
Example #14
def main(args):
    args = processArgs(args)

    model_file = args['model file']
    param_file = args['param file']
    path = args['output']

    [sim_data_dir, germline_out_dir,
     sim_results_dir] = create_sim_directories(path)

    processedData = processInputFiles(param_file, model_file, args)
    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    using_pseudo_array = True
    if not processedData.get('discovery') and not processedData.get(
            'sample') and not processedData.get('daf'):
        using_pseudo_array = False

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData, args)
    names = [seq.name for seq in sequences]

    n_d = sum([1 for seq in sequences if seq.type == 'discovery'])

    debugPrint(1, 'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(
            1, '{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel,
                                       seq.genotyped))

    total = sum([seq.tot for seq in sequences])
    debugPrint(
        1, 'total samples: {}'.format(
            sum([
                seq.genotyped for seq in sequences if seq.type == 'discovery'
            ] + [seq.tot for seq in sequences if seq.type == 'sample'])))

    ##########################################################################
    ####################### Read Data from tped files ########################
    ##########################################################################

    genome_file = args['genome file']
    job = os.path.basename(genome_file)
    seq_alleles_genome = AllelesReal(str(genome_file) + '.tped')
    set_real_genome_bits(sequences, seq_alleles_genome)
    if using_pseudo_array == True:
        array_file = args['array file']
        job = str(job) + '_' + str(os.path.basename(array_file))
        seq_alleles_array = AllelesReal(str(array_file) + '.tped')
        set_real_array_bits(sequences, seq_alleles_array)

    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################
    res, head = [], []

    ### Calculate summary stats from genomes
    stat_tools.store_segregating_site_stats(sequences, res, head)
    stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        stat_tools.store_array_segregating_site_stats(sequences, res, head)
        stat_tools.store_array_FSTs(sequences, res, head)

        debugPrint(1, 'Make ped and map files')
        ped_file_name = '{0}/{1}.ped'.format(sim_data_dir, job)
        map_file_name = '{0}/{1}.map'.format(sim_data_dir, job)
        out_file_name = '{0}/{1}'.format(germline_out_dir, job)

        ### Use Germline to find IBD on pseudo array ped and map files
        do_i_run_germline = 1  #fix this later

        debugPrint(1, 'run germline? ' + str(do_i_run_germline))
        if (do_i_run_germline == 0):
            ########################### <CHANGE THIS LATER> ###########################
            ### Germline seems to be outputting in the wrong unit - so I am putting the min at 3000000 so that it is 3Mb, but should be the default.
            # germline = run_germline(ped_file_name, map_file_name, out_file_name, min_m = 3000000)
            germline = run_germline(ped_file_name,
                                    map_file_name,
                                    out_file_name,
                                    min_m=300)
            ########################### </CHANGE THIS LATER> ##########################

        ### Get IBD stats from Germline output
        if os.path.isfile(out_file_name + '.match'):
            debugPrint(1, 'Reading Germline IBD output')
            [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)

            debugPrint(1, 'Calculating summary stats')
            stats = OrderedDict([('num', len), ('mean', np.mean),
                                 ('med', np.median), ('var', np.var)])
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
            stat_tools.store_IBD_stats(stats,
                                       IBD_pairs,
                                       IBD_dict,
                                       res,
                                       head,
                                       min_val=30)

        # print 'finished calculating ss'

    write_stats_file(sim_results_dir, job, res, head)

    print('')
    print('##########################')
    print('### PROGRAM COMPLETED  ###')
    print('##########################')
    print('')
Example #15
def processInputFiles(paramFile, modelFile, args):
    '''
    This is the function that takes paths to two files and outputs a dictionary (processedData)
    with all the (useful) data in the two files
    '''
    debugPrint(2, "Starting processInputFiles")
    
    modelData = readModelFile(modelFile)
    debugPrint(2, "Finished reading " + str(modelFile))
    debugPrint(3, "Raw input data into make_args", modelData)

    variables = readParamsFile(paramFile)
    debugPrint(2, "Finished reading " + str(paramFile))
    
    debugPrint(3,"Raw Output for variables", variables)



    processedData = processModelData(variables, modelData) # creates the input for macsSwig
    debugPrint(3,"Priting variables:", variables)

    processedData['param_dict'] = variables

    if args['genetic map']:
        processedData['macs_args'].extend(['-R', args['genetic map']])

    return processedData
Example #16
def processModelData(variables, modelData):
    """
    """
    debugPrint(2, "Starting: processModelData")
    processedData = {}
    
    flags = populateFlags(variables, modelData)

    if '-macs_file' in flags:
        macs_args = [flags['-macs_file'][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macsswig' in flags:
        macs_args = [flags['-macsswig'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    elif '-macs' in flags:
        macs_args = [flags['-macs'][0][0], flags['-length'][0][0], "-I", flags['-I'][0][0]]
    sizes = map(int, flags["-I"][0][1:])
    if (sys.version_info > (3, 0)):
        sizes = list(sizes)
    if '-discovery' in flags:
        for discovery_pop_str in flags["-discovery"][0]:
            discovery_pop = int(discovery_pop_str)-1
            if "True" in flags['-random_discovery'][0]:
                sizes[discovery_pop] += random.randint(2, sizes[discovery_pop])
            else:
                sizes[discovery_pop] += sizes[discovery_pop]
    total = float(sum(sizes))
    macs_args.insert(1,str(total))
    sizes_str = map(str, sizes)
    if (sys.version_info > (3, 0)):
        sizes_str = list(sizes_str)
    macs_args.extend(sizes_str)


    # seasons is all the time based events
    seasons = []

    Ne = findScaleValue(flags, variables)
    # processOrderedSeasons(flags, variables)
    debugPrint(3,"Processing flags in for macs_args")
    for flag in flags.keys():
        debugPrint(3,"  {}: {}".format(flag,flags[flag]))

        for tempLine in flags[flag]:
            try:
                # debugPrint(3,flag + ": " + str(tempLine))
                if flag == "-discovery":
                    processedData['discovery'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-sample":
                    processedData['sample'] = [int(s.strip()) for s in tempLine if s]
                    continue
                if flag == "-s":
                    processedData['seed'] = tempLine[0]
                if flag == "-daf":
                    processedData['daf'] = float(getUnscaledValue(variables, tempLine[0]))
                    continue
                if flag == "-length":
                    processedData['length'] = tempLine[0]
                    continue
                if flag == "-macs":
                    processedData['macs'] = tempLine[0]
                    continue
                if flag == "-I":
                    processedData["I"] = [int(s.strip()) for s in tempLine[1:] if s]
                    continue
                if flag == "-macsswig":
                    processedData['macsswig'] = tempLine[0]
                    continue
                if flag == "-n":
                    tmp = processedData.get('name', [])
                    tmp.append(tempLine[1])
                    processedData['name'] = tmp
                
                #----------------------- For Added Arguments from Model_CSV
                ignoredFlags = ["-germline",
                                "-array",
                                "-nonrandom_discovery",
                                "-random_discovery",
                                "-pedmap"]

                if flag in ignoredFlags:
                    continue

                if flag == "-Ne":
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                if flag == "-em":
                    tempLine[3] = getUnscaledValue(variables, tempLine[3])
                    tempLine[3] = str(float(4*(float(tempLine[3])*Ne)))
                
                elif flag == "-eM" or flag == "-g":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float(4*(float(tempLine[1])*Ne)))

                elif flag == "-ema":
                    for i in range(2,len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i] = str(float(4*(float(tempLine[i])*Ne)))

                elif flag == "-eN" or flag == "-n":
                    tempLine[1] = getUnscaledValue(variables, tempLine[1])
                    tempLine[1] = str(float((float(tempLine[1])/Ne)))

                elif flag == "-en":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float((float(tempLine[2])/Ne)))

                elif flag == "-eg":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))

                elif flag == "-es":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])

                elif flag == "-m":
                    tempLine[2] = getUnscaledValue(variables, tempLine[2])
                    tempLine[2] = str(float(4*(float(tempLine[2])*Ne)))

                elif flag == "-ma":
                    for i in range(len(tempLine)):
                        tempLine[i] = getUnscaledValue(variables, tempLine[i])
                        tempLine[i]=str(float(4*(float(tempLine[i])*Ne)))

                elif flag == "-t" or flag == "-r" or flag == "-G":
                    # both <m> <r> <alpha> have same scaling factor
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0] = str(float(4*(float(tempLine[0])*Ne)))

                if flag.startswith('-e'):
                    # all <t>'s are scaled
                    tempLine[0] = getUnscaledValue(variables, tempLine[0])
                    tempLine[0] = str(round(float(tempLine[0]))/(4*Ne))
                    seasons.append([flag] + tempLine)
                else:
                    macs_args.append(flag.strip())
                    for subLine in tempLine:
                        macs_args.append(subLine.strip())
            except IndexError as e:
                print("There was an index error!\nThis most likely means your input file has a malformed flag.")
                print("Try running with -vv argument for last flag ran")
                sys.exit()

    if '-n' not in flags:
        tmp = list(range(1,int(flags['-I'][0][0])+1))
        processedData['name'] = tmp

    if not processedData.get('discovery') or not processedData.get('sample') or not processedData.get('daf'):
        if not processedData.get('discovery') and not processedData.get('sample') and not processedData.get('daf'):
            debugPrint(2, "discovery, sample, and daf are all missing")
        else:
            print("discovery, sample, or daf is missing")
            quit()
            

    debugPrint(2, "Adding events data back to flag pool")
    for i in range(len(seasons)):
        seasons[i][1] = float(seasons[i][1])
    seasons = sorted(seasons, key=itemgetter(1))
    for i in range(len(seasons)):
        seasons[i][1] = str(seasons[i][1])
    for season in seasons:
        macs_args.extend(season)

    processedData["macs_args"] = macs_args
    return processedData
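
A short sketch of the scaling arithmetic used above, with illustrative numbers: rate-like flags (-t, -r, -m, -em, ...) are multiplied by 4*Ne, population sizes (-n, -eN, -en) are divided by Ne, and event times (flags starting with -e) are divided by 4*Ne.

Ne = 10000.0
mu = 1.65e-8                        # an unscaled per-site rate from the model file
theta = 4 * mu * Ne                 # 0.00066   (what -t receives)
size = 5000.0                       # an unscaled population size
scaled_size = size / Ne             # 0.5       (what -n / -eN / -en receive)
t_event = 1000.0                    # an unscaled event time
scaled_time = t_event / (4 * Ne)    # 0.025     (what -e* event times receive)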