Пример #1
0
 def test_alleles_bitarray(self):
     """Check the first element returned by ``make_bitarray()``.

     Loads ``tests/test_data/sites.txt`` through ``AllelesMacsFile`` and
     compares element 0 of ``make_bitarray()`` with a stored bit pattern.
     """
     # Lift unittest's diff-length cap so a mismatch is reported in full.
     self.maxDiff = None
     macs_file = AllelesMacsFile('tests/test_data/sites.txt')
     actual_bits = macs_file.make_bitarray()[0]
     reference_bits = bitarray(
         '0000000000000010000000100100001000000000000000000000000000110100000000010000100000000100000000000000000000001000100000000000000000000000010000000000000100000000110000111101111011110011110000100001001111000011110111101100001111000010000100110000001001000000000000111100001000010000001111000010000100001100001111011110111111000011110111101111110000111101111011110011110000100001000000111100001000010000110000111101111011110011110000100001000011000011110111101111001111000010000100000011110000100001000011000011110111101111000000001000000000000011110000100001000011000011110111101111010000110101001010000000000100000000000000111100001000010000110000111101111011110000000000000100000011000011110111101111010000000000000010000000000000001000000011000011010111101100'
     )
     self.assertEqual(actual_bits, reference_bits)
Пример #2
0
 def test_seq_bitarray(self):
     """Check ``make_bitarray_seq`` for the argument pairs (0, 10) and (10, 20).

     Both calls are made on one ``AllelesMacsFile`` built from
     ``tests/test_data/sites.txt``; each result is compared with its own
     stored bit pattern.
     """
     # Lift unittest's diff-length cap so a mismatch is reported in full.
     self.maxDiff = None
     macs_file = AllelesMacsFile('tests/test_data/sites.txt')
     first_bits = macs_file.make_bitarray_seq(0, 10)
     second_bits = macs_file.make_bitarray_seq(10, 20)
     expected_first = bitarray('00000000000010010000000000000001000000000000010000000000001000000000000000000000110000111100111100001100001111001111000000000010010011110000001111000011000011111100001111110000111100111100000011110000110000111100111100001100001111001111000000111100001100001111000000001000111100001100001111010000110100000001000011110000110000111100000000001100001111010000000000000000001100001101')
     expected_second = bitarray('00001000001000000000000000001101000010000000000000001000000000000001000100000000011110111110000100110111101100100001001100000000001000010000100001000001111011110111101111011110111110000100001000010000011110111110000100000111101111100001000010000100000111101111000000000010000100000111101111010010100000000000001000010000011110111100010000000111101111000000100000100000000111101100')
     self.assertEqual(first_bits, expected_first)
     self.assertEqual(second_bits, expected_second)
Пример #3
0
def main(args):
    """Run one simulation job end to end and write its summary statistics.

    The steps, in order:

    1. Normalise ``args`` with ``process_args`` and create the output
       directories under ``args['path']``.
    2. Read the parameter and model files and build the sequence objects.
    3. Obtain simulated sequence bits, either by running MaCS
       (``sim option == 'macs'``) or by loading a static MaCS output file
       (``sim option == 'macs_file'``).  When a pseudo array is used, this
       step is repeated for as long as the number of template SNPs is
       greater than or equal to the number of available sites.
    4. Compute summary statistics on the simulated genomes and, when a
       pseudo array is used, on the ascertained SNPs.
    5. When a pseudo array is used: optionally write ped/map files, run
       Germline, and add IBD statistics if a ``.match`` file exists.
    6. Write the results file via ``write_sim_results_file``.

    Args:
        args: Raw arguments, passed to ``process_args``.  The processed
            result is read with the keys ``'job'``, ``'profile'``,
            ``'sim option'``, ``'path'``, ``'param file'``, ``'model file'``,
            ``'pedmap'``, ``'germline'`` and (pseudo array only)
            ``'SNP file'``.

    Raises:
        ValueError: If ``sim option`` is neither ``'macs'`` nor
            ``'macs_file'``.
        AssertionError: If the simulation yields 10 or fewer sites.
    """
    chr_number = 1
    # Use dictionary keys instead of index keys for args
    args = process_args(args)
    job = str(args['job'])  # must be a number
    print('JOB {}'.format(job))

    prof_option = args['profile']
    sim_option = args['sim option']
    path = args['path']
    [sim_data_dir, germline_out_dir, sim_results_dir] = create_sim_directories(path)

    processedData = process_input_files(args['param file'], args['model file'], args)

    # The pseudo array is skipped only when 'discovery', 'sample' and 'daf'
    # are all missing or falsy.
    using_pseudo_array = bool(
        processedData.get('discovery')
        or processedData.get('sample')
        or processedData.get('daf')
    )

    debugPrint(3, "Finished processing input\nprocessedData: ", processedData)

    ### Create a list of Sequence class instances. These will contain the bulk of all sequence-based data
    sequences = create_sequences(processedData)
    names = [seq.name for seq in sequences]

    # Number of discovery-type sequences; needed by the pairwise FST stats.
    n_d = sum(1 for seq in sequences if seq.type == 'discovery')

    debugPrint(1,'name\ttotal\tpanel\tgenotyped')
    for seq in sequences:
        debugPrint(1,'{}\t{}\t{}\t{}'.format(seq.name, seq.tot, seq.panel, seq.genotyped))

    debugPrint(1, 'total samples: {}'.format(sum([seq.genotyped for seq in sequences if seq.type=='discovery'] + [seq.tot for seq in sequences if seq.type=='sample'])))

    ### Define simulation size
    length = processedData['length']
    debugPrint(1, 'Perform simulation and get sequences')
    pedmap = args['pedmap']
    germline = args['germline']

    ##########################################################################
    ################## Perform simulation and get sequences ##################
    ##########################################################################

    ### Flag to check if the simulation works
    # NOTE(review): with 'macs_file' the input is a fixed file, so a retry
    # presumably reproduces the same result -- confirm this cannot loop forever.
    SNPs_exceed_available_sites = True
    while SNPs_exceed_available_sites:

        macs_args = processedData['macs_args']

        if sim_option == 'macs':
            ### Run macs and make bitarray
            profile(prof_option, path, job, "start_run_macs")
            [sequences,position] = run_macs(macs_args, sequences)
            profile(prof_option, path, job, "end_run_macs")
            # NOTE(review): '/' is true division on Python 3, so nbss may be
            # a float there -- confirm downstream helpers accept that.
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions(position, nbss, length)

        elif sim_option == 'macs_file':
            ### Using a static sim output rather than generating from seed
            seq_alleles = AllelesMacsFile('tests/test_data/sites1000000.txt')
            set_seq_bits(sequences, seq_alleles)
            nbss = len(sequences[0].bits) / (sequences[0].tot)

            if using_pseudo_array:
                ## get position of the simulated sites and scale it to the "real" position in the SNP chip
                sim_positions = get_sim_positions_old(seq_alleles, nbss, length)

        else:
            # Without this branch an unknown option only surfaced later as a
            # NameError on ``nbss``.
            raise ValueError(
                "Unknown sim option {!r}; expected 'macs' or 'macs_file'".format(sim_option)
            )

        profile(prof_option, path, job, "start_set_discovery_bits")
        set_discovery_bits(sequences)
        profile(prof_option, path, job, "end_set_discovery_bits")

        debugPrint(1, 'Number of sites in simulation: {}'.format(nbss))

        assert nbss > 10, "Number of sites is less than 10: {}".format(nbss)

        ##########################################################################
        ### Create pseudo array according to ascertainment scheme and template ###
        ##########################################################################

        if using_pseudo_array:
            SNPs = get_SNP_sites(args['SNP file'])
            debugPrint(1, 'Number of SNPs in Array: {}'.format(len(SNPs)))

            profile(prof_option, path, job, "start_set_panel_bits")
            asc_panel_bits = set_panel_bits(nbss, sequences)

            profile(prof_option, path, job, "end_set_panel_bits")
            debugPrint(1,'Number of chromosomes in asc_panel: {}'.format(asc_panel_bits.length()/nbss))

            ### Get pseudo array sites
            debugPrint(2,'Making pseudo array')
            profile(prof_option, path, job, "start_pseudo_array_bits")

            [pos_asc, nbss_asc, avail_site_indices, avail_sites] = pseudo_array_bits(asc_panel_bits, processedData['daf'], sim_positions, SNPs)
            profile(prof_option, path, job, "end_pseudo_array_bits")
            nb_avail_sites = len(avail_sites)
            # Retry while the template has at least as many SNPs as there are
            # available sites.
            SNPs_exceed_available_sites = ( len(SNPs) >= nb_avail_sites )
        else:
            SNPs_exceed_available_sites = False

    if using_pseudo_array:
        profile(prof_option, path, job, "start_set_asc_bits")
        set_asc_bits(sequences, nbss_asc, pos_asc, avail_site_indices)
        profile(prof_option, path, job, "end_set_asc_bits")

    debugPrint(1, 'Calculating summary statistics')
    ##########################################################################
    ###################### Calculate summary statistics ######################
    ##########################################################################
    # ``res`` and ``head`` are passed to every stat_tools.store_* call below
    # and finally to write_sim_results_file.
    res, head = [], []

    ### Calculate summary stats from genomes
    if nbss > 0:   # Simulations must contain at least one segregating site
        profile(prof_option, path, job, "start_store_segregating_site_stats")
        stat_tools.store_segregating_site_stats(sequences, res, head)
        profile(prof_option, path, job, "end_store_segregating_site_stats")
        profile(prof_option, path, job, "start_store_pairwise_FSTs")
        stat_tools.store_pairwise_FSTs(sequences, n_d, res, head)
        profile(prof_option, path, job, "end_store_pairwise_FSTs")

    ### Calculate summary stats from the ascertained SNPs
    if using_pseudo_array:
        if nbss_asc > 0:
            profile(prof_option, path, job, "start_store_array_segregating_site_stats")
            stat_tools.store_array_segregating_site_stats(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_segregating_site_stats")
            profile(prof_option, path, job, "start_store_array_FSTs")
            stat_tools.store_array_FSTs(sequences, res, head)
            profile(prof_option, path, job, "end_store_array_FSTs")

        debugPrint(2,'Making ped and map files')
        ped_file_name = '{0}/macs_asc_{1}_chr{2}.ped'.format(sim_data_dir, job, str(chr_number))
        map_file_name = '{0}/macs_asc_{1}_chr{2}.map'.format(sim_data_dir, job, str(chr_number))
        out_file_name = '{0}/macs_asc_{1}_chr{2}'.format(germline_out_dir, job, str(chr_number))

        # A .match file already on disk triggers removal of the ped/map files.
        # Each file is removed only if it is present, so a missing file no
        # longer aborts the job.
        if os.path.isfile(out_file_name + '.match'):  # Maybe remove if statement
            for stale_file in (ped_file_name, map_file_name):
                if os.path.isfile(stale_file):
                    os.remove(stale_file)

        # Already inside ``if using_pseudo_array``, so only the two flags
        # decide whether ped/map files are written.
        if pedmap or germline:
            profile(prof_option, path, job, "start_make_ped_file")
            make_ped_file(ped_file_name, sequences)
            profile(prof_option, path, job, "end_make_ped_file")
            profile(prof_option, path, job, "start_make_map_file")
            make_map_file(map_file_name, pos_asc, chr_number, avail_sites)
            profile(prof_option, path, job, "end_make_map_file")

        ### Use Germline to find IBD on pseudo array ped and map files
        do_i_run_germline = int(args['germline'])

        debugPrint(1,'run germline? {}'.format("True" if do_i_run_germline else "False"))

        # Germline runs only when the flag is exactly 1 (same as the former
        # ``== True`` comparison on an int).
        if do_i_run_germline == 1:
            # TODO: Germline seems to be outputting in the wrong unit; the
            # original author considered min_m = 3000000 (intended as 3Mb)
            # but the call uses min_m=300.  Revisit once units are confirmed.
            profile(prof_option, path, job, "start_run_germline")
            run_germline(ped_file_name, map_file_name, out_file_name, min_m=300)
            # The end marker must follow the call so the profile spans the run.
            profile(prof_option, path, job, "end_run_germline")

        ### Get IBD stats from Germline output
        if os.path.isfile(out_file_name + '.match'):
            print('Reading Germline IBD output')
            profile(prof_option, path, job, "start_process_germline_file")
            [IBD_pairs, IBD_dict] = process_germline_file(out_file_name, names)
            profile(prof_option, path, job, "end_process_germline_file")

            print('Calculating summary stats')
            stats = OrderedDict([('num', len), ('mean', np.mean), ('med', np.median), ('var', np.var)])
            profile(prof_option, path, job, "start_store_IBD_stats")
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head)
            stat_tools.store_IBD_stats(stats, IBD_pairs, IBD_dict, res, head, min_val=30)
            profile(prof_option, path, job, "end_store_IBD_stats")

        debugPrint(1,'finished calculating ss')

    write_sim_results_file(sim_results_dir, job, processedData['param_dict'], res, head)

    print('')
    print('#########################')
    print('### PROGRAM COMPLETED ###')
    print('#########################')
    print('')

    profile(prof_option, path, job, "COMPLETE")
Пример #4
0
 def test_alleles(self):
     """Check the first element returned by ``make_lists()``.

     Loads ``tests/test_data/sites.txt`` through ``AllelesMacsFile`` and
     compares element 0 of ``make_lists()`` with 38 rows of 20
     single-character strings.  Returns the list that was checked.
     """
     macs_file = AllelesMacsFile('tests/test_data/sites.txt')
     alleles = macs_file.make_lists()[0]
     # Lift unittest's diff-length cap so a mismatch is reported in full.
     self.maxDiff = None
     # One string per expected row; list(row) expands it into the
     # ['0', '1', ...] form that is compared against.
     expected_rows = (
         '00000000000000100000',
         '00100100001000000000',
         '00000000000000000011',
         '01000000000100001000',
         '00000100000000000000',
         '00000000100010000000',
         '00000000000000000100',
         '00000000000100000000',
         '11000011110111101111',
         '00111100001000010011',
         '11000011110111101100',
         '00111100001000010011',
         '00000010010000000000',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '11000011110111101111',
         '11000011110111101111',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '00111100001000010000',
         '11000011110111101111',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '00000000100000000000',
         '00111100001000010000',
         '11000011110111101111',
         '01000011010100101000',
         '00000001000000000000',
         '00111100001000010000',
         '11000011110111101111',
         '00000000000001000000',
         '11000011110111101111',
         '01000000000000001000',
         '00000000000010000000',
         '11000011010111101100',
     )
     expected_alleles = [list(row) for row in expected_rows]
     self.assertListEqual(alleles, expected_alleles)
     return alleles
Пример #5
0
 def test_make_lists(self):
     """Check the complete return value of ``make_lists()``.

     Loads ``tests/test_data/sites.txt`` through ``AllelesMacsFile`` and
     expects a two-element list: 38 rows of 20 single-character strings,
     followed by 38 space-padded decimal strings.
     """
     alleles_macs_file = AllelesMacsFile('tests/test_data/sites.txt')
     allele_lists = alleles_macs_file.make_lists()
     # Lift unittest's diff-length cap so a mismatch is reported in full.
     self.maxDiff = None
     # One string per expected row; list(row) expands it into the
     # ['0', '1', ...] form that is compared against.
     expected_rows = (
         '00000000000000100000',
         '00100100001000000000',
         '00000000000000000011',
         '01000000000100001000',
         '00000100000000000000',
         '00000000100010000000',
         '00000000000000000100',
         '00000000000100000000',
         '11000011110111101111',
         '00111100001000010011',
         '11000011110111101100',
         '00111100001000010011',
         '00000010010000000000',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '11000011110111101111',
         '11000011110111101111',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '00111100001000010000',
         '11000011110111101111',
         '00111100001000010000',
         '00111100001000010000',
         '11000011110111101111',
         '00000000100000000000',
         '00111100001000010000',
         '11000011110111101111',
         '01000011010100101000',
         '00000001000000000000',
         '00111100001000010000',
         '11000011110111101111',
         '00000000000001000000',
         '11000011110111101111',
         '01000000000000001000',
         '00000000000010000000',
         '11000011010111101100',
     )
     # Presumably the site-position column of the MaCS output, kept as the
     # raw 15-character padded strings -- TODO confirm.  The sixth entry was
     # previously split across two source lines inside its quotes, which
     # made the method a syntax error.
     expected_padded_values = [
         '   0.0705276367', '   0.0773083074', '    0.141827334',
         '    0.144695839', '    0.148809003', '    0.169434171',
         '    0.181985945', '    0.193180539', '     0.21844322',
         '    0.244852453', '    0.256794131', '    0.298170765',
         '    0.344656974', '    0.372631514', '    0.468767313',
         '    0.491370823', '    0.497166281', '    0.542687155',
         '    0.567144847', '    0.567368208', '    0.573822118',
         '    0.575989391', '    0.581127352', '    0.602843268',
         '    0.633125154', '    0.645520699', '    0.660214656',
         '    0.683393244', '    0.708882773', '    0.746408379',
         '    0.803594527', '     0.80459505', '    0.811848332',
         '    0.862597683', '    0.874138479', '    0.933816125',
         '    0.968021389', '    0.982665522',
     ]
     check = [[list(row) for row in expected_rows], expected_padded_values]
     self.assertEqual(allele_lists, check)