def indel_anavar(arg_tuple):
    ins_sfs, ins_m, del_sfs, del_m, bootstrap, n, region, c, out_stem = arg_tuple

    anavar_path = '/shared/evolgen1/shared_data/program_files/iceberg/'

    anavar_cmd = '{path}anavar1.2 {ctl} {rslts} {log}'

    results = []

    # rep 0 uses the observed SFS, reps 1..bootstrap use bootstrap resamples
    for i in range(bootstrap + 1):

        # sort sfs
        if i == 0:
            sfs_i = ins_sfs
            sfs_d = del_sfs
        else:
            sfs_i = resample_replace(ins_sfs)
            sfs_d = resample_replace(del_sfs)

        # convert to correct format for anavar
        sfs_i = sfs2counts(sfs_i, n)
        sfs_d = sfs2counts(sfs_d, n)

        # sort file names
        ctl_name = out_stem + '.rep{}.control.txt'.format(i)
        result_name = out_stem + '.rep{}.results.txt'.format(i)
        log_name = out_stem + '.rep{}.log.txt'.format(i)

        # construct control file
        sfs_m = {'INS': (sfs_i, ins_m), 'DEL': (sfs_d, del_m)}
        ctl = an.Indel1ControlFile()
        ctl.set_data(sfs_m, n, dfe='discrete', c=c)
        ctl_contents = ctl.construct()
        with open(ctl_name, 'w') as control:
            control.write(ctl_contents)

        # call anavar
        rep_cmd = anavar_cmd.format(path=anavar_path,
                                    ctl=ctl_name,
                                    rslts=result_name,
                                    log=log_name)
        subprocess.call(rep_cmd, shell=True)

        # process results
        with open(result_name) as rep_results:
            results_data = an.ResultsFile(rep_results)
            header = list(results_data.header()) + ['rep', 'region']
            ml_est = results_data.ml_estimate(as_string=True) + '\t{}\t{}'.format(i, region)
            if i == 0:
                results.append('\t'.join(header))
            results.append(ml_est)

    return results
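# resample_replace and sfs2counts are called above but not defined in this
# example. A minimal sketch of what they might look like, assuming the SFS is
# held as a list of derived allele frequencies and anavar expects counts per
# frequency class 1..n-1 -- these bodies are assumptions, not the original
# implementations.
import random
from collections import Counter


def resample_replace(sfs):
    # bootstrap resample: draw len(sfs) sites from the SFS with replacement
    return [random.choice(sfs) for _ in range(len(sfs))]


def sfs2counts(freq_list, n):
    # bin derived allele frequencies into counts for classes 1..n-1,
    # where n is the number of sampled chromosomes
    binned = Counter(int(round(f * n)) for f in freq_list)
    return [binned[i] for i in range(1, n)]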
Example 2
def main():
    res = sys.argv[1]

    results = an.ResultsFile(open(res))

    dn_truth = (0.000121257528807, 0.000200465669246)  # (ins, del)
    ds_truth = (0.00183918200407, 0.00383421025726)  # (ins, del)

    print('percent_bias_adjust',
          'alpha',
          'var_type',
          'rDI_cds',
          'rDI_nc',
          sep='\t')

    # estimate alpha assuming different % errors in divergence polarisation (0-25%, in both directions)
    for i in range(0, 26):

        if i == 0:
            dn_adjuster = 0
            ds_adjuster = 0

        else:
            percent_error = i / float(100)

            dn_adjuster = sum(dn_truth) * percent_error
            ds_adjuster = sum(ds_truth) * percent_error

        for direction in [-1, 1]:

            dn_adj = adjust_div(dn_truth, dn_adjuster, direction)
            ds_adj = adjust_div(ds_truth, ds_adjuster, direction)

            ins_alpha = results.get_alpha(dn=dn_adj[0],
                                          ds=ds_adj[0],
                                          var_type='ins')
            del_alpha = results.get_alpha(dn=dn_adj[1],
                                          ds=ds_adj[1],
                                          var_type='del')

            print(i * direction,
                  ins_alpha,
                  'ins',
                  dn_adj[1] / dn_adj[0],
                  ds_adj[1] / ds_adj[0],
                  sep='\t')
            print(i * direction,
                  del_alpha,
                  'del',
                  dn_adj[1] / dn_adj[0],
                  ds_adj[1] / ds_adj[0],
                  sep='\t')
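# adjust_div is not shown in this example. One plausible sketch, assuming a
# polarisation error of the given size shifts divergence between the
# insertion and deletion classes, with direction (+1/-1) setting which way --
# this is an assumption, not the original helper.
def adjust_div(div, adjuster, direction):
    ins_div, del_div = div
    return ins_div + direction * adjuster, del_div - direction * adjuster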
Example 3
def main():

    header = [
        'run', 'imp', 'exit_code', 'neu_theta_1', 'neu_gamma_1', 'neu_e_1',
        'sel_theta', 'sel_shape', 'sel_scale', 'sel_e', 'lnL'
    ]

    additional = ['converged', 'bounds', 'rep', 'region']

    head_out = header + additional

    print(*head_out, sep=',')

    res_list = []

    for res in sys.stdin:

        res = res.rstrip()

        res_stem = res.split('/')[-1]
        bs = int(res_stem.split('.bsrep')[-1].split('.')[0])
        region = res_stem.split('_')[1]

        results = an.ResultsFile(open(res))
        mle = results.ml_estimate()
        converged = results.converged()
        bounds = results.bounds_hit(gamma_r=(-500, 100),
                                    theta_r=(1e-14, 0.1),
                                    r_r=(0.01, 100),
                                    scale_r=(0.1, 5000.0))

        current_res = [mle[x] for x in header] + [converged, ';'.join(bounds), bs, region]

        res_list.append([bs, current_res])

    for processed_res in sorted(res_list, key=lambda x: x[0]):

        print(*processed_res[1], sep=',')
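# A worked example of the filename parsing above, using a hypothetical
# results path (the real naming scheme may differ):
#
#   res_stem = 'anavar_intergenic_run.bsrep2.results.txt'
#   res_stem.split('.bsrep')[-1].split('.')[0]  ->  '2'           (bootstrap rep)
#   res_stem.split('_')[1]                      ->  'intergenic'  (region)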
Example 4
def sel_v_neu_anavar(mode, vcf, call, sel_region, constraint, n, c, dfe, alg,
                     nnoimp, maximp, out_stem, search, degree, spread, evolgen,
                     start_index, given, ar_ref):
    """
    submits anavar jobs to the cluster after writing the required files etc.
    :param mode: str
    :param vcf: str
    :param call: dict
    :param sel_region: str
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int
    :param evolgen: bool
    :param start_index: int
    :param given: bool
    :param ar_ref: bool
    :return: None
    """

    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'

    anavar_cmd = '{path}anavar1.4 {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'
    merge_out = out_stem + '.merged.results.txt'

    # catch given on first run
    init = ()
    if given:
        if not os.path.isfile(merge_out):
            sys.exit(
                'given is True but no previous runs have completed to take the best result from'
            )
        else:
            # get best result from merged out
            best_res = an.ResultsFile(
                open(merge_out)).ml_estimate(as_string=True)
            # keep only the parameter columns: drop the leading run, imp and
            # exit_code fields and the trailing lnL
            init = tuple(best_res.split()[3:-1])

    # region combinations
    region_combs = {
        'CDS': ['CDS_frameshift', 'CDS_non_frameshift'],
        'intron': ['intron'],
        'intergenic': ['intergenic'],
        'noncoding': ['intergenic', 'intron']
    }

    # make control file
    if mode == 'snp':
        sfs_data = prepare_snp_sfs(vcf,
                                   call,
                                   n,
                                   sel_sfs_regions=region_combs[sel_region],
                                   call_sel_reg=sel_region)
        ctl = an.SNPNeuSelControlFile()

    else:
        sfs_data = prepare_indel_sfs(vcf,
                                     call,
                                     n,
                                     sel_sfs_regions=region_combs[sel_region],
                                     call_sel_reg=sel_region,
                                     ar_ref=ar_ref)
        ctl = an.IndelNeuSelControlFile()

    ctl.set_alg_opts(search=search,
                     alg=alg,
                     key=3,
                     epsabs=1e-20,
                     epsrel=1e-9,
                     rftol=1e-9,
                     maxtime=3600,
                     optional=True,
                     maximp=maximp,
                     nnoimp=nnoimp,
                     init=init)

    ctl.set_data(sfs_data,
                 n,
                 dfe=dfe,
                 c=c,
                 gamma_r=(-5e4, 1e5),
                 theta_r=(1e-14, 0.1),
                 r_r=(0.01, 100),
                 scale_r=(0.1, 5000.0))
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    hjids = []
    with open(res_file_list, 'a') as res_list:

        # split into requested jobs
        for i in range(start_index, start_index + spread):

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar
            rep_cmd = anavar_cmd.format(path=anavar_path,
                                        ctl=ctl_name,
                                        rslts=result_name,
                                        log=log_name,
                                        seed=i)

            q_sub([rep_cmd],
                  out=split_stem,
                  jid=split_stem.split('/')[-1] + '.sh',
                  t=48,
                  evolgen=evolgen)
            hjids.append(split_stem.split('/')[-1] + '.sh')

    # hold job to merge outputs
    gather = 'cat {} | ~/parus_indel/anavar_analyses/gather_searches.py {}'.format(
        res_file_list, merge_out)
    q_sub([gather], out=out_stem + '.merge', hold=hjids, evolgen=evolgen)
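# For illustration, with a hypothetical out_stem of 'results/gt_cds' and
# start_index=1, the first submitted command expands to:
#
#   /shared/evolgen1/shared_data/program_files/sharc/anavar1.4 \
#       results/gt_cds.control.txt results/gt_cds.split1.results.txt \
#       results/gt_cds.split1.log.txt 1
#
# The gather job is submitted with a hold on all of the split jobs and reads
# the result paths from results/gt_cds.allres.list.txt once they have finished.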
Example 5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-file_pattern',
        help='takes a column name and a regular expression, comma separated, used '
             r'to extract a custom ID from the file name, e.g. degree,_degree(\d+)\.')
    parser.add_argument('-neu_ref',
                        help='sets neutral reference type to determine ds',
                        default='ar',
                        choices=['ar', 'nc'])
    args = parser.parse_args()
    counter = 0

    if args.file_pattern is not None:
        spec_col = args.file_pattern.split(',')[0]
        spec_pattern = args.file_pattern.split(',')[1]
    else:
        spec_col, spec_pattern = None, None

    all_res = []
    for res in sys.stdin:

        # sort custom col contents
        if spec_col is not None:
            spec_val = re.search(spec_pattern, res).group(1)
        else:
            spec_val = None

        res = res.rstrip()

        if 'equal_t' in res:
            constraint = '_equal_t'
        else:
            constraint = ''

        results = an.ResultsFile(open(res))
        mle = results.ml_estimate()
        n_class = results.num_class()
        variant = results.data_type()
        dfe = results.dfe()
        free_params = len(results.free_parameters())

        if constraint == '_equal_t':

            if args.neu_ref == 'ar':
                i_ds = 0.00183918200407
                d_ds = 0.00383421025726
            else:
                i_ds = 0.00171438749928
                d_ds = 0.00308343831154

            ins_alpha = results.get_alpha(dn=0.000121257528807,
                                          ds=i_ds,
                                          var_type='ins')
            del_alpha = results.get_alpha(dn=0.000200465669246,
                                          ds=d_ds,
                                          var_type='del')
        else:
            ins_alpha = 'NA'
            del_alpha = 'NA'

        mod_name = '{}_{}_{}class{}'.format(variant, dfe, n_class, constraint)

        reformed = reformat_mle(mle,
                                n_class,
                                variant,
                                results.converged(),
                                results.bounds_hit(gamma_r=(-5e4, 1e5),
                                                   theta_r=(1e-12, 0.1),
                                                   r_r=(0.01, 100),
                                                   scale_r=(0.1, 5000.0)),
                                mod_name,
                                free_params,
                                dfe,
                                results.num_runs(),
                                alpha_ins=ins_alpha,
                                alpha_del=del_alpha,
                                custom_col=spec_col,
                                custom_val=spec_val)

        if counter == 0:
            all_res.append(reformed[0])
            counter += 1

        for x in reformed[1]:
            all_res.append(x)

    # calc AIC
    aics = sorted([z[-1] for z in all_res[1:]])
    best_aic = aics[0]

    print(*all_res[0] + ['delta_AIC'], sep=',')

    out_data = []
    for line in all_res[1:]:
        delta = delta_aic(line[-1], best_aic)
        out_data.append((delta, line + [delta]))

    for line in sorted(out_data, reverse=True):
        print(*line[1], sep=',')
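# delta_aic is used above but not defined in this example. A minimal sketch,
# assuming the last element of each result row is the model's AIC and the
# delta is simply its difference from the best (lowest) AIC:
def delta_aic(aic, best_aic):
    return aic - best_aic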
Example 6
def sel_v_neu_anavar(sfs_dat, constraint, n, c, dfe, alg, nnoimp, maximp,
                     out_stem, search, degree, spread, start_index, given):

    """
    submits anavar jobs to the cluster after writing the required files etc.
    :param sfs_dat: dict
    :param constraint: str
    :param n: int
    :param c: int
    :param dfe: str
    :param alg: str
    :param nnoimp: int
    :param maximp: int
    :param out_stem: str
    :param search: int
    :param degree: int
    :param spread: int
    :param start_index: int
    :param given: bool
    :return: None
    """

    anavar_path = ''

    anavar_cmd = '{path}anavar {ctl} {rslts} {log} {seed}'

    # sort file names
    ctl_name = out_stem + '.control.txt'
    merge_out = out_stem + '.merged.results.txt'

    # catch given on first run
    init = ()
    if given:
        if not os.path.isfile(merge_out):
            sys.exit('given is True but no previous runs have completed to take the best result from')
        else:
            # get best result from merged out
            best_res = an.ResultsFile(open(merge_out)).ml_estimate(as_string=True)
            init = tuple(best_res.split()[3:-1])

    # make control file
    ctl = an.SNPNeuSelControlFile()

    ctl.set_alg_opts(search=search, alg=alg, key=3,
                     epsabs=1e-20, epsrel=1e-9, rftol=1e-9,
                     maxtime=3600, optional=True,
                     maximp=maximp, nnoimp=nnoimp, init=init)

    ctl.set_data(sfs_dat, n, dfe=dfe, c=c, gamma_r=(-500, 100), theta_r=(1e-14, 0.1), r_r=(0.01, 100),
                 scale_r=(0.1, 5000.0), snp_fold=False)
    if degree != 50:
        ctl.set_dfe_optional_opts(degree=degree, optional=True)
    ctl.set_constraint(constraint)
    ctl_contents = ctl.construct()
    with open(ctl_name, 'w') as control:
        control.write(ctl_contents)

    res_file_list = out_stem + '.allres.list.txt'
    with open(res_file_list, 'a') as res_list:

        # split into requested jobs
        for i in range(start_index, start_index+spread):

            split_stem = '{}.split{}'.format(out_stem, i)

            result_name = split_stem + '.results.txt'
            log_name = split_stem + '.log.txt'

            print(result_name, file=res_list)

            # call anavar
            rep_cmd = anavar_cmd.format(path=anavar_path, ctl=ctl_name, rslts=result_name, log=log_name, seed=i)

            q_sub([rep_cmd], out=split_stem, jid=split_stem.split('/')[-1] + '.sh', t=48, scheduler='SLURM')
Example 7
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-file_pattern',
        help='takes a column name and a regular expression, comma separated, used '
             r'to extract a custom ID from the file name, e.g. degree,_degree(\d+)\.')
    args = parser.parse_args()
    counter = 0

    if args.file_pattern is not None:
        spec_col = args.file_pattern.split(',')[0]
        spec_pattern = args.file_pattern.split(',')[1]
    else:
        spec_col, spec_pattern = None, None

    all_res = []
    for res in sys.stdin:

        # sort custom col contents
        if spec_col is not None:
            spec_val = re.search(spec_pattern, res).group(1)
        else:
            spec_val = None

        res = res.rstrip()

        if 'equal_t' in res:
            constraint = '_equal_t'
        else:
            constraint = ''

        results = an.ResultsFile(open(res))
        mle = results.ml_estimate()
        n_class = results.num_class()
        variant = results.data_type()
        dfe = results.dfe()
        free_params = len(results.free_parameters())

        mod_name = '{}_{}_{}class{}'.format(variant, dfe, n_class, constraint)

        reformed = reformat_mle(mle,
                                n_class,
                                variant,
                                results.converged(),
                                results.bounds_hit(gamma_r=(-5e4, 1e3),
                                                   theta_r=(1e-10, 0.1),
                                                   r_r=(0.01, 100),
                                                   scale_r=(0.1, 5000.0)),
                                mod_name,
                                free_params,
                                dfe,
                                results.num_runs(),
                                custom_col=spec_col,
                                custom_val=spec_val)

        if counter == 0:
            all_res.append(reformed[0])
            counter += 1

        for x in reformed[1]:
            all_res.append(x)

    # calc AIC
    aics = sorted([z[-1] for z in all_res[1:]])
    best_aic = aics[0]

    print(*all_res[0] + ['delta_AIC'], sep=',')

    out_data = []
    for line in all_res[1:]:
        delta = delta_aic(line[-1], best_aic)
        out_data.append((delta, line + [delta]))

    for line in sorted(out_data, reverse=True):
        print(*line[1], sep=',')