示例#1
0
def cost_function(params, *args):
    """
    Evaluate the objective that is minimised during pfactor fitting.

    The score is the value returned by ``calculate_rms`` for the predicted
    versus experimental data, plus the penalty returned by
    ``harmonic_score``.

    :param params: sequence of estimated pfactors (converted to a numpy
        array before prediction)
    :param args: exactly six positional values, in this order: dexp,
        time points, assignments, harmonic term, kint, weights
        (weights may be None)
    :return: cost score (float)
    """
    dexp, time_points, assignments, harmonic_term, kint, weights = args

    # The prediction does not depend on the weights, so compute it once.
    dpred = calculate_dpred(np.array(params), time_points, kint, assignments)

    # Forward weights only when supplied, so that calculate_rms falls back
    # to its own default otherwise.
    rms_args = [dpred, dexp, len(assignments)]
    if weights is not None:
        rms_args.append(weights)
    rms = calculate_rms(*rms_args)

    penalty = harmonic_score(params, harmonic_term)
    return float(rms + penalty)
示例#2
0
def predict_dexp(pfact, time_points, assignments, kint=None):
    """
    Calculates predicted dexp from pfactors, time points, assignments and
    kint values by delegating to ``calculate_dpred``.

    :param pfact: array of pfactors.
    :param time_points: array of time points.
    :param assignments: array of assignment arrays.
    :param kint: array of kint values. When given, it is forwarded to
        ``calculate_dpred`` as its third argument. When omitted (None), the
        historical three-argument call is made unchanged.
    :return: whatever ``calculate_dpred`` returns (documented originally as
        a numpy array of dexp values).
    """
    if kint is None:
        # Legacy call shape, preserved so existing three-argument callers
        # behave exactly as before.
        # NOTE(review): presumably calculate_dpred requires kint; if so this
        # path raises TypeError and callers should pass kint -- confirm.
        return calculate_dpred(pfact, time_points, assignments)
    return calculate_dpred(pfact, time_points, kint, assignments)
示例#3
0
def loo_crossval(dexp, time_points, ass, lam, pH, temp, seq, res1, resn):
    """
    Perform leave-one-out cross-validation at a fixed value of lambda.

    For every index k of ``time_points`` the data are split with
    ``L1OUT_dataset`` into a training set and a test set, ``run`` is called
    on the training set with harmonic term ``lam`` (no initial pfactor file,
    no random search, no weights, tolerance 1e-10), and the fitted pfactors
    are read back from ``CVout.rm<k>.pfact``. The training error is taken
    from the ``.diff`` file written by ``run``; the test error is the mean
    squared deviation between predicted and experimental test values, summed
    over peptides. The temporary ``CVout.rm<k>.*`` files are deleted after
    each iteration.

    :param dexp: experimental values, split by ``L1OUT_dataset``.
    :param time_points: experiment time points; one fold per entry.
    :param ass: assignments, forwarded to ``run`` and ``calculate_dpred``.
    :param lam: harmonic (penalisation) term used for every fit.
    :param pH: pH forwarded to ``run`` and used to compute kint.
    :param temp: temperature forwarded to ``run`` and used to compute kint.
    :param seq: sequence forwarded to ``run`` and used to compute kint.
    :param res1: first residue index.
    :param resn: last residue index.
    :return: tuple ``(cv_train, cv_test)``, each averaged over the number
        of time points.
    """
    # kint was previously referenced without being defined in this function.
    # It is derived here from the same inputs that run() uses for the fit,
    # and is identical for every fold, so it is computed once up front.
    kint, _prolines = calculate_kint_for_sequence(res1, resn, seq, temp, pH)

    n_folds = len(time_points)
    cv_train = 0
    cv_test = 0
    for k in range(n_folds):

        out_file = "CVout.rm%s" % str(k)

        dexp_train, times_train, dexp_test, times_test = L1OUT_dataset(
            dexp, time_points, k)

        run(base_dir=os.getcwd(),
            dexp=dexp_train,
            assignments=ass,
            pfact=None,
            random_steps=None,
            time_points=times_train,
            harmonic_term=lam,
            output_file=out_file,
            tolerance=1e-10,
            weights=None,
            pH=pH,
            temperature=temp,
            seq=seq,
            res1=res1,
            resn=resn)

        pfact = read_pfact(out_file + '.pfact')
        dpred_test = calculate_dpred(pfact, times_test, kint, ass)
        # Mean squared error per peptide; 1.0 keeps this a true division
        # regardless of interpreter version.
        cost_test = [
            1.0 / len(pred) * np.sum((pred - exp)**2)
            for pred, exp in zip(dpred_test, dexp_test)
        ]

        # Column 1 of the .diff file, summed over peptides (rows).
        cv_train += sum(np.loadtxt(out_file + '.diff'))[1]
        cv_test += sum(cost_test)  # sum over peptides

        # Remove the per-fold output files written by run().
        for suffix in (".Dpred", ".diff", ".pfact"):
            os.remove(out_file + suffix)

    return cv_train / n_folds, cv_test / n_folds
示例#4
0
def run(base_dir, dexp, assignments, pfact, random_steps, time_points,
        harmonic_term, output_file, tolerance, weights, pH, temperature, seq,
        res1, resn):
    """
    Fit pfactors to experimental data, write the results and print the
    final cost with and without the harmonic term.

    The initial guess is, in order of precedence: the contents of the
    ``pfact`` file; the best-scoring result of ``do_random_search`` when
    ``random_steps`` is set; otherwise a default array of 1 / -1 values
    built from the prolines and the residues covered by the assignments.

    :param base_dir: not used by this function; kept for interface
        compatibility with existing callers.
    :param dexp: experimental dexp values, forwarded to the fit, the diff
        writer and the cost function.
    :param assignments: iterable of assignment rows; elements 1 and 2 of
        each row are used as the first and last residue of the peptide.
    :param pfact: file containing pfactor values to start from, or a falsy
        value to build the initial guess here.
    :param random_steps: number of steps for random search (falsy to skip).
    :param time_points: a list of experiment time points.
    :param harmonic_term: term to be used for harmonic cost scoring.
    :param output_file: stub for all output files.
    :param tolerance: tolerance value for minimisation convergence.
    :param weights: weights forwarded to the fit and cost function
        (may be None).
    :param pH: pH used to compute kint.
    :param temperature: temperature used to compute kint.
    :param seq: sequence used to compute kint.
    :param res1: first residue index used to compute kint.
    :param resn: last residue index used to compute kint.
    :return: None
    """
    # Residues that receive a fitted pfactor: every residue of each peptide
    # except its first one, plus that first residue when it lies below
    # everything collected so far.
    # NOTE(review): min() raises ValueError if the first assignment has
    # ass[2] <= ass[1] (the set is still empty) -- confirm inputs exclude it.
    pfactor_filter = set()
    for ass in assignments:
        pfactor_filter.update(range(int(ass[1] + 1), int(ass[2]) + 1))
        if ass[1] < min(pfactor_filter):
            pfactor_filter.add(ass[1])

    kint, prolines = calculate_kint_for_sequence(res1, resn, seq, temperature,
                                                 pH)

    if pfact:
        init_array = read_pfact(pfact)
    elif random_steps:
        rand_output = do_random_search(kint,
                                       random_steps,
                                       pfactor_filter,
                                       dexp,
                                       time_points,
                                       assignments,
                                       harmonic_term,
                                       prolines,
                                       weights,
                                       seed=None)
        # Keys are scores; start from the lowest-scoring candidate.
        init_array = rand_output[min(rand_output.keys())]
    else:
        init_array = [
            1 if ii not in prolines or ii == 0 or ii + 1 in pfactor_filter
            else -1 for ii in range(max(pfactor_filter))
        ]

    # Non-negative entries are free within (1e-5, 20); entries equal to -1
    # are pinned at -1; any other negative value is pinned at 0.
    bounds = [(0.00001, 20) if x >= 0 else (-1, -1) if x == -1 else (0, 0)
              for x in init_array]

    pfit = fit_pfact(init_array, dexp, time_points, assignments, harmonic_term,
                     kint, bounds, tolerance, weights)

    write_pfact(pfit.x, output_file)

    dpred = calculate_dpred(pfit.x, time_points, kint, assignments)

    write_dpred(output_file, dpred, time_points)
    write_diff(output_file, dpred, dexp)

    score_with_harm = cost_function(pfit.x, dexp, time_points, assignments,
                                    harmonic_term, kint, weights)
    print('Final value of cost function w harm term: {}'.format(
        score_with_harm))
    score_without_harm = cost_function(pfit.x, dexp, time_points, assignments,
                                       0.0, kint, weights)
    print('Final value of cost function w/o harm term: {}'.format(
        score_without_harm))
示例#5
0
    config['pfact'] = read_pfact(opts.pfact)
if opts.times:
    config['times'] = read_time_points(opts.times)
if opts.seq:
    # Parse the sequence file once and derive the last residue index from
    # the stored result instead of reading the file a second time.
    config['sequence'] = read_seq(opts.seq)
    config['res1'] = 1
    config['resn'] = len(config['sequence'])

# Optional arguments
config['output'] = opts.out if opts.out else None

pfact = config['pfact']
assignments = read_assignments(config['assignments'])

# Every residue covered by at least one assignment (elements 1 and 2 of a row
# are used as the inclusive first/last residue).
# NOTE(review): not read again in the visible part of this script -- confirm
# whether later code still needs it.
assignment_set = set()
for ass in assignments:
    assignment_set.update(range(int(ass[1]), int(ass[2]) + 1))

kint, prolines = calculate_kint_for_sequence(config['res1'], config['resn'],
                                             config['sequence'],
                                             config['temperature'],
                                             config['pH'])

# Predict values for the supplied pfactors and write them under the
# configured output stub.
dpred = calculate_dpred(pfact, config['times'], kint, assignments)

write_dpred(config['output'], dpred, config['times'])