Пример #1
0
def select_top_solutions(out_file, n):
    """Rank the ``.diff`` files of a run and collect the best solutions.

    Every file in the current working directory whose name contains both
    ``'.diff'`` and ``out_file`` is read as a whitespace-separated table and
    the average of its second column is taken as the cost of that solution.
    The (file name, cost) pairs are written to ``diff.list`` sorted by
    increasing cost. The values returned by ``read_pfact`` for the best
    ``n`` percent of the solutions are then written to ``all.sp``, one
    solution per line, each value rounded to 5 decimals.

    :param out_file: substring a ``.diff`` file name must contain.
    :param n: percentage of the ranked solutions to keep; the number of
        solutions is truncated to an integer.
    """
    diff_files = []
    diff_values = []
    for name in os.listdir():
        if '.diff' in name and out_file in name:
            # pandas opens and closes the file on its own, so no separate
            # file handle is needed here.
            diff = pd.read_csv(name, header=None, sep=r'\s+')
            diff_files.append(name)
            diff_values.append(np.average(diff[1]))

    # Column 0 holds the file name, column 1 the average cost.
    sorted_files = pd.DataFrame(zip(
        diff_files, diff_values)).sort_values(1).reset_index(drop=True)
    np.savetxt('diff.list', sorted_files.values, fmt='%s %5.10f')
    log.info("Cost function of all solutions stored in file diff.list")

    n_top = int(sorted_files.shape[0] * n / 100)
    top_pfacts = [
        list(read_pfact(sorted_files[0][i].replace('.diff', '.pfact')))
        for i in range(n_top)
    ]

    with open("all.sp", "w+") as f:
        for row in top_pfacts:
            if not row:
                continue
            # Left-aligned, 12-character columns separated by one space.
            f.write(" ".join(
                "{: <12}".format(round(value, 5)) for value in row) + "\n")
    # Report the number of solutions actually written, not the unrounded
    # fraction of the total.
    log.info("Top %s solutions stored in all.sp" % str(n_top))
Пример #2
0
def loo_crossval(dexp, time_points, ass, lam, pH, temp, seq, res1, resn):
    """
    Perform leave-one-out cross-validation at a fixed value of lambda.

    All protection factors are initialized to ln(P)=1 (except prolines,
    for which ln(P)=-1). A minimization is applied using a specific
    penalization term lambda. The cross-validation error is evaluated on
    every train and test dataset generated by leaving out one time point
    at a time.

    For each left-out index ``k`` the fit writes ``CVout.rm<k>.Dpred``,
    ``CVout.rm<k>.diff`` and ``CVout.rm<k>.pfact`` in the current working
    directory; the three files are removed once their values are read.

    :param dexp: experimental values, split by ``L1OUT_dataset``.
    :param time_points: time points; one is left out per iteration.
    :param ass: assignments, forwarded to ``run`` and ``calculate_dpred``.
    :param lam: penalization term, passed to ``run`` as ``harmonic_term``.
    :param pH: forwarded to ``run``.
    :param temp: forwarded to ``run`` as ``temperature``.
    :param seq: forwarded to ``run``.
    :param res1: forwarded to ``run``.
    :param resn: forwarded to ``run``.
    :return: tuple ``(train_error, test_error)``, each summed over peptides
        and averaged over the number of time points.
    """
    cv_train = 0
    cv_test = 0
    for k in range(len(time_points)):

        out_file = "CVout.rm%s" % str(k)

        dexp_train, times_train, dexp_test, times_test = L1OUT_dataset(
            dexp, time_points, k)

        run(base_dir=os.getcwd(),
            dexp=dexp_train,
            assignments=ass,
            pfact=None,
            random_steps=None,
            time_points=times_train,
            harmonic_term=lam,
            output_file=out_file,
            tolerance=1e-10,
            weights=None,
            pH=pH,
            temperature=temp,
            seq=seq,
            res1=res1,
            resn=resn)

        pfact = read_pfact(out_file + '.pfact')
        # NOTE(review): `kint` is not defined inside this function, so it
        # must exist at module level when this is called -- confirm.
        dpred_test = calculate_dpred(pfact, times_test, kint, ass)
        cost_test = [
            1 / len(pred) * np.sum((pred - obs)**2)
            for pred, obs in zip(dpred_test, dexp_test)
        ]

        # ndmin=2 keeps a single-row .diff file two-dimensional, so taking
        # column 1 also works when there is only one peptide.
        train_diff = np.loadtxt(out_file + '.diff', ndmin=2)
        cv_train += train_diff[:, 1].sum()  # summed over peptides
        cv_test += sum(cost_test)  # sum over peptides

        for suffix in (".Dpred", ".diff", ".pfact"):
            os.remove(out_file + suffix)
    return cv_train / len(time_points), cv_test / len(time_points)
Пример #3
0
def get_dcalc(assignments_file, kint_file, pfact_file, time_points_file,
              fragment_number):
    """Evaluate ``1 - exp(-k * 60 * t / p)`` for one fragment at every time.

    The fragment is selected with the 1-based ``fragment_number`` from the
    assignments; entries 1 and 2 of that assignment delimit the slice taken
    from the ``kint`` and ``pfact`` arrays.

    :param assignments_file: input for ``read_assignments``.
    :param kint_file: input for ``read_kint`` (called with ``-1``).
    :param pfact_file: input for ``read_pfact``.
    :param time_points_file: input for ``read_time_points``.
    :param fragment_number: 1-based index of the fragment (converted with
        ``int``).
    :return: array with one row per time point; column 0 holds the time
        point and the remaining columns the per-position values.
    """
    all_assignments = read_assignments(assignments_file)
    rates = read_kint(kint_file, -1)
    protection = read_pfact(pfact_file)
    times = read_time_points(time_points_file)

    fragment = np.array(all_assignments[int(fragment_number) - 1])
    first, last = fragment[1], fragment[2]

    # The slices do not depend on the time point, so take them once.
    k = rates[first:last]
    p = protection[first:last]

    # One extra column in front stores the time point itself.
    result = np.zeros((len(times), last - first + 1))
    for row, t in enumerate(times):
        result[row] = np.insert(1.0 - np.exp(-k * 60 * t / p), 0, t)

    return result
Пример #4
0
def predict_isotopic_envelope(ass_file,
                              seq_file,
                              temperature,
                              pH,
                              lnp_file,
                              times_file,
                              pep,
                              charge_state,
                              exchange,
                              out_file,
                              pi0_file=''):
    """Write the predicted isotopic envelope of one peptide at each time.

    For every time point ``i`` a file ``<out_file>.<i>.isot`` is written. It
    starts with a ``# <sequence slice>`` header followed by one
    tab-separated row per isotopic peak: peak index, mass, intensity as a
    percentage of the total, and intensity as a percentage of the largest
    peak.

    :param ass_file: input for ``read_assignments``.
    :param seq_file: input for ``read_seq``.
    :param temperature: converted with ``float`` for the rate calculation.
    :param pH: converted with ``float`` for the rate calculation.
    :param lnp_file: input for ``read_pfact``.
    :param times_file: input for ``read_time_points``.
    :param pep: 1-based index of the peptide in the assignments.
    :param charge_state: passed as ``z`` to ``fully_protonated_envelope``
        and used as a divisor, so it has to be numeric.
    :param exchange: ``'f'`` (rates from ``calculate_kint_for_sequence``,
        starting envelope computed from the sequence) or ``'b'`` (rates
        from ``calculate_kback_for_sequence``, starting envelope read from
        ``pi0_file``) -- presumably forward and back exchange.
    :param out_file: stub for the output file names.
    :param pi0_file: whitespace-separated table read only when
        ``exchange == 'b'``; its first row is skipped, column 1 holds the
        masses and column 2 the starting intensities.
    :raises ValueError: if ``exchange`` is neither ``'f'`` nor ``'b'``.
    """
    # Fail early with a clear message instead of a later UnboundLocalError.
    if exchange not in ('f', 'b'):
        raise ValueError(
            "exchange must be 'f' or 'b', got %r" % (exchange,))

    seq = read_seq(seq_file)
    times = read_time_points(times_file)

    # Select residues involving the selected peptide
    ass = read_assignments(ass_file)
    start_res = ass[int(pep) - 1][1]
    end_res = ass[int(pep) - 1][2]

    # Upload kint and lnP values
    if exchange == 'f':
        kint, _ = calculate_kint_for_sequence(1, len(seq), seq,
                                              float(temperature), float(pH))
    else:
        kint, _ = calculate_kback_for_sequence(1, len(seq), seq,
                                               float(temperature), float(pH))
    kint = kint[start_res:end_res]

    lnP = read_pfact(lnp_file)[start_res:end_res]

    if exchange == 'f':
        # Calculate fully protonated isotopic envelope
        pi0 = fully_protonated_envelope(seq[start_res:end_res + 1],
                                        z=charge_state)
        mass = list(pi0.keys())
        fr0 = list(pi0.values())
        # NOTE(review): `kint` was already sliced to the peptide above, so
        # this second slice looks unintended -- confirm the padding length.
        while len(mass) <= 2 * len(kint[start_res:end_res + 1]):
            mass.append(
                (mass[-1] + 1.00627 * int(charge_state)) / charge_state)
            fr0.append(0)
            # NOTE(review): looks like leftover debug output.
            print(mass, fr0)
    else:
        pi0 = pd.read_csv(pi0_file,
                          skiprows=1,
                          header=None,
                          sep=r'\s+')
        mass = list(pi0[1])
        u_fr0 = list(pi0[2])
        fr0 = centered_isotopic_envelope(0, kint, lnP, u_fr0)

    # Calculate isotopic envelopes at different times
    for i, time in enumerate(times):
        if exchange == 'f':
            f1 = centered_isotopic_envelope(time, kint, lnP, fr0)
        else:
            f1 = back_centered_isotopic_envelope(time, kint, lnP, fr0)

        # Normalise to percentages; total and maximum are computed once
        # per time point rather than once per peak.
        total = sum(f1)
        f1 = [value / total * 100 for value in f1]
        peak = max(f1)

        rows = [
            '%d\t%5.5f\t%5.2f\t%5.2f' % (j, mass[j], value,
                                         value / peak * 100)
            for j, value in enumerate(f1)
        ]
        with open("%s.%s.isot" % (out_file, str(i)), 'w+') as f:
            f.write('# ' + seq[start_res:end_res] + '\n')
            # Rows are newline-separated with no trailing newline.
            f.write('\n'.join(rows))
Пример #5
0
def run(base_dir, dexp, assignments, pfact, random_steps, time_points,
        harmonic_term, output_file, tolerance, weights, pH, temperature, seq,
        res1, resn):
    """
    Fit protection factors to the experimental data and write the results.

    The starting values come from ``pfact`` when given, otherwise from a
    random search (when ``random_steps`` is set) or from a default guess.
    The fitted values, the predicted values and their difference from
    ``dexp`` are written through ``write_pfact``, ``write_dpred`` and
    ``write_diff`` using ``output_file``; the final cost with and without
    the harmonic term is printed.

    :param base_dir: base directory for all input files (not used inside
        this function).
    :param dexp: experimental values, passed unchanged to the search, fit,
        cost and diff-writing helpers.
    :param assignments: iterable of assignments; entries 1 and 2 of each
        one are used as the first and last index it covers.
    :param pfact: file containing pfactor values (read with
        ``read_pfact``), or a falsy value to build the starting values.
    :param random_steps: number of steps for random search; a falsy value
        selects the default starting values instead.
    :param time_points: a list of experiment time points.
    :param harmonic_term: term to be used for harmonic cost scoring.
    :param output_file: stub for all output files.
    :param tolerance: tolerance value for minimisation convergence.
    :param weights: passed to the random search, the fit and the cost
        function.
    :param pH: passed to ``calculate_kint_for_sequence``.
    :param temperature: passed to ``calculate_kint_for_sequence``.
    :param seq: passed to ``calculate_kint_for_sequence``.
    :param res1: passed to ``calculate_kint_for_sequence``.
    :param resn: passed to ``calculate_kint_for_sequence``.
    :return: None
    """

    # Indices covered by at least one assignment. The first index of an
    # assignment is only included when it is smaller than everything
    # collected so far.
    pfactor_filter = set()
    for ass in assignments:
        for x in range(int(ass[1] + 1), int(ass[2]) + 1):
            pfactor_filter.add(x)
        if ass[1] < min(pfactor_filter):
            pfactor_filter.add(ass[1])

    kint, prolines = calculate_kint_for_sequence(res1, resn, seq, temperature,
                                                 pH)

    if not pfact:
        if random_steps:
            rand_output = do_random_search(kint,
                                           random_steps,
                                           pfactor_filter,
                                           dexp,
                                           time_points,
                                           assignments,
                                           harmonic_term,
                                           prolines,
                                           weights,
                                           seed=None)
            # The search result is keyed by score; start from the best one.
            min_score = min(rand_output.keys())
            init_array = rand_output[min_score]
        else:
            init_array = [
                1 if ii not in prolines or ii == 0 or ii + 1 in pfactor_filter
                else -1 for ii in range(max(pfactor_filter))
            ]

    else:
        init_array = read_pfact(pfact)

    # Non-negative starting values are free within (1e-5, 20); a value of
    # -1 is pinned to -1 and any other negative value is pinned to 0.
    bounds = [(0.00001, 20) if x >= 0 else (-1, -1) if x == -1 else (0, 0)
              for x in init_array]

    pfit = fit_pfact(init_array, dexp, time_points, assignments, harmonic_term,
                     kint, bounds, tolerance, weights)

    write_pfact(pfit.x, output_file)

    dpred = calculate_dpred(pfit.x, time_points, kint, assignments)

    write_dpred(output_file, dpred, time_points)
    write_diff(output_file, dpred, dexp)

    final_score = cost_function(pfit.x, dexp, time_points, assignments,
                                harmonic_term, kint, weights)
    print('Final value of cost function w harm term: {}'.format(final_score))
    final_score = cost_function(pfit.x, dexp, time_points, assignments, 0.0,
                                kint, weights)
    print('Final value of cost function w/o harm term: {}'.format(final_score))
Пример #6
0
# Compulsory arguments

# Fall back to the current working directory when no base is given.
config['base'] = opts.base if opts.base else os.getcwd()

if opts.ass:
    config['assignments'] = opts.ass
if opts.temp:
    config['temperature'] = float(opts.temp)
if opts.pH:
    config['pH'] = float(opts.pH)
if opts.pfact:
    config['pfact'] = read_pfact(opts.pfact)
if opts.times:
    config['times'] = read_time_points(opts.times)
if opts.seq:
    config['sequence'] = read_seq(opts.seq)
    config['res1'] = 1
    # Reuse the sequence read above instead of parsing the file again.
    config['resn'] = len(config['sequence'])

# Optional arguments
config['output'] = opts.out if opts.out else None

# NOTE(review): both lookups raise KeyError unless the keys were set above
# or `config` was pre-populated elsewhere -- confirm.
pfact = config['pfact']
assignments = read_assignments(config['assignments'])