Exemplo n.º 1
0
def _assert_score_matrix_roundtrip(values):
    """Write a 4x4 A/T/C/G score matrix to disk, read it back, and compare.

    ``values`` is a nested list of numbers; it is wrapped in a pandas
    DataFrame whose rows and columns are both labelled A, T, C, G. The
    matrix is written to a file inside a temporary directory with
    ``io.write_score_matrix`` and reloaded with ``io.read_score_matrix``.
    The labels must match exactly and the values must match within the
    default ``assert_allclose`` tolerance.
    """
    labels = ['A', 'T', 'C', 'G']
    score = pd.DataFrame(np.array(values), columns=labels, index=labels)

    with tempfile.TemporaryDirectory() as tempdir:
        scorefile = pathlib.Path(tempdir) / 'TEST_SCORE'

        io.write_score_matrix(scorefile, score)
        res_score = io.read_score_matrix(scorefile)

    # The result is already in memory, so comparing after the temporary
    # directory has been removed is safe.
    assert np.all(score.columns == res_score.columns)
    assert np.all(score.index == res_score.index)
    np.testing.assert_allclose(score.values, res_score.values)


def test_read_write_score_matrix():
    """Score matrices survive a write/read round trip for ints and floats."""

    # Integer scores
    _assert_score_matrix_roundtrip([
        [4, -1, -2, -3],
        [-1, 5, -4, -5],
        [-2, -4, 6, -7],
        [-3, -5, -7, 8],
    ])

    # Floating point scores
    _assert_score_matrix_roundtrip([
        [4, -1.5, -2, -3],
        [-1.5, 5, -4, -5],
        [-2, -4, 6.1, -7],
        [-3, -5, -7, 8],
    ])
Exemplo n.º 2
0
def test_reads_score_matrix(filename, scores):
    """Check that a score matrix file under ``DATADIR`` loads as expected.

    ``filename`` is resolved relative to ``DATADIR``. ``scores`` is an
    iterable of ``(row, column, expected)`` triples used to spot-check
    individual entries of the loaded matrix.
    """
    matrix_path = DATADIR / filename
    assert matrix_path.is_file()

    # Load the matrix; it is indexed by label on both axes below
    matrix = io.read_score_matrix(matrix_path)

    # The matrix must be 24 x 24 and labelled with SCORE_COLUMNS on both axes
    assert matrix.shape == (24, 24)
    assert set(matrix.columns) == SCORE_COLUMNS
    assert set(matrix.index) == SCORE_COLUMNS

    # Spot-check the individual scores supplied by the caller
    for row_label, col_label, expected in scores:
        assert matrix.loc[row_label, col_label] == expected
Exemplo n.º 3
0
def test_smith_waterman(seq1, seq2, scorefile, exp_score, align1, align2):
    """Align ``seq1`` against ``seq2`` and compare with the expected result.

    ``scorefile`` names a score matrix file under ``DATADIR``. The
    alignment is run with a gap opening penalty of -3 and a gap extension
    penalty of -1, and must reproduce ``exp_score`` together with the two
    aligned strings ``align1`` and ``align2``.
    """
    matrix_path = DATADIR / scorefile
    assert matrix_path.is_file()

    result = smith_waterman(seq1,
                            seq2,
                            io.read_score_matrix(matrix_path),
                            gap_opening=-3,
                            gap_extension=-1)
    got_score, got_align1, got_align2 = result

    assert got_score == exp_score
    assert got_align1 == align1
    assert got_align2 == align2
Exemplo n.º 4
0
def _write_alignments(pair_file, out_file, score, gap_opening, gap_extension,
                      processes):
    """Align every pair listed in ``pair_file`` and write them to ``out_file``.

    Each result from ``align_all`` becomes one FASTA-like record: a header
    line ``>name1,name2,score``, the two aligned sequences on their own
    lines, and a blank separator line.
    """
    with out_file.open('wt') as fp:
        for res in align_all(pair_file,
                             score,
                             gap_opening=gap_opening,
                             gap_extension=gap_extension,
                             processes=processes):
            # The first two fields of each result are not needed for output
            _, _, name1, name2, align_score, align1, align2 = res
            fp.write(f'>{name1},{name2},{align_score}\n')
            fp.write(f'{align1}\n')
            fp.write(f'{align2}\n\n')


def main(args=None):
    """Align the positive and negative pair sets with one score matrix.

    Parses the command line arguments (``args`` is forwarded to
    ``parse_args``), loads the score matrix from ``args.score_file`` and
    writes the alignments to ``pospairs_<name>.fa`` and
    ``negpairs_<name>.fa`` under ``ALIGNDIR``, which is created if it
    does not exist yet.
    """
    args = parse_args(args=args)

    print('Aligning with {}'.format(args.score_file))
    score = io.read_score_matrix(args.score_file)

    ALIGNDIR.mkdir(exist_ok=True, parents=True)

    # Force the gap penalties to be negative
    gap_opening = -abs(args.gap_opening)
    gap_extension = -abs(args.gap_extension)

    posalign_file = ALIGNDIR / 'pospairs_{}.fa'.format(args.score_file.name)
    negalign_file = ALIGNDIR / 'negpairs_{}.fa'.format(args.score_file.name)

    # Align the positive pairs
    print('Aligning positive pairs: {}'.format(posalign_file))
    _write_alignments(POSPAIR_FILE, posalign_file, score,
                      gap_opening, gap_extension, args.processes)

    # Align the negative pairs
    print('Aligning negative pairs: {}'.format(negalign_file))
    _write_alignments(NEGPAIR_FILE, negalign_file, score,
                      gap_opening, gap_extension, args.processes)
Exemplo n.º 5
0
def _load_finished_scores(pair_scores):
    """Return the ``(p1, p2, opening, extension)`` keys stored in a score file.

    Anything after a ``#`` on a line is treated as a comment and blank
    lines are skipped. Every remaining line must have exactly five
    comma-separated fields; the third (the score itself) is ignored and
    the last two are parsed as integers.
    """
    have_scores = set()
    with pair_scores.open('rt') as fp:
        for line in fp:
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            p1, p2, _, opening, extension = line.split(',')
            have_scores.add((p1, p2, int(opening), int(extension)))
    return have_scores


def calc_all_scores(pair_file, pair_scores):
    """Score every pair in ``pair_file`` over a grid of gap penalties.

    Results are appended to ``pair_scores`` as
    ``fasta1,fasta2,score,opening,extension`` records. If that file
    already exists it is replayed first, so combinations that were
    already computed are not recomputed; otherwise a header comment line
    is written.

    :param pair_file:
        File listing the sequence pairs, read with ``io.read_pair_file``.
    :param pair_scores:
        Path of the output (and resume) score file.
    """
    score = io.read_score_matrix(SCORE_FILE)

    pair_scores = pathlib.Path(pair_scores)

    if pair_scores.is_file():
        print('Reloading scores: {}'.format(pair_scores))
        # Replay so we don't calculate stuff twice
        have_scores = _load_finished_scores(pair_scores)
        print('Loaded {} scores'.format(len(have_scores)))
        needs_header = False
    else:
        have_scores = set()
        needs_header = True

    with pair_scores.open('at') as fp:
        if needs_header:
            fp.write('#fasta1,fasta2,score,opening,extension\n')

        # The penalty grid is the same for every pair, so build it once.
        # It is materialized as a list because product() can only be
        # iterated a single time.
        penalty_grid = list(product(
            range(GAP_OPENING_MIN, GAP_OPENING_MAX + 1),
            range(GAP_EXTENSION_MIN, GAP_EXTENSION_MAX + 1)))

        for p1, p2 in io.read_pair_file(pair_file):
            _, seq1 = io.read_fasta(DATADIR / p1)
            _, seq2 = io.read_fasta(DATADIR / p2)

            # Try all combinations of scores that are not in the file yet.
            # Penalties are passed on (and looked up) as negative numbers.
            items = [
                ScoreItem(seq1=seq1,
                          seq2=seq2,
                          score=score,
                          gap_opening=-opening,
                          gap_extension=-extension)
                for opening, extension in penalty_grid
                if (p1, p2, -opening, -extension) not in have_scores
            ]
            if not items:
                print('Already finished {}, {}'.format(p1, p2))
                continue

            t0 = time.perf_counter()
            print('Processing {}, {}'.format(p1, p2))
            with multiprocessing.Pool(PROCESSES) as pool:
                for res in pool.imap_unordered(calc_single_score, items):
                    align_score, opening, extension = res
                    print(res)
                    # Probably should be using csv writer here...
                    rec = f'{p1},{p2},{align_score},{opening},{extension}\n'
                    fp.write(rec)
            print('Finished in {} secs'.format(time.perf_counter() - t0))
            # Flush after each pair so the finished work is handed to the OS
            fp.flush()
Exemplo n.º 6
0
def _align_with_progress(pair_file, score, label):
    """Align every pair in ``pair_file`` and return the results as a list.

    Prints a start message, a progress line for every tenth pair and the
    elapsed time. ``label`` ('Positive' or 'Negative') is only used in
    those messages.
    """
    print('Aligning {} examples...'.format(label))
    t0 = time.perf_counter()
    results = []
    for i, item in enumerate(alignment.align_all(pair_file, score)):
        if i % 10 == 0:
            print('* {}'.format(i + 1))
        results.append(item)
    dt = time.perf_counter() - t0
    print('{} finished in {:0.1f} secs\n'.format(label, dt))
    return results


def main(args=None):
    """Iteratively optimize a score matrix and save the best one found.

    Starting from the matrix in ``args.score_file``, each step aligns the
    positive and negative pair sets, evaluates the objective and moves
    the matrix along ``alpha * gradient``. The loop runs for at most
    ``NUM_STEPS + 1`` steps and stops early when the objective reaches
    ``OPT_MAX``, when it has not changed over the last ``OPT_BOREDOM``
    steps, or when ``alpha`` has decayed below ``MIN_ALPHA``.

    The matrix with the highest objective is written to
    ``DATADIR/<name>_OPT`` and the objective trajectory is plotted to
    ``PLOTDIR/opt_trajectory_<name>.png``.

    :param args:
        Command line arguments forwarded to ``parse_args``.
    """

    args = parse_args(args=args)
    score_out_file = DATADIR / '{}_OPT'.format(args.score_file.name)
    score_plot_file = PLOTDIR / 'opt_trajectory_{}.png'.format(args.score_file.name)

    # Initialize the score matrix
    score = io.read_score_matrix(args.score_file)
    opt_history = []
    score_history = []

    alpha = ALPHA

    # Optimization loops
    for n in range(NUM_STEPS + 1):

        print('==== Optimizing step {} of {} ===='.format(n, NUM_STEPS))
        print('Alpha = {:0.5f}'.format(alpha))

        pos_results = _align_with_progress(POSPAIR_FILE, score, 'Positive')
        neg_results = _align_with_progress(NEGPAIR_FILE, score, 'Negative')

        # Get a measure of how good our current step is
        pos_scores = np.array([item.align_score for item in pos_results])
        neg_scores = np.array([item.align_score for item in neg_results])

        # Debug scores
        print('Best Positive Score:  {:0.2f}'.format(np.max(pos_scores)))
        print('Worst Positive Score: {:0.2f}'.format(np.min(pos_scores)))
        print('Confused Positives:   {:d}'.format(np.sum(pos_scores <= np.max(neg_scores))))
        print('')

        print('Best Negative Score:  {:0.2f}'.format(np.max(neg_scores)))
        print('Worst Negative Score: {:0.2f}'.format(np.min(neg_scores)))
        print('Confused Negatives:   {:d}'.format(np.sum(neg_scores >= np.min(pos_scores))))
        print('')

        opt = optimize.score_matrix_objective(pos_scores, neg_scores)

        # Shrink the step size when this step is worse than everything in
        # the recent window
        if len(opt_history) > 3 and opt < min(opt_history[-OPT_WINDOW:]):
            print('Got worse opt, dropping alpha...')
            alpha = alpha * ALPHA_DECAY
            print('Alpha is now: {:0.5f}'.format(alpha))
            print('')

        opt_history.append(opt)
        # Store a snapshot, not a reference: the in-place update further
        # down mutates ``score``, and without the copy every history entry
        # would alias the final matrix, so the "best" matrix written out
        # would always be the last one.
        score_history.append(score.copy())

        if opt >= OPT_MAX:
            print('Got perfect score!')
            break

        if len(opt_history) >= OPT_BOREDOM:
            if all(abs(o - opt_history[-1]) < 1e-5 for o in opt_history[-OPT_BOREDOM:]):
                print('Score hasn\'t changed in {} steps. Exiting!'.format(OPT_BOREDOM))
                break

        print('Step {}: Opt Score {:0.2f}'.format(n, opt))
        print('Worst of last {}: {:0.2f}'.format(OPT_WINDOW, min(opt_history[-OPT_WINDOW:])))
        print('Last 5 Opt: {}'.format(', '.join(
            '{:0.2f}'.format(o) for o in opt_history[-5:])))
        print('')

        # Unpack the positive and negative alignments
        pos_align = [(item.align1, item.align2) for item in pos_results]
        neg_align = [(item.align1, item.align2) for item in neg_results]

        # Calculate the update direction for the score matrix
        print('Calculating update...')

        grad_score = optimize.calc_score_gradient(pos_scores, neg_scores,
                                                  pos_align, neg_align)
        score += alpha * grad_score

        if alpha < MIN_ALPHA:
            print('Reached minimal step size...')
            break
        print('')
        print('Gradient max:  {:0.4f}'.format(np.max(grad_score.values)))
        print('Gradient min:  {:0.4f}'.format(np.min(grad_score.values)))
        print('Gradient mean: {:0.4f}'.format(np.mean(grad_score.values)))
        print('Gradient std:  {:0.4f}'.format(np.std(grad_score.values)))

        print('')

    opt_history = np.array(opt_history)
    opt_steps = np.arange(1, opt_history.shape[0] + 1)

    # Look back through history and find our best matrix
    print('==== Final Results ====')
    best_opt_idx = np.argmax(opt_history)
    print('Best Round: {}'.format(best_opt_idx))
    print('Best Opt:   {}'.format(opt_history[best_opt_idx]))
    print('Best Score: {}'.format(score_history[best_opt_idx]))
    print('')
    print('Last Opt:   {}'.format(opt_history[-1]))
    print('Last Score: {}'.format(score_history[-1]))

    print('Writing best scoring matrix')
    io.write_score_matrix(score_out_file, score_history[best_opt_idx])

    print('Plotting the optimization trajectory')

    fig, ax = plt.subplots(1, 1, figsize=(12, 12))

    ax.plot(opt_steps, opt_history)
    # Dashed reference line at the maximum attainable objective
    ax.plot([1, opt_history.shape[0] + 1], [OPT_MAX, OPT_MAX], '--')

    ax.set_title('Optimization Progress')
    ax.set_xlabel('Number of steps')
    ax.set_ylabel('Objective Score')

    plt.tight_layout()

    fig.savefig(str(score_plot_file))

    # Close this specific figure rather than whichever one is current
    plt.close(fig)