def test_read_write_score_matrix():
    """Write a score matrix to disk, read it back, and compare the two.

    Runs once with integer scores and once with floating point scores.
    """
    bases = ['A', 'T', 'C', 'G']

    # Integer scores
    expected = pd.DataFrame(
        np.array([
            [4, -1, -2, -3],
            [-1, 5, -4, -5],
            [-2, -4, 6, -7],
            [-3, -5, -7, 8],
        ]),
        columns=bases,
        index=bases,
    )
    with tempfile.TemporaryDirectory() as scratch:
        target = pathlib.Path(scratch) / 'TEST_SCORE'
        io.write_score_matrix(target, expected)
        loaded = io.read_score_matrix(target)

    assert np.all(expected.columns == loaded.columns)
    assert np.all(expected.index == loaded.index)
    np.testing.assert_allclose(expected.values, loaded.values)

    # Floating point scores
    expected = pd.DataFrame(
        np.array([
            [4, -1.5, -2, -3],
            [-1.5, 5, -4, -5],
            [-2, -4, 6.1, -7],
            [-3, -5, -7, 8],
        ]),
        columns=bases,
        index=bases,
    )
    with tempfile.TemporaryDirectory() as scratch:
        target = pathlib.Path(scratch) / 'TEST_SCORE'
        io.write_score_matrix(target, expected)

        # Echo the raw file so a failing run shows what was written
        with target.open('rt') as handle:
            for text in handle:
                print(text)
        loaded = io.read_score_matrix(target)

    assert np.all(expected.columns == loaded.columns)
    assert np.all(expected.index == loaded.index)
    np.testing.assert_allclose(expected.values, loaded.values)
def test_reads_score_matrix(filename, scores):
    """Load a score matrix from DATADIR and spot-check its contents.

    ``scores`` is an iterable of ``(row, column, expected)`` triples that are
    looked up in the loaded matrix.
    """
    matrix_path = DATADIR / filename
    assert matrix_path.is_file()

    # Read the score matrix into a pandas dataframe
    table = io.read_score_matrix(matrix_path)

    assert table.shape == (24, 24)
    assert set(table.columns) == SCORE_COLUMNS
    assert set(table.index) == SCORE_COLUMNS

    # Make sure we can pull out some scores
    for row_label, col_label, expected in scores:
        assert table.loc[row_label, col_label] == expected
def test_smith_waterman(seq1, seq2, scorefile, exp_score, align1, align2):
    """Align two sequences and compare the score and both aligned strings.

    The score matrix is loaded from ``scorefile`` under DATADIR and the
    alignment uses a gap opening of -3 and a gap extension of -1.
    """
    matrix_path = DATADIR / scorefile
    assert matrix_path.is_file()

    matrix = io.read_score_matrix(matrix_path)

    outcome = smith_waterman(
        seq1,
        seq2,
        matrix,
        gap_opening=-3,
        gap_extension=-1,
    )
    res_score, res_align1, res_align2 = outcome

    assert res_score == exp_score
    assert res_align1 == align1
    assert res_align2 == align2
def main(args=None):
    """Align the positive and negative pair sets with one score matrix.

    Each set is written to its own file under ALIGNDIR, one record per pair:
    a ``>name1,name2,score`` header, the two aligned sequences, and a blank
    line.
    """
    args = parse_args(args=args)

    print('Aligning with {}'.format(args.score_file))
    score = io.read_score_matrix(args.score_file)

    ALIGNDIR.mkdir(exist_ok=True, parents=True)

    # Force the gap penalties to be negative
    align_kwargs = dict(
        gap_opening=-abs(args.gap_opening),
        gap_extension=-abs(args.gap_extension),
        processes=args.processes,
    )

    matrix_name = args.score_file.name
    posalign_file = ALIGNDIR / 'pospairs_{}.fa'.format(matrix_name)
    negalign_file = ALIGNDIR / 'negpairs_{}.fa'.format(matrix_name)

    # Positive pairs first, then negative pairs
    jobs = (
        ('Aligning positive pairs: {}', POSPAIR_FILE, posalign_file),
        ('Aligning negative pairs: {}', NEGPAIR_FILE, negalign_file),
    )
    for banner, pair_file, out_path in jobs:
        print(banner.format(out_path))
        with out_path.open('wt') as handle:
            aligned = align_all(pair_file, score, **align_kwargs)
            # The first two fields of each result are not written out
            for _, _, name1, name2, align_score, align1, align2 in aligned:
                handle.writelines((
                    f'>{name1},{name2},{align_score}\n',
                    f'{align1}\n',
                    f'{align2}\n\n',
                ))
def calc_all_scores(pair_file, pair_scores):
    """Score every pair in ``pair_file`` over the full grid of gap penalties.

    Each result is appended to ``pair_scores`` as a
    ``fasta1,fasta2,score,opening,extension`` record. When ``pair_scores``
    already exists its records are replayed first, so combinations that an
    earlier run finished are not calculated again.

    Args:
        pair_file: Pair listing handed to ``io.read_pair_file``.
        pair_scores: Path of the results file to create or extend.
    """
    score = io.read_score_matrix(SCORE_FILE)
    pair_scores = pathlib.Path(pair_scores)

    have_scores = set()
    needs_header = True
    if pair_scores.is_file():
        print('Reloading scores: {}'.format(pair_scores))
        # An existing file still needs the header if nothing was written to it
        needs_header = pair_scores.stat().st_size == 0

        # Replay so we don't calculate stuff twice
        with pair_scores.open('rt') as fp:
            for line in fp:
                # Drop comments (which covers the header) and blank lines
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                p1, p2, _, opening, extension = line.split(',')
                have_scores.add((p1, p2, int(opening), int(extension)))
        print('Loaded {} scores'.format(len(have_scores)))

    # The penalty grid is identical for every pair, so build it once.
    # Values are negated here because both ScoreItem and the have_scores
    # lookup below work with the negated penalties.
    penalty_grid = [
        (-opening, -extension)
        for opening, extension in product(
            range(GAP_OPENING_MIN, GAP_OPENING_MAX + 1),
            range(GAP_EXTENSION_MIN, GAP_EXTENSION_MAX + 1),
        )
    ]

    with pair_scores.open('at') as fp:
        if needs_header:
            fp.write('#fasta1,fasta2,score,opening,extension\n')

        for p1, p2 in io.read_pair_file(pair_file):
            # Work out what is left before touching the FASTA files, so
            # finished pairs cost nothing on a resumed run
            pending = [
                penalties for penalties in penalty_grid
                if (p1, p2, *penalties) not in have_scores
            ]
            if not pending:
                print('Already finished {}, {}'.format(p1, p2))
                continue

            _, seq1 = io.read_fasta(DATADIR / p1)
            _, seq2 = io.read_fasta(DATADIR / p2)

            # Try all remaining combinations of scores and write them to a file
            items = [
                ScoreItem(seq1=seq1,
                          seq2=seq2,
                          score=score,
                          gap_opening=opening,
                          gap_extension=extension)
                for opening, extension in pending
            ]

            t0 = time.perf_counter()
            print('Processing {}, {}'.format(p1, p2))
            with multiprocessing.Pool(PROCESSES) as pool:
                for res in pool.imap_unordered(calc_single_score, items):
                    align_score, opening, extension = res
                    print(res)
                    # Probably should be using csv writer here...
                    rec = f'{p1},{p2},{align_score},{opening},{extension}\n'
                    fp.write(rec)
            print('Finished in {} secs'.format(time.perf_counter() - t0))
            # Push this pair's records to disk before starting the next pair
            fp.flush()
def _align_pairs(label, pair_file, score):
    """Align every pair in ``pair_file`` with ``score`` and collect the results.

    Prints a progress marker on every tenth pair and the elapsed time once
    the whole file has been aligned. ``label`` only appears in the messages.
    """
    print('Aligning {} examples...'.format(label))
    t0 = time.perf_counter()
    results = []
    for i, item in enumerate(alignment.align_all(pair_file, score)):
        if i % 10 == 0:
            print('* {}'.format(i + 1))
        results.append(item)
    dt = time.perf_counter() - t0
    print('{} finished in {:0.1f} secs\n'.format(label, dt))
    return results


def main(args=None):
    """Iteratively adjust a score matrix and save the best one found.

    Every step aligns the positive and negative pair sets with the current
    matrix, evaluates ``optimize.score_matrix_objective`` on the alignment
    scores, and then adds ``alpha`` times the gradient returned by
    ``optimize.calc_score_gradient`` to the matrix. ``alpha`` is multiplied
    by ALPHA_DECAY whenever the objective falls below the worst value of the
    last OPT_WINDOW steps.

    The loop ends after NUM_STEPS + 1 steps, or earlier when the objective
    reaches OPT_MAX, has not changed over OPT_BOREDOM steps, or ``alpha``
    drops below MIN_ALPHA. The matrix with the highest objective is written
    next to the data files and the objective trajectory is plotted.
    """
    args = parse_args(args=args)

    score_out_file = DATADIR / '{}_OPT'.format(args.score_file.name)
    score_plot_file = PLOTDIR / 'opt_trajectory_{}.png'.format(args.score_file.name)

    # Initialize the score matrix
    score = io.read_score_matrix(args.score_file)

    opt_history = []
    score_history = []
    alpha = ALPHA

    # Optimization loops
    for n in range(NUM_STEPS + 1):
        print('==== Optimizing step {} of {} ===='.format(n, NUM_STEPS))
        print('Alpha = {:0.5f}'.format(alpha))

        pos_results = _align_pairs('Positive', POSPAIR_FILE, score)
        neg_results = _align_pairs('Negative', NEGPAIR_FILE, score)

        # Get a measure of how good our current step is
        pos_scores = np.array([item.align_score for item in pos_results])
        neg_scores = np.array([item.align_score for item in neg_results])

        # Debug scores
        print('Best Positive Score:  {:0.2f}'.format(np.max(pos_scores)))
        print('Worst Positive Score: {:0.2f}'.format(np.min(pos_scores)))
        print('Confused Positives:   {:d}'.format(np.sum(pos_scores <= np.max(neg_scores))))
        print('')
        print('Best Negative Score:  {:0.2f}'.format(np.max(neg_scores)))
        print('Worst Negative Score: {:0.2f}'.format(np.min(neg_scores)))
        print('Confused Negatives:   {:d}'.format(np.sum(neg_scores >= np.min(pos_scores))))
        print('')

        opt = optimize.score_matrix_objective(pos_scores, neg_scores)

        if len(opt_history) > 3 and opt < min(opt_history[-OPT_WINDOW:]):
            print('Got worse opt, dropping alpha...')
            alpha = alpha * ALPHA_DECAY
            print('Alpha is now: {:0.5f}'.format(alpha))
            print('')

        opt_history.append(opt)
        # Store a snapshot, not a reference: the ``+=`` update below changes
        # the matrix in place, which would otherwise make every history
        # entry point at the final matrix
        score_history.append(score.copy())

        if opt >= OPT_MAX:
            print('Got perfect score!')
            break

        if len(opt_history) >= OPT_BOREDOM:
            if all(abs(o - opt_history[-1]) < 1e-5
                   for o in opt_history[-OPT_BOREDOM:]):
                print('Score hasn\'t changed in {} steps. Exiting!'.format(OPT_BOREDOM))
                break

        print('Step {}: Opt Score {:0.2f}'.format(n, opt))
        print('Worst of last {}: {:0.2f}'.format(OPT_WINDOW, min(opt_history[-OPT_WINDOW:])))
        print('Last 5 Opt: {}'.format(', '.join(
            '{:0.2f}'.format(o) for o in opt_history[-5:])))
        print('')

        # Unpack the positive and negative scores
        pos_align = [(item.align1, item.align2) for item in pos_results]
        neg_align = [(item.align1, item.align2) for item in neg_results]

        # Calculate the empirical distributions
        print('Calculating update...')
        grad_score = optimize.calc_score_gradient(pos_scores, neg_scores, pos_align, neg_align)
        score += alpha * grad_score

        if alpha < MIN_ALPHA:
            print('Reached minimal step size...')
            break

        print('')
        print('Gradient max:  {:0.4f}'.format(np.max(grad_score.values)))
        print('Gradient min:  {:0.4f}'.format(np.min(grad_score.values)))
        print('Gradient mean: {:0.4f}'.format(np.mean(grad_score.values)))
        print('Gradient std:  {:0.4f}'.format(np.std(grad_score.values)))
        print('')

    opt_history = np.array(opt_history)
    opt_steps = np.arange(1, opt_history.shape[0] + 1)

    # Look back through history and find our best matrix
    print('==== Final Results ====')
    best_opt_idx = np.argmax(opt_history)
    print('Best Round: {}'.format(best_opt_idx))
    print('Best Opt:   {}'.format(opt_history[best_opt_idx]))
    print('Best Score: {}'.format(score_history[best_opt_idx]))
    print('')
    print('Last Opt:   {}'.format(opt_history[-1]))
    print('Last Score: {}'.format(score_history[-1]))

    print('Writing best scoring matrix')
    io.write_score_matrix(score_out_file, score_history[best_opt_idx])

    print('Plotting the optimization trajectory')
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    ax.plot(opt_steps, opt_history)
    # Dashed reference line at the maximum attainable objective
    ax.plot([1, opt_history.shape[0] + 1], [OPT_MAX, OPT_MAX], '--')
    ax.set_title('Optimization Progress')
    ax.set_xlabel('Number of steps')
    ax.set_ylabel('Objective Score')

    plt.tight_layout()

    fig.savefig(str(score_plot_file))
    plt.close()