def test_process_results(self):
    """Check results are processed and summarized correctly.

    Two queries hit only the interest database ('a'); the other
    database ('b') has no hits, so both are counted as
    'perfect_interest'.
    """
    best_hits = {
        "HABJ36W02EXF44": [{
            "a": {"evalue": 0.0,
                  "subject_id": "NZ_ABEH01000018_641736102",
                  "bit_score": 1005.0, "percentage_id": 99.42,
                  "alg_length": 519},
            "b": {"subject_id": None, "bit_score": -1}}],
        "HABJ36W02DLDSY": [{
            "a": {"evalue": 0.0,
                  "subject_id": "NZ_ABEH01000005_641736102",
                  "bit_score": 959.0, "percentage_id": 99.22,
                  "alg_length": 512},
            "b": {"subject_id": None, "bit_score": -1}}],
    }

    out_results = process_results([0.80], [50], [0.30], [30], best_hits)

    # assertEqual, not assertEquals: the latter is a deprecated alias
    # that was removed in Python 3.12
    self.assertEqual(
        out_results,
        [{
            "db_interest": 0,
            "db_other": 0,
            "db_seqs_counts": {
                "a": {"NZ_ABEH01000005_641736102": 1,
                      "NZ_ABEH01000018_641736102": 1},
                "b": {None: 0},
            },
            "perfect_interest": 2,
            "equal": 0,
            "summary": [
                "#SeqId\tFirst\tSecond",
                "HABJ36W02EXF44\tNZ_ABEH01000018_641736102\t",
                "HABJ36W02DLDSY\tNZ_ABEH01000005_641736102\t",
            ],
            "filename": "p1_0-a1_50_p2_0-a2_30",
        }],
    )
def test_process_results(self):
    """Check results are processed and summarized correctly.

    Covers four cases: hits only in the interest db ('a'), equal
    scores in both dbs, and a better hit in the other db ('b').
    """
    best_hits = {
        'HABJ36W02EXF44': [{
            'a': {'evalue': 0.0,
                  'subject_id': 'NZ_ABEH01000018_641736102',
                  'bit_score': 1005.0, 'percentage_id': 99.42,
                  'alg_length': 519},
            'b': {'subject_id': None, 'bit_score': -1}}],
        'HABJ36W02DLDSY': [{
            'a': {'evalue': 0.0,
                  'subject_id': 'NZ_ABEH01000005_641736102',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512},
            'b': {'subject_id': None, 'bit_score': -1}}],
        'SAME-VALUES': [{
            'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512},
            'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512}}],
        'OTHER-BETTER': [{
            'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                  'bit_score': 10.0, 'percentage_id': 10.0,
                  'alg_length': 10},
            'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                  'bit_score': 959.0, 'percentage_id': 100,
                  'alg_length': 900}}]
    }

    out_results = process_results([0.80], [50], [0.30], [30], best_hits)

    # assertEqual, not assertEquals: the latter is a deprecated alias
    # that was removed in Python 3.12
    self.assertEqual(out_results, [{
        'db_interest': 0,
        'db_other': 1,
        'db_seqs_counts': {
            'a': {'NZ_ABEH01000005_641736102': 1,
                  'RESULT-A': 1,
                  'NZ_ABEH01000018_641736102': 1},
            'b': {None: 0, 'RESULT-B': 2}},
        'perfect_interest': 2,
        'equal': 1,
        'summary': ['#SeqId\tFirst\tSecond',
                    'HABJ36W02EXF44\tNZ_ABEH01000018_641736102\t',
                    'OTHER-BETTER\n\t',
                    'SAME-VALUES\tRESULT-A\tRESULT-B',
                    'HABJ36W02DLDSY\tNZ_ABEH01000005_641736102\t'],
        'filename': 'p1_0-a1_50_p2_0-a2_30'}])
def test_process_results(self):
    """Check results are processed and summarized correctly.

    Exercises the file-writing code path of ``process_results`` (via
    ``self.base``) and a query with no hits at all ('NO-VALS'); the
    file-handle and per-sequence-count entries are popped from the
    result before comparing, since they are not easily assertable.
    """
    best_hits = {
        'HABJ36W02EXF44': [{
            'a': {'evalue': 0.0,
                  'subject_id': 'NZ_ABEH01000018_641736102',
                  'bit_score': 1005.0, 'percentage_id': 99.42,
                  'alg_length': 519},
            'b': {'subject_id': None, 'bit_score': -1}}],
        'HABJ36W02DLDSY': [{
            'a': {'evalue': 0.0,
                  'subject_id': 'NZ_ABEH01000005_641736102',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512},
            'b': {'subject_id': None, 'bit_score': -1}}],
        'SAME-VALUES': [{
            'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512},
            'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                  'bit_score': 959.0, 'percentage_id': 99.22,
                  'alg_length': 512}}],
        'OTHER-BETTER': [{
            'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                  'bit_score': 10.0, 'percentage_id': 10.0,
                  'alg_length': 10},
            'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                  'bit_score': 959.0, 'percentage_id': 100,
                  'alg_length': 900}}],
        'NO-VALS': [None]
    }

    out_results = process_results([0.80], [50], [0.30], [30], best_hits,
                                  self.base, False, False)

    # removing the file pointers so we don't need to test
    out_results[0].pop('summary_fh')
    out_results[0].pop('db_seqs_counts')

    # assertEqual, not assertEquals: the latter is a deprecated alias
    # that was removed in Python 3.12
    self.assertEqual(out_results, [{
        'db_interest': 0,
        'db_other': 1,
        'perfect_interest': 2,
        'equal': 1,
        'filename': 'p1_0-a1_50_p2_0-a2_30'}])
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the interest database search results. If None is passed, it
        defaults to `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults
        to `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the other database search results. If None is passed, it defaults
        to the value of `interest_pcts`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        the value of `interest_alg_lens`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        first database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        second database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """
    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts
    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            # message fixed: it previously read "should be the length :"
            raise BadParameter("The alignment length values for both "
                               "databases should be the same length: %s - %s"
                               % (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases; the original opened these with the 'U' mode
    # (deprecated, removed in Python 3.11) and never closed them, so use
    # plain text mode inside a context manager instead
    with open(interest_fp) as db_a, open(other_fp) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, interest_pcts, interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")
        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            # the header column is built only once, on the first result
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(
                ['interest db (%s)' % basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        # queries not accounted for by any of the other categories
        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):
            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])
        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(
            ['\t'.join(item) for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
def main():
    """Command-line entry point: compare BLAST hits against two databases
    and write summary, per-hit-count, and collated output files."""
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # turn a comma-separated list of numbers into a 1-D list of integers.
    # NOTE: the result must be materialized into a real list -- the
    # original used ``map`` directly, which on Python 3 returns a lazy
    # iterator and makes the ``len()`` validations below raise TypeError
    def list_of_ints(string):
        return [int(value) for value in string.split(",")]

    input_path_interest = opts.input_path_interest
    input_path_other = opts.input_path_other
    percentage_ids = list_of_ints(opts.percentage_ids)
    alignment_lengths = list_of_ints(opts.alignment_lengths)
    hits_to_1st = opts.hits_to_1st
    hits_to_2nd = opts.hits_to_2nd
    percentage_ids_other = opts.percentage_ids_other
    alignment_lengths_other = opts.alignment_lengths_other
    output_dir = opts.output_dir

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if percentage_ids_other:
        percentage_ids_other = list_of_ints(percentage_ids_other)
        if len(percentage_ids) != len(percentage_ids_other):
            option_parser.error(
                "The percentage values for both databases "
                "should be the same length: %s - %s"
                % (percentage_ids, percentage_ids_other)
            )
    else:
        percentage_ids_other = percentage_ids
    if alignment_lengths_other:
        alignment_lengths_other = list_of_ints(alignment_lengths_other)
        if len(alignment_lengths) != len(alignment_lengths_other):
            option_parser.error(
                "The alignment length values for both databases"
                " should be the length : %s - %s"
                % (alignment_lengths, alignment_lengths_other)
            )
    else:
        alignment_lengths_other = alignment_lengths

    # Process databases; the original opened these with the 'U' mode
    # (deprecated, removed in Python 3.11) and never closed them -- use
    # context managers so the handles are released even on error
    with open(input_path_interest) as db_a, open(input_path_other) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, percentage_ids, alignment_lengths)
        parse_second_database(db_b, best_hits, percentage_ids_other,
                              alignment_lengths_other)

    # Parse results
    results = process_results(
        percentage_ids,
        alignment_lengths,
        percentage_ids_other,
        alignment_lengths_other,
        best_hits,
        input_path_interest,
        input_path_other,
    )

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item["filename"] + ".txt")
        with open(filename, "w") as fd:
            fd.write("\n".join(item["summary"]))

        if i == 0:
            # header column built once, on the first result
            combined_results = []
            combined_results.append(["filename"])
            combined_results.append(["interestdb (%s)" % input_path_interest])
            combined_results.append(["other db (%s)" % input_path_other])
            combined_results.append(["only interest"])
            combined_results.append(["both dbs"])
            combined_results.append(["no hits in interest db"])

        # queries not accounted for by any of the other categories
        no_hits = (total_queries - item["db_interest"] - item["db_other"]
                   - item["perfect_interest"] - item["equal"])

        combined_results[0].append(item["filename"])
        combined_results[1].append(str(item["db_interest"]))
        combined_results[2].append(str(item["db_other"]))
        combined_results[3].append(str(item["perfect_interest"]))
        combined_results[4].append(str(item["equal"]))
        combined_results[5].append(str(no_hits))

        # Printing count of hits to the db
        if hits_to_1st:
            s_hits = sorted(item["db_seqs_counts"]["a"].items(),
                            key=itemgetter(1), reverse=True)
            filename = join(output_dir,
                            "hits_to_1st_db_" + item["filename"] + ".txt")
            with open(filename, "w") as fd:
                fd.write("\n".join(["%s\t%d" % (k, v)
                                    for k, v in s_hits if v != 0]))
        if hits_to_2nd:
            s_hits = sorted(item["db_seqs_counts"]["b"].items(),
                            key=itemgetter(1), reverse=True)
            filename = join(output_dir,
                            "hits_to_2nd_db_" + item["filename"] + ".txt")
            with open(filename, "w") as fd:
                # same "label\tcount" format as the first-db report (the
                # original wrote "%s: %d" here, inconsistent with every
                # other hits file this tool produces)
                fd.write("\n".join(["%s\t%d" % (k, v)
                                    for k, v in s_hits if v != 0]))

    # Printing collated results
    with open(join(output_dir, "compile_output.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item) for item in combined_results]))

    with open(join(output_dir, "compile_output_no_nohits.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item)
                            for item in combined_results[:-1]]))
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the interest database search results. If None is passed, it
        defaults to `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults
        to `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the other database search results. If None is passed, it defaults
        to the value of `interest_pcts`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        the value of `interest_alg_lens`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        first database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        second database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """
    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts
    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            # message fixed: it previously read "should be the length :"
            raise BadParameter("The alignment length values for both "
                               "databases should be the same length: %s - %s"
                               % (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases; the original opened these with the 'U' mode
    # (deprecated, removed in Python 3.11) and never closed them, so use
    # plain text mode inside a context manager instead
    with open(interest_fp) as db_a, open(other_fp) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, interest_pcts, interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")
        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            # the header column is built only once, on the first result
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(
                ['interest db (%s)' % basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        # queries not accounted for by any of the other categories
        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):
            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])
        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(
            ['\t'.join(item) for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))