def test_2_compare_hits_2(self):
    # test 2 hits with 1 function, case 1.2
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1 fig|485913.3.peg.8591 87.9 33 4 101 100 2 1 33 1.1e-09 58.5 UreA'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    print('*test_2_compare_hits_2: test 2 hits with 1 function, case 1.2 *')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2 fig|316274.7.peg.2519 87.9 33 4 101 99 1 1 33 2.1e-07 58.9'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    self.assertEqual(read.functions['UreA'], 0.0)
    compare_hits_erpk_lca(
        read, 100, 2, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.status)
    # print('Read function:', read.functions)
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 1)
    self.assertEqual(read.functions['UreA'], get_erpk_score(101, 150, 15))
    self.assertEqual(read.taxonomy, '65')
def import_hit_list(self):
    """Imports a tab-separated table of DIAMOND hits. Use for resuming
    analysis after a Fama restart.

    Returns:
        :obj:`dict` of :obj:`AnnotatedRead` objects keyed by read identifier
    """
    infile_path = os.path.join(
        self.sample.work_directory,
        self.sample.sample_id + '_' + self.end + '_' + self.options.ref_hits_list_name)
    ret_val = {}
    hit_list = None
    current_read_id = None
    with open(infile_path, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            if current_read_id is None:
                # initialize
                current_read_id = row[0]
                hit_list = DiamondHitList(current_read_id)
            elif current_read_id != row[0]:
                # a new read identifier reached: flush hits collected so far
                ret_val[current_read_id] = AnnotatedRead(current_read_id)
                ret_val[current_read_id].hit_list = hit_list
                current_read_id = row[0]
                hit_list = DiamondHitList(current_read_id)
            hit = DiamondHit()
            hit.import_hit(row)
            hit_list.add_hit(hit)
    if current_read_id is not None:
        # flush hits collected for the last read (guard against an empty input file)
        ret_val[current_read_id] = AnnotatedRead(current_read_id)
        ret_val[current_read_id].hit_list = hit_list
    return ret_val
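# A minimal usage sketch for the recovery path above (hypothetical setup: the
# DiamondParser construction mirrors test_3_protein_taxonomy elsewhere in this
# repo; `project`, `sample_obj` and the 'pe1' end label are assumed stand-ins,
# not part of this module):
#
#     parser = DiamondParser(config=project.config, options=project.options,
#                            taxonomy_data=project.taxonomy_data,
#                            ref_data=project.ref_data,
#                            sample=sample_obj, end='pe1')
#     reads = parser.import_hit_list()  # dict: read_id -> AnnotatedRead
#     for read_id, read in reads.items():
#         print(read_id, read.hit_list.hits_number)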
def test_2_compare_hits_12(self):
    # test 6 hits with 1 function, case 2.5
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2 kegg|nio:NITINOP_1721 58.5 41 17 311 3 125 271 311 4.2e-05 44.3 RP-L22'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    # print('* test_2_compare_hits_12: 6 hits with 1 function, case 2.5 *')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 kegg|nio:NITINOP_1721 58.5 41 17 311 1 123 271 311 1.0e-02 43.9',
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 kegg|mgot:MgSA37_03614 58.5 41 17 128 1 123 88 128 1.4e-02 43.5',
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 kegg|lfc:LFE_0874 58.5 41 17 127 1 123 87 127 1.4e-02 43.5',
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 fig|269799.8.peg.645 53.7 41 19 127 1 123 87 127 1.8e-02 43.1',
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 fig|653733.4.peg.1965 58.5 41 17 126 1 123 86 126 1.8e-02 43.1',
        'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125 kegg|gbe:GbCGDNIH1_0546 56.1 41 18 125 1 123 85 125 1.8e-02 43.1'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 3, 125, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.status)
    # print('Read function:', read.functions)
    self.assertEqual(read.status, STATUS_BAD)
    self.assertEqual(len(read.functions), 0)
    self.assertEqual(read.taxonomy, None)
def test_2_compare_hits_5(self):
    # test 7 hits, two functions, case 2.1
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1 fig|1121378.3.peg.2960 76.0 50 12 231 150 1 23 72 9.0e-15 75.5 UreA|UreB'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    print('* test_2_compare_hits_5: test 7 hits with 2 functions, case 2.1*')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1121378.3.peg.2960 76.0 50 12 231 150 1 23 72 3.3e-12 75.5',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|hco:LOKO_03690 72.0 50 14 100 150 1 23 72 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1160705.3.peg.7402 72.0 50 14 236 150 1 159 208 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1156841.3.peg.6425 74.0 50 13 100 150 1 23 72 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|masw:AM586_12165 74.0 50 13 100 150 1 23 72 5.6e-12 74.7',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1203460.3.peg.2591 72.0 50 14 100 150 1 23 72 5.6e-12 74.7',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1121943.4.peg.3735 72.0 50 14 100 150 1 23 72 5.6e-12 74.7',
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 150, 1, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 1)
    self.assertEqual(read.functions['UreA'], get_erpk_score(231, 150, 15))
    self.assertEqual(read.taxonomy, '2')
def parse_reference_output(self):
    """Reads and processes DIAMOND tabular output of the first DIAMOND search.

    Note: this function finds query sequences similar to reference
    proteins. Since a query sequence may have more than one area of
    similarity (for instance, in fusion proteins of two subunits or in
    multi-domain proteins), it tries to find as many such areas as
    possible.

    DIAMOND hits are filtered by two parameters: alignment length
    and amino acid identity %.

    This function does not return anything. Instead, it populates the
    'reads' dictionary with AnnotatedRead objects.
    """
    tsvfile = os.path.join(
        self.options.get_project_dir(self.sample.sample_id),
        self.sample.sample_id + '_' + self.end + '_' + self.options.ref_output_name
    )
    current_sequence_read_id = ''
    hit_list = DiamondHitList(current_sequence_read_id)
    # TODO: cleanup
    identity_cutoff = self.config.get_identity_cutoff(self.collection)
    length_cutoff = self.config.get_length_cutoff(self.collection)
    print('Length cutoff:', length_cutoff)
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            hit = DiamondHit()
            (row[0], _) = parse_fastq_seqid(row[0])
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue  # go to next hit
            if hit.query_id != current_sequence_read_id:
                # when a new query ID is reached, process the collected hits,
                # then start over with the new query identifier
                # filtering: remove overlapping hits
                hit_list.filter_list(self.config.get_overlap_cutoff(self.collection))
                # if any hits are left, assign functions to them and populate
                # the reads dictionary
                hit_list.annotate_hits(self.ref_data)
                hit_list.filter_list_by_identity(self.ref_data)
                if hit_list.hits_number != 0:
                    read = AnnotatedRead(current_sequence_read_id)
                    read.hit_list = hit_list
                    self.reads[current_sequence_read_id] = read
                # start over
                current_sequence_read_id = hit.query_id
                hit_list = DiamondHitList(current_sequence_read_id)
            hit_list.add_hit(hit)
    # when EOF is reached, process the last batch of collected hits
    hit_list.filter_list(self.config.get_overlap_cutoff(self.collection))
    hit_list.annotate_hits(self.ref_data)
    hit_list.filter_list_by_identity(self.ref_data)
    if hit_list.hits_number != 0:
        read = AnnotatedRead(current_sequence_read_id)
        read.hit_list = hit_list
        self.reads[current_sequence_read_id] = read
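# Field layout of a single row consumed by hit.create_hit() above, inferred from
# the test data in this repository (a customized DIAMOND tabular format that
# appears to carry the subject length where standard outfmt 6 has gapopen):
#   qseqid sseqid pident length mismatch slen qstart qend sstart send evalue bitscore
# Columns are tab-separated in the real output file; split() is used below only to
# keep the illustration on one line:
#
#     row = ('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2 '
#            'fig|316274.7.peg.2519 87.9 33 4 101 99 1 1 33 2.1e-07 58.9').split()
#     hit = DiamondHit()
#     hit.create_hit(row)  # hit.identity == 87.9, hit.length == 33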
def test_3_protein_taxonomy(self):
    self.project.import_reads_json(sample, ENDS)
    protein = 'D16-4706_contig_11213_7'
    print('D16-4706_contig_11213_7 taxonomy')
    print(self.project.samples[sample].reads[end][protein].taxonomy)
    parser = DiamondParser(config=self.project.config,
                           options=self.project.options,
                           taxonomy_data=self.project.taxonomy_data,
                           ref_data=self.project.ref_data,
                           sample=self.project.samples[sample],
                           end=end)
    parser.parse_reference_output()
    print(str(parser.reads[protein]))
    # parse_background_output(parser)
    hit_line = 'D16-4706_contig_11213_7|4|257 fig|408672.3.peg.2637 63.0 254 94 256 1 254 2 255 1.1e-97 362.1'
    hit = DiamondHit()
    hit.create_hit(tabular_output_fields=hit_line.split('\t'))
    hit_list = DiamondHitList('D16-4706_contig_11213_7|4|257')
    hit_list.add_hit(hit)
    hit_list.annotate_hits(self.project.ref_data)
    hit_list.filter_list_by_identity(self.project.ref_data)
    print('hit_list')
    print(hit_list)
    compare_protein_hits_lca(parser.reads[protein], 4, 257, hit_list, 0.03, 1.0, 1.0,
                             self.project.taxonomy_data, self.project.ref_data)
    print(parser.reads[protein].taxonomy)
    self.assertEqual(parser.reads[protein].taxonomy, '408672')
def parse_reference_output(self):
    """Reads and processes DIAMOND tabular output of the preselection
    DIAMOND search.

    Note: this function finds query sequences similar to reference
    proteins. Since a query sequence may have more than one area of
    similarity (for instance, in fusion proteins of two subunits or in
    multi-domain proteins), it tries to find as many such areas as
    possible.

    DIAMOND hits are filtered by two parameters defined in the program
    configuration file: alignment length and amino acid identity %.
    """
    tsvfile = os.path.join(self.assembly_dir,
                           'all_contigs_' + self.project.options.ref_output_name)
    current_id = ''
    hit_list = DiamondHitList(current_id)
    identity_cutoff = self.project.config.get_identity_cutoff(
        self.project.options.get_collection())
    length_cutoff = self.project.config.get_length_cutoff(
        self.project.options.get_collection())
    print('Parse reference output: Identity cutoff: ', identity_cutoff,
          ', Length cutoff: ', length_cutoff)
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by identity and length
            if hit.identity < identity_cutoff:
                continue  # skip this line
            if hit.length < length_cutoff:
                continue  # skip this line
            if hit.query_id != current_id:
                # filter the collected list for overlapping hits
                hit_list.filter_list(self.project.config.get_overlap_cutoff(
                    self.project.options.get_collection()))
                if hit_list.hits_number != 0:
                    # annotate hits
                    hit_list.annotate_hits(self.project.ref_data)
                    function_id, contig_id, _ = parse_gene_id(current_id)
                    self.assembly.contigs[function_id][contig_id].\
                        genes[current_id].hit_list = hit_list
                current_id = hit.query_id
                hit_list = DiamondHitList(current_id)
            hit_list.add_hit(hit)
    # when EOF is reached, process the last batch of collected hits
    hit_list.filter_list(
        self.project.config.get_overlap_cutoff(self.project.options.get_collection()))
    if hit_list.hits_number != 0:
        # annotate hits
        hit_list.annotate_hits(self.project.ref_data)
        function_id, contig_id, _ = parse_gene_id(current_id)
        self.assembly.contigs[function_id][contig_id].genes[current_id].hit_list = \
            hit_list
def decode_assembly(obj):
    """Custom JSON decoder for GeneAssembly objects

    Args:
        obj (dict): JSON object to decode
    """
    if '__DiamondHitList__' in obj:
        diamond_hit_list = DiamondHitList()
        diamond_hit_list.__dict__.update(obj['__DiamondHitList__'])
        return diamond_hit_list
    elif '__DiamondHit__' in obj:
        diamond_hit = DiamondHit()
        diamond_hit.__dict__.update(obj['__DiamondHit__'])
        return diamond_hit
    elif '__Contig__' in obj:
        contig = Contig()
        contig.__dict__.update(obj['__Contig__'])
        return contig
    elif '__Gene__' in obj:
        gene = Gene()
        gene.__dict__.update(obj['__Gene__'])
        return gene
    elif '__GeneAssembly__' in obj:
        gene_assembly = GeneAssembly()
        gene_assembly.__dict__.update(obj['__GeneAssembly__'])
        return gene_assembly
    return obj
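# A minimal sketch of how this decoder is meant to be used, as an object_hook for
# json.load/json.loads; the '__ClassName__' wrapper keys are assumed to be written
# by the matching custom encoder elsewhere in this project. The literal below is
# illustrative and only uses DiamondHit attributes seen in this module
# (query_id, identity, length):
#
#     import json
#     json_text = '{"__DiamondHit__": {"query_id": "read_1", "identity": 87.9, "length": 33}}'
#     hit = json.loads(json_text, object_hook=decode_assembly)
#     assert isinstance(hit, DiamondHit) and hit.identity == 87.9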
def test_2_compare_hits_4(self):
    # test hit with two functions
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1 fig|1121378.3.peg.2960 76.0 50 12 231 150 1 23 72 9.0e-15 75.5 UreA|UreB'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    # test 18 hits, two functions
    print('*test_2_compare_hits_4: test 18 hits with 2 functions, case 2.1*')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1121378.3.peg.2960 76.0 50 12 231 150 1 23 72 3.3e-12 75.5',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|hco:LOKO_03690 72.0 50 14 100 150 1 23 72 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1160705.3.peg.7402 72.0 50 14 236 150 1 159 208 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1156841.3.peg.6425 74.0 50 13 100 150 1 23 72 4.3e-12 75.1',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|masw:AM586_12165 74.0 50 13 100 150 1 23 72 5.6e-12 74.7',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1203460.3.peg.2591 72.0 50 14 100 150 1 23 72 5.6e-12 74.7',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1121943.4.peg.3735 72.0 50 14 100 150 1 23 72 5.6e-12 74.7',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|samb:SAM23877_1321 72.0 50 14 100 150 1 23 72 7.3e-12 74.3',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|100226.15.peg.1235 72.0 50 14 100 150 1 23 72 7.3e-12 74.3',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|290398.11.peg.2325 72.0 50 14 100 150 1 23 72 7.3e-12 74.3',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1000565.3.peg.3274 72.0 50 14 100 150 1 23 72 9.6e-12 73.9',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 kegg|hhu:AR456_08480 68.0 50 16 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1126229.3.peg.323 70.0 50 15 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1155714.3.peg.3022 70.0 50 15 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1155716.3.peg.3618 72.0 50 14 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1155718.3.peg.3182 68.0 50 16 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|591167.6.peg.5739 70.0 50 15 100 150 1 23 72 1.3e-11 73.6',
        'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1 fig|1054862.3.peg.1097 72.0 50 14 103 150 1 26 75 1.3e-11 73.6'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 150, 1, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 1)
    self.assertEqual(read.functions['UreA'], get_erpk_score(231, 150, 15))
    self.assertEqual(read.taxonomy, '2')
def test_2_compare_hits_7(self):
    # test hit with one function and many close homologs, case 2.2
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1 fig|363754.4.peg.84 88.0 50 6 570 1 150 270 319 7.6e-22 99.0 UreC'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    # test 17 hits, one function
    print('* test 17 hits with 1 function, case 2.2*')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266835.9.peg.3902 88.0 50 6 570 1 150 270 319 2.8e-19 99.0',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|363754.4.peg.84 90.0 50 5 590 1 150 290 339 4.7e-19 98.2',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|411684.3.peg.2730 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|176299.10.peg.2410 90.0 50 5 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|maad:AZF01_14085 86.0 50 7 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266834.11.peg.3959 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|six:BSY16_122 88.0 50 6 568 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1144306.3.peg.717 86.0 50 7 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|meso:BSQ44_05880 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|118163.3.peg.2804 90.0 50 5 565 1 150 266 315 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|244592.3.peg.2201 90.0 50 5 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1150469.3.peg.1765 90.0 50 5 583 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|don:BSK21_03675 90.0 50 5 566 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1048680.4.peg.3848 86.0 50 7 569 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|311402.9.peg.5014 86.0 50 7 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|hoe:IMCC20628_02897 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|744979.4.peg.2070 88.0 50 6 570 1 150 270 319 8.1e-19 97.4'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 1, 150, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 1)
    self.assertEqual(read.functions['UreC'], get_erpk_score(570, 150, 15))
    self.assertEqual(read.taxonomy, '28211')
def test_2_compare_hits_8(self):
    # test hit with one function and many close homologs, case 2.4
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1 fig|363754.4.peg.84 88.0 50 6 570 1 150 270 319 7.6e-22 99.0 UreC'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    # test 17 hits, one function, case 2.4
    print('* test2_8: 17 hits with 1 function, case 2.4*')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id1 88.0 50 6 570 1 150 270 319 2.8e-19 99.0',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id2 90.0 50 5 590 1 150 290 339 4.7e-19 98.2',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id3 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id4 90.0 50 5 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id5 86.0 50 7 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id6 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id7 88.0 50 6 568 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id8 86.0 50 7 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id9 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id10 90.0 50 5 565 1 150 266 315 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id11 90.0 50 5 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id12 90.0 50 5 583 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id13 90.0 50 5 566 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id14 86.0 50 7 569 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id15 86.0 50 7 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id16 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fake_id17 88.0 50 6 570 1 150 270 319 8.1e-19 97.4'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 1, 150, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_BAD)
    self.assertEqual(len(read.functions), 0)
    self.assertEqual(read.taxonomy, None)
def test_2_compare_hits_3(self):
    # test 1 hit with 1 function, case 1.3
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1 fig|485913.3.peg.8591 87.9 33 4 101 100 2 1 33 1.1e-09 58.5 UreA'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    print('* test 1 hit with 1 function, case 1.3 *')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2 fake_id 87.9 33 4 101 99 1 1 33 2.1e-07 58.9'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 100, 2, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_BAD)
    self.assertEqual(len(read.functions), 0)
    self.assertEqual(read.taxonomy, None)
def decode_reads(obj):
    """Custom JSON decoder for AnnotatedRead objects

    Args:
        obj (dict): JSON object to decode
    """
    if '__AnnotatedRead__' in obj:
        annotated_read = AnnotatedRead()
        annotated_read.__dict__.update(obj['__AnnotatedRead__'])
        return annotated_read
    elif '__DiamondHitList__' in obj:
        diamond_hit_list = DiamondHitList()
        diamond_hit_list.__dict__.update(obj['__DiamondHitList__'])
        return diamond_hit_list
    elif '__DiamondHit__' in obj:
        diamond_hit = DiamondHit()
        diamond_hit.__dict__.update(obj['__DiamondHit__'])
        return diamond_hit
    return obj
def decode_sample(obj):
    """Custom JSON decoder for Sample objects

    Args:
        obj (dict): JSON object to decode
    """
    if '__Sample__' in obj:
        sample = Sample()
        sample.__dict__.update(obj['__Sample__'])
        return sample
    elif '__AnnotatedRead__' in obj:
        annotated_read = AnnotatedRead()
        annotated_read.__dict__.update(obj['__AnnotatedRead__'])
        return annotated_read
    elif '__DiamondHitList__' in obj:
        diamond_hit_list = DiamondHitList()
        diamond_hit_list.__dict__.update(obj['__DiamondHitList__'])
        return diamond_hit_list
    elif '__DiamondHit__' in obj:
        diamond_hit = DiamondHit()
        diamond_hit.__dict__.update(obj['__DiamondHit__'])
        return diamond_hit
    return obj
def parse_background_output(self):
    """Reads and processes DIAMOND tabular output of the classification
    DIAMOND search.

    Note: this function takes the existing list of hits and compares each
    of them with the results of the new similarity search (against the
    classification DB). For the comparison, it calls the
    compare_protein_hits_lca function.
    """
    tsvfile = os.path.join(self.assembly_dir,
                           'all_contigs_' + self.project.options.background_output_name)
    current_query_id = None
    hit_list = None
    length_cutoff = self.project.config.get_length_cutoff(
        self.project.options.get_collection())
    biscore_range_cutoff = self.project.config.get_biscore_range_cutoff(
        self.project.options.get_collection())
    print('Relative bit-score cutoff: ', biscore_range_cutoff,
          ', Length cutoff: ', length_cutoff)
    average_coverage = self.assembly.calculate_average_coverage()
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        function_id = ''
        contig_id = ''
        gene_id = ''
        coverage = ''
        for row in tsvin:
            if current_query_id is None:
                current_query_id = row[0]
                hit_list = DiamondHitList(current_query_id)
            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue  # skip this hit
            if hit.query_id != current_query_id:
                hit_list.annotate_hits(self.project.ref_data)
                hit_list.filter_list_by_identity(self.project.ref_data)
                # compare the list of hits from the search in the background DB
                # with the existing hit from the search in the reference DB
                current_query_id_tokens = current_query_id.split('|')
                function_id = current_query_id_tokens[0]
                contig_id = '_'.join(current_query_id_tokens[1].split('_')[:-1])
                gene_id = '|'.join(current_query_id_tokens[:-2])
                coverage = self.assembly.contigs[function_id][contig_id].get_coverage()
                try:
                    compare_protein_hits_lca(
                        self.assembly.contigs[function_id][contig_id].genes[gene_id],
                        int(current_query_id_tokens[-2]),  # hit_start
                        int(current_query_id_tokens[-1]),  # hit_end
                        hit_list,
                        biscore_range_cutoff,
                        coverage,
                        average_coverage,
                        self.project.taxonomy_data,
                        self.project.ref_data
                    )
                except KeyError:
                    print(' '.join(['Gene not found:', gene_id, 'in',
                                    function_id, contig_id]))
                current_query_id = hit.query_id
                hit_list = DiamondHitList(current_query_id)
            hit_list.add_hit(hit)
    # when EOF is reached, process the last batch of collected hits
    hit_list.annotate_hits(self.project.ref_data)
    hit_list.filter_list_by_identity(self.project.ref_data)
    current_query_id_tokens = current_query_id.split('|')
    function_id = current_query_id_tokens[0]
    contig_id = '_'.join(current_query_id_tokens[1].split('_')[:-1])
    gene_id = '|'.join(current_query_id_tokens[:-2])
    coverage = self.assembly.contigs[function_id][contig_id].get_coverage()
    try:
        compare_protein_hits_lca(
            self.assembly.contigs[function_id][contig_id].genes[gene_id],
            int(current_query_id_tokens[-2]),  # hit_start
            int(current_query_id_tokens[-1]),  # hit_end
            hit_list,
            biscore_range_cutoff,
            coverage,
            average_coverage,
            self.project.taxonomy_data,
            self.project.ref_data
        )
    except KeyError:
        print(' '.join(['Gene not found:', gene_id, 'in', function_id, contig_id]))
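# Worked example of the query-ID decomposition above. Assembly gene identifiers
# appear to follow '<function_id>|<contig_id>_<gene_index>|<hit_start>|<hit_end>'
# (inferred from the token slicing; the values below are illustrative):
#
#     current_query_id = 'UreC|contig_1042_3|4|257'
#     tokens = current_query_id.split('|')
#     function_id = tokens[0]                          # 'UreC'
#     contig_id = '_'.join(tokens[1].split('_')[:-1])  # 'contig_1042'
#     gene_id = '|'.join(tokens[:-2])                  # 'UreC|contig_1042_3'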
def test_2_compare_hits_6(self):
    # test hit with one function and many close homologs, case 2.1
    hit = DiamondHit()
    hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1 fig|363754.4.peg.84 88.0 50 6 570 1 150 270 319 7.6e-22 99.0 UreC'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list.add_hit(hit)
    read.hit_list = old_hit_list
    # test 40 hits, one function
    print('* test_2_compare_hits_6: test 40 hits with 1 function, case 2.1*')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|363754.4.peg.84 88.0 50 6 570 1 150 270 319 2.8e-19 99.0',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266835.9.peg.3902 90.0 50 5 590 1 150 290 339 4.7e-19 98.2',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|411684.3.peg.2730 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|176299.10.peg.2410 90.0 50 5 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|maad:AZF01_14085 86.0 50 7 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266834.11.peg.3959 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|six:BSY16_122 88.0 50 6 568 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1144306.3.peg.717 86.0 50 7 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|meso:BSQ44_05880 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|118163.3.peg.2804 90.0 50 5 565 1 150 266 315 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|244592.3.peg.2201 90.0 50 5 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1150469.3.peg.1765 90.0 50 5 583 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|don:BSK21_03675 90.0 50 5 566 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1048680.4.peg.3848 86.0 50 7 569 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|311402.9.peg.5014 86.0 50 7 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|hoe:IMCC20628_02897 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|744979.4.peg.2070 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1123256.3.peg.2564 90.0 50 5 570 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|labr:CHH27_19355 90.0 50 5 570 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1156935.5.peg.1846 86.0 50 7 570 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|314231.3.peg.911 90.0 50 5 570 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|216596.11.peg.5042 86.0 50 7 570 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|292564.3.peg.2428 88.0 50 6 574 1 150 270 319 1.1e-18 97.1',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1122218.3.peg.2653 88.0 50 6 570 1 150 270 319 1.4e-18 96.7',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|633131.3.peg.1456 88.0 50 6 586 1 150 287 336 1.4e-18 96.7',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|375451.14.peg.3633 88.0 50 6 569 1 150 270 319 1.4e-18 96.7',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|391593.3.peg.1570 88.0 50 6 569 1 150 270 319 1.4e-18 96.7',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1120961.3.peg.1492 86.0 50 7 570 1 150 270 319 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|756067.3.peg.5652 88.0 50 6 581 1 150 266 315 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|640510.4.peg.718 88.0 50 6 568 1 150 269 318 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|272129.4.peg.175 88.0 50 6 452 1 150 266 315 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|197221.4.peg.4 88.0 50 6 572 1 150 270 319 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|314256.5.peg.1029 88.0 50 6 569 1 150 270 319 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|313624.3.peg.4907 88.0 50 6 568 1 150 270 319 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|402777.3.peg.1444 88.0 50 6 601 1 150 270 319 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|179408.3.peg.7446 88.0 50 6 603 1 150 266 315 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|643473.3.peg.1780 88.0 50 6 564 1 150 266 315 1.8e-18 96.3',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1173263.3.peg.2622 86.0 50 7 565 1 150 266 315 2.4e-18 95.9',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|391038.7.peg.2407 88.0 50 6 568 1 150 269 318 2.4e-18 95.9',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|103690.10.peg.4352 88.0 50 6 568 1 150 270 319 2.4e-18 95.9'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 1, 150, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.get_status())
    # print('Read function:', read.get_functions())
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 1)
    self.assertEqual(read.functions['UreC'], get_erpk_score(570, 150, 15))
    self.assertEqual(read.taxonomy, '2')
def test_6_annotate_hit(self):
    print('Test hit annotation')
    hit_line = 'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1 fig|485913.3.peg.8591 87.9 33 4 101 100 2 1 33 1.1e-09 58.5 UreA'
    hit = DiamondHit()
    hit.import_hit(hit_line.split('\t'))
    hit.annotate_hit(self.parser.ref_data)
    self.assertEqual(len(hit.functions), 1)
    self.assertEqual(hit.functions[0], 'UreA')
    hit_line = 'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2 kegg|cap:CLDAP_03170 81.8 33 6 100 99 1 1 33 1.4e-06 56.2 UreA'
    hit = DiamondHit()
    hit.import_hit(hit_line.split('\t'))
    hit.annotate_hit(self.parser.ref_data)
    self.assertEqual(len(hit.functions), 1)
    self.assertEqual(hit.functions[0], 'UreA')
def test_2_compare_hits_11(self):
    # read with two hits
    hit1 = DiamondHit()
    hit1.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1 fig|363754.4.peg.84 88.0 50 6 570 1 150 270 319 7.6e-22 99.0 UreC'.split('\t'))
    hit2 = DiamondHit()
    hit2.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1 fig|485913.3.peg.8591 87.9 33 4 101 100 2 1 33 1.1e-09 58.5 UreA'.split('\t'))
    read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
    old_hit_list.add_hit(hit1)
    old_hit_list.add_hit(hit2)
    read.hit_list = old_hit_list
    print('* test read with 2 hits *')
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2 fig|485913.3.peg.8591 87.9 33 4 101 99 1 1 33 2.1e-07 58.9',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2 kegg|cap:CLDAP_03170 81.8 33 6 100 99 1 1 33 1.4e-06 56.2',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2 fig|316274.7.peg.2519 78.8 33 7 100 99 1 1 33 6.8e-06 53.9'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 100, 2, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    new_hits = [
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266835.9.peg.3902 88.0 50 6 570 1 150 270 319 2.8e-19 99.0',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|363754.4.peg.84 90.0 50 5 590 1 150 290 339 4.7e-19 98.2',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|411684.3.peg.2730 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|176299.10.peg.2410 90.0 50 5 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|maad:AZF01_14085 86.0 50 7 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|266834.11.peg.3959 88.0 50 6 570 1 150 270 319 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|six:BSY16_122 88.0 50 6 568 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1144306.3.peg.717 86.0 50 7 569 1 150 269 318 6.2e-19 97.8',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|meso:BSQ44_05880 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|244592.3.peg.2201 90.0 50 5 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1150469.3.peg.1765 90.0 50 5 583 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|don:BSK21_03675 90.0 50 5 566 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|1048680.4.peg.3848 86.0 50 7 569 1 150 269 318 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|311402.9.peg.5014 86.0 50 7 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 kegg|hoe:IMCC20628_02897 88.0 50 6 570 1 150 270 319 8.1e-19 97.4',
        'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150 fig|744979.4.peg.2070 88.0 50 6 570 1 150 270 319 8.1e-19 97.4'
        ]
    hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
    for new_hit in new_hits:
        hit = DiamondHit()
        hit.create_hit(new_hit.split('\t'))
        hit.annotate_hit(self.parser.ref_data)
        hit_list.add_hit(hit)
    compare_hits_erpk_lca(
        read, 1, 150, hit_list,
        self.parser.config.get_biscore_range_cutoff(self.parser.collection),
        15, 150, self.parser.taxonomy_data, self.parser.ref_data)
    # print('Read status:', read.status)
    # print('Read function:', read.functions)
    # print('Read hits:', read.show_hits())
    self.assertEqual(read.status, STATUS_GOOD)
    self.assertEqual(len(read.functions), 2)
    self.assertEqual(read.functions['UreA'], get_erpk_score(101, 150, 15))
    self.assertEqual(read.taxonomy, '28211')
def parse_background_output(self):
    """Reads and processes DIAMOND tabular output of the second DIAMOND search.

    Note: this function takes the existing list of hits and compares each
    of them with the results of the other similarity search (against the
    larger DB). For the comparison, it calls the compare_hits_erpk_lca
    function, which in turn updates entries in the 'reads' dictionary.

    Raises:
        KeyError if a read identifier is not found in the 'reads' dictionary
    """
    if not self.reads:
        # Something went wrong and the 'reads' dictionary is empty.
        # Let's try to import the list of reads from file.
        self.reads = self.import_hit_list()
    tsvfile = os.path.join(
        self.sample.work_directory,
        self.sample.sample_id + '_' + self.end + '_' + self.options.background_output_name
    )
    average_read_length = self.sample.get_avg_read_length(self.end)
    current_query_id = None
    hit_list = None
    length_cutoff = self.config.get_length_cutoff(self.collection)
    bitscore_range_cutoff = self.config.get_biscore_range_cutoff(self.collection)
    print('Relative bit-score cutoff:', bitscore_range_cutoff,
          ', Length cutoff:', length_cutoff)
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            if current_query_id is None:
                current_query_id = row[0]
                hit_list = DiamondHitList(current_query_id)
            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue  # skip this line
            # when a new query ID is reached, process the collected hits,
            # then start over with the new query identifier
            if hit.query_id != current_query_id:
                # assign functions to selected hits
                hit_list.annotate_hits(self.ref_data)
                hit_list.filter_list_by_identity(self.ref_data)
                # extract the initial read identifier from the identifier of the hit
                current_query_id_tokens = current_query_id.split('|')
                read_id = '|'.join(current_query_id_tokens[:-2])
                # compare the list of hits from the search in the background DB
                # with the existing hit from the first similarity search
                try:
                    compare_hits_erpk_lca(
                        self.reads[read_id],
                        int(current_query_id_tokens[-2]),  # hit_start
                        int(current_query_id_tokens[-1]),  # hit_end
                        hit_list,
                        bitscore_range_cutoff,
                        length_cutoff,
                        average_read_length,
                        self.taxonomy_data,
                        self.ref_data
                    )
                except KeyError:
                    print('Read not found: ', read_id)
                # starting over
                current_query_id = hit.query_id
                hit_list = DiamondHitList(current_query_id)
            hit_list.add_hit(hit)
    # when EOF is reached, process the last batch of collected hits
    # assign functions to selected hits
    hit_list.annotate_hits(self.ref_data)
    hit_list.filter_list_by_identity(self.ref_data)
    # extract the initial read identifier from the identifier of the hit
    current_query_id_tokens = current_query_id.split('|')
    read_id = '|'.join(current_query_id_tokens[:-2])
    # compare the list of hits from the search in the background DB with
    # the existing hit from the first similarity search
    try:
        compare_hits_erpk_lca(
            self.reads[read_id],
            int(current_query_id_tokens[-2]),  # hit_start
            int(current_query_id_tokens[-1]),  # hit_end
            hit_list,
            bitscore_range_cutoff,
            length_cutoff,
            average_read_length,
            self.taxonomy_data,
            self.ref_data
        )
    except KeyError:
        print('Read not found: ', read_id)
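# Worked example of the read-identifier recovery above. Query IDs in the
# background search take the form '<read_id>|<hit_start>|<hit_end>' (see the test
# data in this repo); joining all but the last two '|'-separated tokens restores
# the read ID even if the read ID itself were to contain '|' characters:
#
#     current_query_id = 'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2'
#     tokens = current_query_id.split('|')
#     read_id = '|'.join(tokens[:-2])   # 'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1'
#     hit_start, hit_end = int(tokens[-2]), int(tokens[-1])  # 100, 2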
def parse_background_output(parser):
    """Reads and processes DIAMOND tabular output of the second DIAMOND search.

    Args:
        parser (:obj:DiamondParser): parser object

    Note: this function takes the existing list of hits and compares each
    of them with the results of the other similarity search (against the
    larger DB). For the comparison, it calls the compare_protein_hits_lca
    function, which in turn updates entries in the 'reads' dictionary.

    Raises:
        KeyError if a protein identifier is not found in the 'reads' dictionary
    """
    tsvfile = os.path.join(
        parser.sample.work_directory,
        parser.sample.sample_id + '_' + parser.end + '_' + parser.options.background_output_name)
    coverage_data = load_coverage_data(parser)
    total_coverage = 0.0
    if coverage_data:
        for contig_id in coverage_data.keys():
            total_coverage += coverage_data[contig_id]
        average_coverage = total_coverage / len(coverage_data)
    else:
        average_coverage = 1.0
    current_query_id = None
    _hit_list = None
    length_cutoff = parser.config.get_length_cutoff(parser.collection)
    biscore_range_cutoff = parser.config.get_biscore_range_cutoff(parser.collection)
    print('Relative bit-score cutoff: ', biscore_range_cutoff,
          ', Length cutoff: ', length_cutoff)
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            if current_query_id is None:
                current_query_id = row[0]
                _hit_list = DiamondHitList(current_query_id)
            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue
            if hit.query_id != current_query_id:
                _hit_list.annotate_hits(parser.ref_data)
                _hit_list.filter_list_by_identity(parser.ref_data)
                current_query_id_tokens = current_query_id.split('|')
                protein_id = '|'.join(current_query_id_tokens[:-2])
                hit_start = int(current_query_id_tokens[-2])
                hit_end = int(current_query_id_tokens[-1])
                # Coverage data can be used only if the protein ID contains a contig ID
                contig_id = '_'.join(
                    protein_id.split(' # ')[0].split('_')[:-1])[1:]
                coverage = 1.0
                if coverage_data is not None and contig_id in coverage_data:
                    coverage = coverage_data[contig_id]
                try:
                    compare_protein_hits_lca(parser.reads[protein_id], hit_start,
                                             hit_end, _hit_list,
                                             biscore_range_cutoff,
                                             coverage, average_coverage,
                                             parser.taxonomy_data,
                                             parser.ref_data)
                except KeyError:
                    print('Protein not found: ', protein_id)
                current_query_id = hit.query_id
                _hit_list = DiamondHitList(current_query_id)
            _hit_list.add_hit(hit)
    # when EOF is reached, process the last batch of collected hits
    _hit_list.annotate_hits(parser.ref_data)
    _hit_list.filter_list_by_identity(parser.ref_data)
    current_query_id_tokens = current_query_id.split('|')
    protein_id = '|'.join(current_query_id_tokens[:-2])
    hit_start = int(current_query_id_tokens[-2])
    hit_end = int(current_query_id_tokens[-1])
    try:
        # Coverage data can be used only if the protein ID contains a contig ID
        contig_id = '_'.join(
            protein_id.split(' # ')[0].split('_')[:-1])[1:]
        coverage = 1.0
        if coverage_data is not None and contig_id in coverage_data:
            coverage = coverage_data[contig_id]
        compare_protein_hits_lca(parser.reads[protein_id], hit_start,
                                 hit_end, _hit_list,
                                 biscore_range_cutoff,
                                 coverage, average_coverage,
                                 parser.taxonomy_data,
                                 parser.ref_data)
    except KeyError:
        print('Protein not found: ', protein_id)
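# Worked example of the contig-ID recovery above, assuming Prodigal-style protein
# IDs of the form '><contig_id>_<gene_index> # <start> # <end> # ...' (the format
# is inferred from the slicing expression, which drops the trailing gene index and
# strips a leading character with [1:]; the header below is illustrative, not
# taken from real data):
#
#     protein_id = '>NODE_1_length_590_cov_2.5_1 # 1 # 459 # 1'
#     contig_id = '_'.join(protein_id.split(' # ')[0].split('_')[:-1])[1:]
#     assert contig_id == 'NODE_1_length_590_cov_2.5'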