示例#1
0
    def test_2_compare_hits_12(self):
        """compare_hits_erpk_lca(), case 2.5: the read must be rejected.

        A read pre-annotated with one reference hit (function RP-L22) is
        compared against six annotated background hits for the query
        fragment 3..125.  Expected outcome: STATUS_BAD, no functions
        assigned, no taxonomy.
        """
        # test 20 hits, one function, case 2.5
        # NOTE(review): the comments mention 17/20 hits, but only 6 fixture
        # rows are listed below.
        hit = DiamondHit()
        hit.import_hit('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2	kegg|nio:NITINOP_1721	58.5	41	17	311	3	125	271	311	4.2e-05	44.3	RP-L22'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2')
        old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2')
        old_hit_list.add_hit(hit)
        read.hit_list = old_hit_list
#        print ('* test2_10: 17 hits with 1 function, case 2.5*')
        # Background (classification DB) hits for the same query fragment.
        new_hits = ['NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	kegg|nio:NITINOP_1721	58.5	41	17	311	1	123	271	311	1.0e-02	43.9',
                    'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	kegg|mgot:MgSA37_03614	58.5	41	17	128	1	123	88	128	1.4e-02	43.5',
                    'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	kegg|lfc:LFE_0874	58.5	41	17	127	1	123	87	127	1.4e-02	43.5',
                    'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	fig|269799.8.peg.645	53.7	41	19	127	1	123	87	127	1.8e-02	43.1',
                    'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	fig|653733.4.peg.1965	58.5	41	17	126	1	123	86	126	1.8e-02	43.1',
                    'NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125	kegg|gbe:GbCGDNIH1_0546	56.1	41	18	125	1	123	85	125	1.8e-02	43.1'
                     ]
        hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11112:25043:18797#CTCTCT/2|3|125')
        for new_hit in new_hits:
            hit = DiamondHit()
            hit.create_hit(new_hit.split('\t'))
            hit.annotate_hit(self.parser.ref_data)
            hit_list.add_hit(hit)
        compare_hits_erpk_lca(read, 3, 125, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
        # print('Read status:', read.status)
        # print('Read function:', read.functions)
        self.assertEqual(read.status, STATUS_BAD)
        self.assertEqual(len(read.functions), 0)
        self.assertEqual(read.taxonomy, None)
示例#2
0
 def test_2_compare_hits_2(self):
     """compare_hits_erpk_lca(), case 1.2: single matching background hit.

     The read carries one reference hit with function UreA; a single
     background hit confirms it.  Expected outcome: STATUS_GOOD, one
     function UreA scored as get_erpk_score(101, 150, 15), taxonomy '65'.
     """
     # test 2 hits with 1 function, case 1.2
     hit = DiamondHit()
     hit.import_hit('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1	fig|485913.3.peg.8591	87.9	33	4	101	100	2	1	33	1.1e-09	58.5	UreA'.split('\t'))
     read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
     old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
     old_hit_list.add_hit(hit)
     read.hit_list = old_hit_list
     print ('*test_2_compare_hits_2: test 2 hits with 1 function, case 1.2 *')
     new_hits = ['NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2	fig|316274.7.peg.2519	87.9	33	4	101	99	1	1	33	2.1e-07	58.9'
                 ]
     hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2')
     for new_hit in new_hits:
         hit = DiamondHit()
         hit.create_hit(new_hit.split('\t'))
         hit.annotate_hit(self.parser.ref_data)
         hit_list.add_hit(hit)
     # Before comparison the UreA function score is still at its 0.0 default.
     self.assertEqual(read.functions['UreA'], 0.0)
     compare_hits_erpk_lca(read, 100, 2, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
     # print('Read status:', read.status)
     # print('Read function:', read.functions)
     self.assertEqual(read.status, STATUS_GOOD)
     self.assertEqual(len(read.functions), 1)
     self.assertEqual(read.functions['UreA'], get_erpk_score(101, 150, 15))
     self.assertEqual(read.taxonomy, '65')
示例#3
0
    def test_2_compare_hits_5(self):
        """compare_hits_erpk_lca(), case 2.1: two-function reference hit.

        The reference hit is annotated with two functions (UreA|UreB);
        seven background hits are supplied.  Expected outcome:
        STATUS_GOOD, exactly one function (UreA) scored as
        get_erpk_score(231, 150, 15), taxonomy '2'.
        """
        # test 7 hits, two functions, case 2.1
        hit = DiamondHit()
        hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1	fig|1121378.3.peg.2960	76.0	50	12	231	150	1	23	72	9.0e-15	75.5	UreA|UreB'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
        old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
        old_hit_list.add_hit(hit)
        read.hit_list = old_hit_list
        print ('* test_2_compare_hits_5: test 7 hits with 2 functions, case 2.1*')
        new_hits = ['NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1121378.3.peg.2960	76.0	50	12	231	150	1	23	72	3.3e-12	75.5',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|hco:LOKO_03690	72.0	50	14	100	150	1	23	72	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1160705.3.peg.7402	72.0	50	14	236	150	1	159	208	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1156841.3.peg.6425	74.0	50	13	100	150	1	23	72	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|masw:AM586_12165	74.0	50	13	100	150	1	23	72	5.6e-12	74.7',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1203460.3.peg.2591	72.0	50	14	100	150	1	23	72	5.6e-12	74.7',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1121943.4.peg.3735	72.0	50	14	100	150	1	23	72	5.6e-12	74.7',
                     ]
        hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1')
        for new_hit in new_hits:
            hit = DiamondHit()
            hit.create_hit(new_hit.split('\t'))
            hit.annotate_hit(self.parser.ref_data)
            hit_list.add_hit(hit)
        compare_hits_erpk_lca(read, 150, 1, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
#        print('Read status:', read.get_status())
#        print('Read function:', read.get_functions())
        self.assertEqual(read.status, STATUS_GOOD)
        self.assertEqual(len(read.functions), 1)
        self.assertEqual(read.functions['UreA'], get_erpk_score(231, 150, 15))
        self.assertEqual(read.taxonomy, '2')
    def test_3_protein_taxonomy(self):
        """Check LCA taxonomy assignment for one assembled protein.

        Re-parses the reference DIAMOND output for a known gene, builds a
        one-hit background hit list for the same gene fragment, runs
        compare_protein_hits_lca() and expects the gene to end up with
        taxonomy ID '408672'.
        """
        self.project.import_reads_json(sample, ENDS)
        gene_id = 'D16-4706_contig_11213_7'
        print('D16-4706_contig_11213_7 taxonomy')
        print(self.project.samples[sample].reads[end][gene_id].taxonomy)

        # Re-parse the reference search results for this sample/end pair.
        ref_parser = DiamondParser(config=self.project.config,
                                   options=self.project.options,
                                   taxonomy_data=self.project.taxonomy_data,
                                   ref_data=self.project.ref_data,
                                   sample=self.project.samples[sample],
                                   end=end)
        ref_parser.parse_reference_output()
        print(str(ref_parser.reads[gene_id]))

        # Single background hit covering positions 4..257 of the gene.
        bg_hit_line = 'D16-4706_contig_11213_7|4|257	fig|408672.3.peg.2637	63.0	254	94	256	1	254	2	255	1.1e-97	362.1'
        bg_hit = DiamondHit()
        bg_hit.create_hit(tabular_output_fields=bg_hit_line.split('\t'))
        bg_hits = DiamondHitList('D16-4706_contig_11213_7|4|257')
        bg_hits.add_hit(bg_hit)
        bg_hits.annotate_hits(self.project.ref_data)
        bg_hits.filter_list_by_identity(self.project.ref_data)
        print('hit_list')
        print(bg_hits)

        compare_protein_hits_lca(ref_parser.reads[gene_id], 4, 257, bg_hits,
                                 0.03, 1.0, 1.0,
                                 self.project.taxonomy_data,
                                 self.project.ref_data)
        print(ref_parser.reads[gene_id].taxonomy)
        self.assertEqual(ref_parser.reads[gene_id].taxonomy, '408672')
示例#5
0
def decode_assembly(obj):
    """Custom JSON decoder hook for GeneAssembly data.

    Intended for use as the ``object_hook`` of ``json.load``: every decoded
    dictionary passes through this function, and dictionaries carrying one
    of the known marker keys are rebuilt into the corresponding object.

    Args:
        obj (dict): dictionary produced by the JSON parser.

    Returns:
        DiamondHitList, DiamondHit, Contig, Gene or GeneAssembly instance
        when obj carries the matching marker key; otherwise obj unchanged.
    """
    if '__DiamondHitList__' in obj:
        diamond_hit_list = DiamondHitList()
        diamond_hit_list.__dict__.update(obj['__DiamondHitList__'])
        return diamond_hit_list
    elif '__DiamondHit__' in obj:
        diamond_hit = DiamondHit()
        diamond_hit.__dict__.update(obj['__DiamondHit__'])
        return diamond_hit
    elif '__Contig__' in obj:
        contig = Contig()
        contig.__dict__.update(obj['__Contig__'])
        return contig
    elif '__Gene__' in obj:
        gene = Gene()
        gene.__dict__.update(obj['__Gene__'])
        return gene
    elif '__GeneAssembly__' in obj:
        gene_assembly = GeneAssembly()
        gene_assembly.__dict__.update(obj['__GeneAssembly__'])
        return gene_assembly
    return obj
示例#6
0
    def import_hit_list(self):
        """Imports tab-separated table of DIAMOND hits. Use for resuming
        analysis after Fama restart.

        The table is expected to be sorted by read identifier (column 0):
        consecutive rows with the same identifier are collected into one
        DiamondHitList.

        Returns:
            :obj:dict[str, :obj:AnnotatedRead]: reads keyed by identifier,
            each carrying the hit list reconstructed from the table.
        """
        # Single join is enough; the original wrapped it in a redundant
        # second os.path.join call.
        infile_path = os.path.join(self.sample.work_directory,
                                   self.sample.sample_id + '_'
                                   + self.end + '_'
                                   + self.options.ref_hits_list_name)
        ret_val = {}
        hit_list = None
        current_read_id = None
        with open(infile_path, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if current_read_id is None:
                    # First row: start collecting hits for the first read.
                    current_read_id = row[0]
                    hit_list = DiamondHitList(current_read_id)
                elif current_read_id != row[0]:
                    # Read identifier changed: store the finished read and
                    # start a new hit list.
                    ret_val[current_read_id] = AnnotatedRead(current_read_id)
                    ret_val[current_read_id].hit_list = hit_list
                    current_read_id = row[0]
                    hit_list = DiamondHitList(current_read_id)
                hit = DiamondHit()
                hit.import_hit(row)
                hit_list.add_hit(hit)
            # Store the last read; the guard avoids creating a bogus entry
            # for key None when the input file is empty.
            if current_read_id is not None:
                ret_val[current_read_id] = AnnotatedRead(current_read_id)
                ret_val[current_read_id].hit_list = hit_list

        return ret_val
示例#7
0
    def test_2_compare_hits_4(self):
        """compare_hits_erpk_lca(), case 2.1: two-function hit, many homologs.

        The reference hit is annotated with two functions (UreA|UreB);
        eighteen background hits are supplied.  Expected outcome:
        STATUS_GOOD, exactly one function (UreA) scored as
        get_erpk_score(231, 150, 15), taxonomy '2'.
        """
        # test hit with two functions
        hit = DiamondHit()
        hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1	fig|1121378.3.peg.2960	76.0	50	12	231	150	1	23	72	9.0e-15	75.5	UreA|UreB'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
        old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1')
        old_hit_list.add_hit(hit)
        read.hit_list = old_hit_list
        # test 20 hits, one function
        # NOTE(review): comments say 20 hits, but 18 fixture rows are listed.
        print ('*test_2_compare_hits_4: test 20 hits with 2 functions, case 2.1*')
        new_hits = ['NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1121378.3.peg.2960	76.0	50	12	231	150	1	23	72	3.3e-12	75.5',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|hco:LOKO_03690	72.0	50	14	100	150	1	23	72	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1160705.3.peg.7402	72.0	50	14	236	150	1	159	208	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1156841.3.peg.6425	74.0	50	13	100	150	1	23	72	4.3e-12	75.1',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|masw:AM586_12165	74.0	50	13	100	150	1	23	72	5.6e-12	74.7',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1203460.3.peg.2591	72.0	50	14	100	150	1	23	72	5.6e-12	74.7',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1121943.4.peg.3735	72.0	50	14	100	150	1	23	72	5.6e-12	74.7',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|samb:SAM23877_1321	72.0	50	14	100	150	1	23	72	7.3e-12	74.3',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|100226.15.peg.1235	72.0	50	14	100	150	1	23	72	7.3e-12	74.3',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|290398.11.peg.2325	72.0	50	14	100	150	1	23	72	7.3e-12	74.3',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1000565.3.peg.3274	72.0	50	14	100	150	1	23	72	9.6e-12	73.9',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	kegg|hhu:AR456_08480	68.0	50	16	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1126229.3.peg.323	70.0	50	15	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1155714.3.peg.3022	70.0	50	15	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1155716.3.peg.3618	72.0	50	14	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1155718.3.peg.3182	68.0	50	16	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|591167.6.peg.5739	70.0	50	15	100	150	1	23	72	1.3e-11	73.6',
                    'NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1	fig|1054862.3.peg.1097	72.0	50	14	103	150	1	26	75	1.3e-11	73.6'

                    ]
        hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9189:2106#CTCTCT/1|150|1')
        for new_hit in new_hits:
            hit = DiamondHit()
            hit.create_hit(new_hit.split('\t'))
            hit.annotate_hit(self.parser.ref_data)
            hit_list.add_hit(hit)
        compare_hits_erpk_lca(read, 150, 1, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
#        print('268 Read status:', read.get_status())
#        print('269 Read function:', read.get_functions())
        self.assertEqual(read.status, STATUS_GOOD)
        self.assertEqual(len(read.functions), 1)
        self.assertEqual(read.functions['UreA'], get_erpk_score(231, 150, 15))
        self.assertEqual(read.taxonomy, '2')
示例#8
0
    def test_2_compare_hits_7(self):
        """compare_hits_erpk_lca(), case 2.2: one function, close homologs.

        The reference hit carries function UreC; eighteen background hits
        from closely related proteins are supplied.  Expected outcome:
        STATUS_GOOD, one function UreC scored as
        get_erpk_score(570, 150, 15), taxonomy '28211'.
        """
        # test hit with one function and many close homologs, case 2.2
        hit = DiamondHit()
        hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1	fig|363754.4.peg.84	88.0	50	6	570	1	150	270	319	7.6e-22	99.0	UreC'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        old_hit_list.add_hit(hit)
        read.hit_list = old_hit_list
        # test 20 hits, one function

        print ('* test 17 hits with 1 function, case 2.2*')
        new_hits = ['NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266835.9.peg.3902	88.0	50	6	570	1	150	270	319	2.8e-19	99.0',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|363754.4.peg.84	90.0	50	5	590	1	150	290	339	4.7e-19	98.2',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|411684.3.peg.2730	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|176299.10.peg.2410	90.0	50	5	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|maad:AZF01_14085	86.0	50	7	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266834.11.peg.3959	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|six:BSY16_122	88.0	50	6	568	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1144306.3.peg.717	86.0	50	7	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|meso:BSQ44_05880	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|118163.3.peg.2804	90.0	50	5	565	1	150	266	315	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|244592.3.peg.2201	90.0	50	5	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1150469.3.peg.1765	90.0	50	5	583	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|don:BSK21_03675	90.0	50	5	566	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1048680.4.peg.3848	86.0	50	7	569	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|311402.9.peg.5014	86.0	50	7	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|hoe:IMCC20628_02897	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|744979.4.peg.2070	88.0	50	6	570	1	150	270	319	8.1e-19	97.4'
                     ]
        hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
        for new_hit in new_hits:
            hit = DiamondHit()
            hit.create_hit(new_hit.split('\t'))
            hit.annotate_hit(self.parser.ref_data)
            hit_list.add_hit(hit)
        compare_hits_erpk_lca(read, 1, 150, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
#        print('Read status:', read.get_status())
#        print('Read function:', read.get_functions())
        self.assertEqual(read.status, STATUS_GOOD)
        self.assertEqual(len(read.functions), 1)
        self.assertEqual(read.functions['UreC'], get_erpk_score(570, 150, 15))
        self.assertEqual(read.taxonomy, '28211')
示例#9
0
    def test_2_compare_hits_8(self):
        """compare_hits_erpk_lca(), case 2.4: background hits with unknown IDs.

        All seventeen background hits carry fabricated subject identifiers
        ('fake_id1'..'fake_id17').  Expected outcome: the read is rejected —
        STATUS_BAD, no functions, no taxonomy.
        """
        # test hit with one function and many close homologs, case 2.4
        hit = DiamondHit()
        hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1	fig|363754.4.peg.84	88.0	50	6	570	1	150	270	319	7.6e-22	99.0	UreC'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        old_hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        old_hit_list.add_hit(hit)
        read.hit_list = old_hit_list

        # test 20 hits, one function, case 2.4
        print ('* test2_8: 17 hits with 1 function, case 2.4*')
        new_hits = ['NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id1	88.0	50	6	570	1	150	270	319	2.8e-19	99.0',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id2	90.0	50	5	590	1	150	290	339	4.7e-19	98.2',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id3	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id4	90.0	50	5	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id5	86.0	50	7	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id6	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id7	88.0	50	6	568	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id8	86.0	50	7	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id9	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id10	90.0	50	5	565	1	150	266	315	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id11	90.0	50	5	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id12	90.0	50	5	583	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id13	90.0	50	5	566	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id14	86.0	50	7	569	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id15	86.0	50	7	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id16	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fake_id17	88.0	50	6	570	1	150	270	319	8.1e-19	97.4'
                     ]
        hit_list = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
        for new_hit in new_hits:
            hit = DiamondHit()
            hit.create_hit(new_hit.split('\t'))
            hit.annotate_hit(self.parser.ref_data)
            hit_list.add_hit(hit)
        compare_hits_erpk_lca(read, 1, 150, hit_list, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
#        print('Read status:', read.get_status())
#        print('Read function:', read.get_functions())
        self.assertEqual(read.status, STATUS_BAD)
        self.assertEqual(len(read.functions), 0)
        self.assertEqual(read.taxonomy, None)
示例#10
0
def decode_reads(obj):
    """Custom JSON decoder hook for AnnotatedRead data.

    Args:
        obj (dict): dictionary produced by the JSON parser; if it carries
            one of the known marker keys, it is rebuilt into the matching
            object.

    Returns:
        AnnotatedRead, DiamondHitList or DiamondHit instance when a marker
        key is present, otherwise obj unchanged.
    """
    # Lazy factories: a class name is only resolved when its marker matches.
    markers = (('__AnnotatedRead__', lambda: AnnotatedRead()),
               ('__DiamondHitList__', lambda: DiamondHitList()),
               ('__DiamondHit__', lambda: DiamondHit()))
    for marker, factory in markers:
        if marker in obj:
            instance = factory()
            instance.__dict__.update(obj[marker])
            return instance
    return obj
示例#11
0
    def test_2_compare_hits_3(self):
        """compare_hits_erpk_lca(), case 1.3: single unrecognized background hit.

        The only background hit carries a fabricated subject identifier
        ('fake_id'), so the read is expected to be rejected: STATUS_BAD,
        no functions, no taxonomy.
        """
        # Reference hit the read was originally annotated with (function UreA).
        ref_hit = DiamondHit()
        ref_hit.import_hit('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1	fig|485913.3.peg.8591	87.9	33	4	101	100	2	1	33	1.1e-09	58.5	UreA'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
        initial_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1')
        initial_hits.add_hit(ref_hit)
        read.hit_list = initial_hits
        print('* test 1 hit with 1 function, case 1.3  *')
        # Single background row whose subject id is unknown to the reference data.
        background_rows = [
            'NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2	fake_id	87.9	33	4	101	99	1	1	33	2.1e-07	58.9',
        ]
        background_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:10772:2071#CTCTCT/1|100|2')
        for row in background_rows:
            bg_hit = DiamondHit()
            bg_hit.create_hit(row.split('\t'))
            bg_hit.annotate_hit(self.parser.ref_data)
            background_hits.add_hit(bg_hit)
        cutoff = self.parser.config.get_biscore_range_cutoff(self.parser.collection)
        compare_hits_erpk_lca(read, 100, 2, background_hits, cutoff, 15, 150,
                              self.parser.taxonomy_data, self.parser.ref_data)
        self.assertEqual(read.status, STATUS_BAD)
        self.assertEqual(len(read.functions), 0)
        self.assertEqual(read.taxonomy, None)
示例#12
0
def decode_sample(obj):
    """Custom JSON decoder hook for Sample data.

    Args:
        obj (dict): dictionary produced by the JSON parser; if it carries
            one of the known marker keys, it is rebuilt into the matching
            object.

    Returns:
        Sample, AnnotatedRead, DiamondHitList or DiamondHit instance when
        a marker key is present, otherwise obj unchanged.
    """
    # Lazy factories: a class name is only resolved when its marker matches.
    markers = (('__Sample__', lambda: Sample()),
               ('__AnnotatedRead__', lambda: AnnotatedRead()),
               ('__DiamondHitList__', lambda: DiamondHitList()),
               ('__DiamondHit__', lambda: DiamondHit()))
    for marker, factory in markers:
        if marker in obj:
            instance = factory()
            instance.__dict__.update(obj[marker])
            return instance
    return obj
示例#13
0
    def parse_background_output(self):
        """Reads and processes DIAMOND tabular output of the classification
        DIAMOND search.

        Note: this function takes the existing list of hits and compares each
        of them with results of the new similarity search (against the
        classification DB). For the comparison, it calls
        compare_protein_hits_lca (via _classify_query_hits).

        Rows of the tabular output are expected to be grouped by query
        identifier (column 0); hits of each query are collected into one
        DiamondHitList and classified together.
        """
        tsvfile = os.path.join(self.assembly_dir,
                               'all_contigs_' + self.project.options.background_output_name)
        length_cutoff = self.project.config.get_length_cutoff(
            self.project.options.get_collection())
        biscore_range_cutoff = self.project.config.get_biscore_range_cutoff(
            self.project.options.get_collection())
        print('Relative bit-score cutoff: ', biscore_range_cutoff,
              ', Length cutoff: ', length_cutoff)

        average_coverage = self.assembly.calculate_average_coverage()

        current_query_id = None
        hit_list = None
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if current_query_id is None:
                    # First row: start collecting hits for the first query.
                    current_query_id = row[0]
                    hit_list = DiamondHitList(current_query_id)

                hit = DiamondHit()
                hit.create_hit(row)
                # Filter by alignment length; identity filtering is applied
                # later to the whole list.
                if hit.length < length_cutoff:
                    continue  # skip this hit

                if hit.query_id != current_query_id:
                    # All hits of the previous query collected: classify it,
                    # then start a new list for the current query.
                    self._classify_query_hits(current_query_id, hit_list,
                                              biscore_range_cutoff, average_coverage)
                    current_query_id = hit.query_id
                    hit_list = DiamondHitList(current_query_id)
                hit_list.add_hit(hit)
            # Classify the last query; the guard avoids an AttributeError
            # when the input file is empty.
            if current_query_id is not None:
                self._classify_query_hits(current_query_id, hit_list,
                                          biscore_range_cutoff, average_coverage)

    def _classify_query_hits(self, query_id, hit_list, biscore_range_cutoff,
                             average_coverage):
        """Annotate and identity-filter the hits of one query, then compare
        them with the existing reference hit of the corresponding gene.

        The query identifier is parsed as
        '<function>|<contig>_<gene suffix>|<start>|<end>' (see the token
        handling below).
        """
        hit_list.annotate_hits(self.project.ref_data)
        hit_list.filter_list_by_identity(self.project.ref_data)
        query_id_tokens = query_id.split('|')
        function_id = query_id_tokens[0]
        contig_id = '_'.join(query_id_tokens[1].split('_')[:-1])
        gene_id = '|'.join(query_id_tokens[:-2])
        coverage = self.assembly.contigs[function_id][contig_id].get_coverage()
        try:
            compare_protein_hits_lca(
                self.assembly.contigs[function_id][contig_id].genes[gene_id],
                int(query_id_tokens[-2]),  # hit_start
                int(query_id_tokens[-1]),  # hit_end
                hit_list,
                biscore_range_cutoff,
                coverage,
                average_coverage,
                self.project.taxonomy_data,
                self.project.ref_data
                )
        except KeyError:
            print(' '.join(['Gene not found:', gene_id, 'in', function_id, contig_id]))
示例#14
0
    def parse_reference_output(self):
        """Reads and processes DIAMOND tabular output of the preselection
        DIAMOND search.

        Note: this function finds query sequences similar to reference
        proteins. Since a query sequence may have more than one areas of
        similarity (for instance, in fusion proteins of two subunits or
        in multi-domain proteins), it will try to find as many such areas
        as possible.

        DIAMOND hits are filtered by two parameters: length of alignment
        and amino acid identity %, which are defined in program config ini.
        """
        tsvfile = os.path.join(self.assembly_dir,
                               'all_contigs_' + self.project.options.ref_output_name)
        identity_cutoff = self.project.config.get_identity_cutoff(
            self.project.options.get_collection())
        length_cutoff = self.project.config.get_length_cutoff(
            self.project.options.get_collection())
        print('Parse reference output: Identity cutoff: ',
              identity_cutoff,
              ', Length cutoff: ',
              length_cutoff)

        current_id = ''
        hit_list = DiamondHitList(current_id)
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                hit = DiamondHit()
                hit.create_hit(row)
                # Filter by identity and length cutoffs from the config.
                if hit.identity < identity_cutoff:
                    continue  # skip this line
                if hit.length < length_cutoff:
                    continue  # skip this line

                if hit.query_id != current_id:
                    # All hits of the previous gene collected: store them,
                    # then start a new list.  (On the very first row the
                    # empty initial list is dropped by the helper's
                    # hits_number check.)
                    self._store_gene_hit_list(current_id, hit_list)
                    current_id = hit.query_id
                    hit_list = DiamondHitList(current_id)
                hit_list.add_hit(hit)
            # Store hits of the last gene (no-op for an empty list).
            self._store_gene_hit_list(current_id, hit_list)

    def _store_gene_hit_list(self, gene_id, hit_list):
        """Filter overlapping hits of one gene; if any remain, annotate them
        and attach the list to the corresponding gene of the assembly."""
        hit_list.filter_list(self.project.config.get_overlap_cutoff(
            self.project.options.get_collection()))
        if hit_list.hits_number != 0:
            hit_list.annotate_hits(self.project.ref_data)
            function_id, contig_id, _ = parse_gene_id(gene_id)
            self.assembly.contigs[function_id][contig_id].genes[gene_id].hit_list = \
                hit_list
示例#15
0
    def test_2_compare_hits_6(self):
        """Case 2.1: one function, many close homologs in background search.

        The read must remain STATUS_GOOD, keep its single 'UreC' function
        with the expected ERPK score, and get taxonomy ID '2' from the LCA.
        """
        # First-pass (reference DB) hit assigned to the read.
        first_pass_hit = DiamondHit()
        first_pass_hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1	fig|363754.4.peg.84	88.0	50	6	570	1	150	270	319	7.6e-22	99.0	UreC'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        first_pass_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        first_pass_hits.add_hit(first_pass_hit)
        read.hit_list = first_pass_hits

        print ('* test_2_compare_hits_6: test 40 hits with 1 function, case 2.1*')
        # Background-search hits: 40 near-identical homologs of the same gene.
        background_lines = ['NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|363754.4.peg.84	88.0	50	6	570	1	150	270	319	2.8e-19	99.0',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266835.9.peg.3902	90.0	50	5	590	1	150	290	339	4.7e-19	98.2',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|411684.3.peg.2730	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|176299.10.peg.2410	90.0	50	5	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|maad:AZF01_14085	86.0	50	7	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266834.11.peg.3959	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|six:BSY16_122	88.0	50	6	568	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1144306.3.peg.717	86.0	50	7	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|meso:BSQ44_05880	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|118163.3.peg.2804	90.0	50	5	565	1	150	266	315	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|244592.3.peg.2201	90.0	50	5	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1150469.3.peg.1765	90.0	50	5	583	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|don:BSK21_03675	90.0	50	5	566	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1048680.4.peg.3848	86.0	50	7	569	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|311402.9.peg.5014	86.0	50	7	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|hoe:IMCC20628_02897	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|744979.4.peg.2070	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1123256.3.peg.2564	90.0	50	5	570	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|labr:CHH27_19355	90.0	50	5	570	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1156935.5.peg.1846	86.0	50	7	570	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|314231.3.peg.911	90.0	50	5	570	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|216596.11.peg.5042	86.0	50	7	570	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|292564.3.peg.2428	88.0	50	6	574	1	150	270	319	1.1e-18	97.1',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1122218.3.peg.2653	88.0	50	6	570	1	150	270	319	1.4e-18	96.7',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|633131.3.peg.1456	88.0	50	6	586	1	150	287	336	1.4e-18	96.7',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|375451.14.peg.3633	88.0	50	6	569	1	150	270	319	1.4e-18	96.7',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|391593.3.peg.1570	88.0	50	6	569	1	150	270	319	1.4e-18	96.7',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1120961.3.peg.1492	86.0	50	7	570	1	150	270	319	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|756067.3.peg.5652	88.0	50	6	581	1	150	266	315	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|640510.4.peg.718	88.0	50	6	568	1	150	269	318	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|272129.4.peg.175	88.0	50	6	452	1	150	266	315	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|197221.4.peg.4	88.0	50	6	572	1	150	270	319	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|314256.5.peg.1029	88.0	50	6	569	1	150	270	319	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|313624.3.peg.4907	88.0	50	6	568	1	150	270	319	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|402777.3.peg.1444	88.0	50	6	601	1	150	270	319	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|179408.3.peg.7446	88.0	50	6	603	1	150	266	315	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|643473.3.peg.1780	88.0	50	6	564	1	150	266	315	1.8e-18	96.3',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1173263.3.peg.2622	86.0	50	7	565	1	150	266	315	2.4e-18	95.9',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|391038.7.peg.2407	88.0	50	6	568	1	150	269	318	2.4e-18	95.9',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|103690.10.peg.4352	88.0	50	6	568	1	150	270	319	2.4e-18	95.9'
                     ]
        background_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
        for line in background_lines:
            background_hit = DiamondHit()
            background_hit.create_hit(line.split('\t'))
            background_hit.annotate_hit(self.parser.ref_data)
            background_hits.add_hit(background_hit)
        compare_hits_erpk_lca(read, 1, 150, background_hits, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)
        self.assertEqual(read.status, STATUS_GOOD)
        self.assertEqual(len(read.functions), 1)
        self.assertEqual(read.functions['UreC'], get_erpk_score(570, 150, 15))
        self.assertEqual(read.taxonomy, '2')
示例#16
0
    def test_2_compare_hits_11(self):
        """Read with two first-pass hits (UreC and UreA) on different
        intervals: after comparing both intervals with their background
        hit lists, the read must stay STATUS_GOOD with both functions and
        taxonomy ID '28211'.
        """
        # Two first-pass hits on non-overlapping regions of the same read.
        forward_hit = DiamondHit()
        forward_hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1	fig|363754.4.peg.84	88.0	50	6	570	1	150	270	319	7.6e-22	99.0	UreC'.split('\t'))
        reverse_hit = DiamondHit()
        reverse_hit.import_hit('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1	fig|485913.3.peg.8591	87.9	33	4	101	100	2	1	33	1.1e-09	58.5	UreA'.split('\t'))
        read = AnnotatedRead('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        first_pass_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1')
        first_pass_hits.add_hit(forward_hit)
        first_pass_hits.add_hit(reverse_hit)
        read.hit_list = first_pass_hits

        print ('* test read with 2 hits                 *')
        # Background hits for the second (UreA, reverse-strand) interval.
        background_lines = ['NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2	fig|485913.3.peg.8591	87.9	33	4	101	99	1	1	33	2.1e-07	58.9',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2	kegg|cap:CLDAP_03170	81.8	33	6	100	99	1	1	33	1.4e-06	56.2',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2	fig|316274.7.peg.2519	78.8	33	7	100	99	1	1	33	6.8e-06	53.9'
                    ]
        background_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|100|2')
        for line in background_lines:
            background_hit = DiamondHit()
            background_hit.create_hit(line.split('\t'))
            background_hit.annotate_hit(self.parser.ref_data)
            background_hits.add_hit(background_hit)
        compare_hits_erpk_lca(read, 100, 2, background_hits, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)

        # Background hits for the first (UreC, forward-strand) interval.
        background_lines = ['NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266835.9.peg.3902	88.0	50	6	570	1	150	270	319	2.8e-19	99.0',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|363754.4.peg.84	90.0	50	5	590	1	150	290	339	4.7e-19	98.2',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|411684.3.peg.2730	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|176299.10.peg.2410	90.0	50	5	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|maad:AZF01_14085	86.0	50	7	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|266834.11.peg.3959	88.0	50	6	570	1	150	270	319	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|six:BSY16_122	88.0	50	6	568	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1144306.3.peg.717	86.0	50	7	569	1	150	269	318	6.2e-19	97.8',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|meso:BSQ44_05880	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|244592.3.peg.2201	90.0	50	5	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1150469.3.peg.1765	90.0	50	5	583	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|don:BSK21_03675	90.0	50	5	566	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|1048680.4.peg.3848	86.0	50	7	569	1	150	269	318	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|311402.9.peg.5014	86.0	50	7	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	kegg|hoe:IMCC20628_02897	88.0	50	6	570	1	150	270	319	8.1e-19	97.4',
                    'NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150	fig|744979.4.peg.2070	88.0	50	6	570	1	150	270	319	8.1e-19	97.4'
                     ]
        background_hits = DiamondHitList('NS500496_240_HYN75BGXX:1:11101:9460:3085#CTCTCT/1|1|150')
        for line in background_lines:
            background_hit = DiamondHit()
            background_hit.create_hit(line.split('\t'))
            background_hit.annotate_hit(self.parser.ref_data)
            background_hits.add_hit(background_hit)
        compare_hits_erpk_lca(read, 1, 150, background_hits, self.parser.config.get_biscore_range_cutoff(self.parser.collection), 15, 150, self.parser.taxonomy_data, self.parser.ref_data)

        self.assertEqual(read.status, STATUS_GOOD)
        self.assertEqual(len(read.functions), 2)
        self.assertEqual(read.functions['UreA'], get_erpk_score(101, 150, 15))
        self.assertEqual(read.taxonomy, '28211')
def parse_background_output(parser):
    """Reads and processes DIAMOND tabular output of the second DIAMOND
    search.

    Args:
        parser (:obj:DiamondParser): parser object

    Note: this function takes existing list of hits and compares each
    of them with results of other similarity search (against larger DB).
    For the comparison, it calls compare_protein_hits_lca function, which
    in turn updates entries in the 'reads' dictionary.

    Raises:
        KeyError if read identifier not found in the 'reads' dictionary
    """
    tsvfile = os.path.join(
        parser.sample.work_directory, parser.sample.sample_id + '_' +
        parser.end + '_' + parser.options.background_output_name)

    # Average contig coverage serves as the normalization baseline;
    # fall back to 1.0 when no coverage data is available.
    coverage_data = load_coverage_data(parser)
    if coverage_data:
        average_coverage = sum(coverage_data.values()) / len(coverage_data)
    else:
        average_coverage = 1.0

    length_cutoff = parser.config.get_length_cutoff(parser.collection)
    biscore_range_cutoff = parser.config.get_biscore_range_cutoff(
        parser.collection)
    print('Relative bitscore cutoff: ', biscore_range_cutoff,
          ', Length cutoff: ', length_cutoff)

    def process_hit_list(query_id, hit_list):
        # Annotate the collected hits for one query and compare them with
        # the existing first-pass hit of the corresponding protein.
        hit_list.annotate_hits(parser.ref_data)
        hit_list.filter_list_by_identity(parser.ref_data)
        query_id_tokens = query_id.split('|')
        protein_id = '|'.join(query_id_tokens[:-2])
        hit_start = int(query_id_tokens[-2])
        hit_end = int(query_id_tokens[-1])
        # Coverage data can be used only if protein ID contains contig ID
        contig_id = '_'.join(
            protein_id.split(' # ')[0].split('_')[:-1])[1:]
        coverage = 1.0
        if coverage_data is not None and contig_id in coverage_data:
            coverage = coverage_data[contig_id]
        try:
            compare_protein_hits_lca(parser.reads[protein_id],
                                     hit_start, hit_end, hit_list,
                                     biscore_range_cutoff, coverage,
                                     average_coverage,
                                     parser.taxonomy_data,
                                     parser.ref_data)
        except KeyError:
            print('Protein not found: ', protein_id)

    current_query_id = None
    _hit_list = None
    with open(tsvfile, 'r', newline='') as infile:
        tsvin = csv.reader(infile, delimiter='\t')
        for row in tsvin:
            if current_query_id is None:
                current_query_id = row[0]
                _hit_list = DiamondHitList(current_query_id)

            hit = DiamondHit()
            hit.create_hit(row)
            # filtering by length
            if hit.length < length_cutoff:
                continue

            if hit.query_id != current_query_id:
                # new query ID reached: process collected hits, start over
                process_hit_list(current_query_id, _hit_list)
                current_query_id = hit.query_id
                _hit_list = DiamondHitList(current_query_id)
            _hit_list.add_hit(hit)
        # When EOF is reached, process hits of the last query. Guard
        # against an empty input file, where _hit_list was never created
        # (previously this raised AttributeError on None).
        if _hit_list is not None:
            process_hit_list(current_query_id, _hit_list)
示例#18
0
    def parse_background_output(self):
        """Reads and processes DIAMOND tabular output of the second DIAMOND
        search.

        Note: this function takes existing list of hits and compares each
        of them with results of other similarity search (against larger DB).
        For the comparison, it calls compare_hits_erpk_lca function, which
        in turn updates entries in the 'reads' dictionary.

        Raises:
            KeyError if read identifier not found in the 'reads' dictionary

        """
        if not self.reads:
            # Something went wrong and 'reads' dictionary is empty.
            # Let's try to import list of reads from file.
            self.reads = self.import_hit_list()

        tsvfile = os.path.join(
            self.sample.work_directory,
            self.sample.sample_id + '_' + self.end + '_' + self.options.background_output_name
        )

        average_read_length = self.sample.get_avg_read_length(self.end)
        length_cutoff = self.config.get_length_cutoff(self.collection)
        bitscore_range_cutoff = self.config.get_biscore_range_cutoff(self.collection)
        print('Relative bit-score cutoff:', bitscore_range_cutoff,
              ', Length cutoff:', length_cutoff
              )

        def process_hit_list(query_id, hit_list):
            # Assign functions to the collected hits, then compare them with
            # the hit found for the same read in the first similarity search.
            hit_list.annotate_hits(self.ref_data)
            hit_list.filter_list_by_identity(self.ref_data)
            # extract initial read identifier from identifier of the hit
            query_id_tokens = query_id.split('|')
            read_id = '|'.join(query_id_tokens[:-2])
            try:
                compare_hits_erpk_lca(
                    self.reads[read_id],
                    int(query_id_tokens[-2]),  # hit_start
                    int(query_id_tokens[-1]),  # hit_end
                    hit_list, bitscore_range_cutoff, length_cutoff,
                    average_read_length, self.taxonomy_data, self.ref_data
                    )
            except KeyError:
                print('Read not found: ', read_id)

        current_query_id = None
        hit_list = None
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                if current_query_id is None:
                    current_query_id = row[0]
                    hit_list = DiamondHitList(current_query_id)
                hit = DiamondHit()
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # skip this line

                # when new query ID reached, process collected hits,
                # then start over with new query identifier
                if hit.query_id != current_query_id:
                    process_hit_list(current_query_id, hit_list)
                    current_query_id = hit.query_id
                    hit_list = DiamondHitList(current_query_id)
                hit_list.add_hit(hit)
            # When EOF is reached, process hits of the last query. Guard
            # against an empty input file, where hit_list was never created
            # (previously this raised AttributeError on None).
            if hit_list is not None:
                process_hit_list(current_query_id, hit_list)
示例#19
0
    def parse_reference_output(self):
        """Reads and processes DIAMOND tabular output of the first DIAMOND
        search.

        Note: this function finds query sequences similar to reference
        proteins. Since a query sequence may have more than one areas of
        similarity (for instance, in fusion proteins of two subunits or
        in multi-domain proteins), it will try to find as many such areas
        as possible.

        DIAMOND hits are filtered by two parameters: length of alignment
        and amino acid identity %.

        This function does not return anything. Instead, it populates
        'reads' dictionary with AnnotatedRead objects.

        """
        tsvfile = os.path.join(
            self.options.get_project_dir(self.sample.sample_id),
            self.sample.sample_id + '_' + self.end + '_' + self.options.ref_output_name
        )
        length_cutoff = self.config.get_length_cutoff(self.collection)
        print('Length cutoff:', length_cutoff)

        def register_read(read_id, hit_list):
            # Remove overlapping hits, assign functions to the rest, and,
            # if any hits survived filtering, store them as an
            # AnnotatedRead in the 'reads' dictionary. A call with an
            # empty list (e.g. the very first flush) stores nothing.
            hit_list.filter_list(self.config.get_overlap_cutoff(self.collection))
            hit_list.annotate_hits(self.ref_data)
            hit_list.filter_list_by_identity(self.ref_data)
            if hit_list.hits_number != 0:
                read = AnnotatedRead(read_id)
                read.hit_list = hit_list
                self.reads[read_id] = read

        current_sequence_read_id = ''
        hit_list = DiamondHitList(current_sequence_read_id)
        with open(tsvfile, 'r', newline='') as infile:
            tsvin = csv.reader(infile, delimiter='\t')
            for row in tsvin:
                hit = DiamondHit()
                (row[0], _) = parse_fastq_seqid(row[0])
                hit.create_hit(row)
                # filtering by length
                if hit.length < length_cutoff:
                    continue  # go to next hit

                if hit.query_id != current_sequence_read_id:
                    # when new query ID reached, process collected hits,
                    # then start over with new query identifier
                    register_read(current_sequence_read_id, hit_list)
                    current_sequence_read_id = hit.query_id
                    hit_list = DiamondHitList(current_sequence_read_id)
                hit_list.add_hit(hit)
            # when EOF reached, process collected hits
            register_read(current_sequence_read_id, hit_list)