def setUp(self):
    """Build a Project and load every sample it lists into project.samples."""
    self.project = Project(config_file=config_path, project_file=project_path)
    for sid in self.project.list_samples():
        loaded = Sample(sample_id=sid)
        loaded.load_sample(self.project.options)
        self.project.samples[sid] = loaded
示例#2
0
 def setUp(self):
     """Load the project and create a DiamondParser for one sample/end pair."""
     self.project = Project(config_file=config_path,
                            project_file=project_path)
     self.project.load_project()
     # sample_id and end are module-level test parameters
     self.parser = DiamondParser(
         config=self.project.config,
         options=self.project.options,
         taxonomy_data=self.project.taxonomy_data,
         ref_data=self.project.ref_data,
         sample=self.project.samples[sample_id],
         end=end)
 def setUp(self):
     """Prepare project, parser, taxonomy data, a Node and a Tree fixture."""
     self.project = Project(
         config_file=config_path, project_file=project_path)
     self.project.load_project()
     self.parser = DiamondParser(
         config=self.project.config,
         options=self.project.options,
         taxonomy_data=self.project.taxonomy_data,
         ref_data=self.project.ref_data,
         sample=self.project.samples[sample_id],
         end=end)
     self.taxonomy_data = TaxonomyData(self.project.config, '')
     # Escherichia genus node used by the Node/Tree tests
     self.node = Node(
         rank='genus', name='Escherichia', taxid='561', parent='543',
         children=None)
     self.tree = Tree()
示例#4
0
 def setUp(self):
     """Create a parser from a freshly loaded sample (project not loaded)."""
     proj = Project(config_file=config_path, project_file=project_path)
     smp = Sample(sample_id=sample_id)
     smp.load_sample(proj.options)
     self.parser = DiamondParser(
         config=proj.config,
         options=proj.options,
         taxonomy_data=proj.taxonomy_data,
         ref_data=proj.ref_data,
         sample=smp,
         end=end)
示例#5
0
    def findProject(self, path):
        """Look up a project stored under *path* in the database.

        Args:
            path: Filesystem path recorded in the projects table.

        Returns:
            Project: the first matching project.

        Raises:
            Exception: if no project with that path exists.
        """
        # NOTE(review): path is interpolated directly into the SQL string.
        # If self.db.fetch supports placeholders, switch to a parameterized
        # query to avoid SQL injection — confirm against the db wrapper API.
        query = 'select name, path from projects where path = "{}"'.format(
            path)
        result = self.db.fetch(query)

        if result:
            name, path = result[0]
            return Project(name, path, self.db)
        raise Exception('No project found for path: {}'.format(path))
示例#6
0
def main():
    """Main function calling functional profiling module"""
    args = get_args()
    if args.prot:
        protein_pipeline(args)
    else:
        if args.sample is None:
            print('Running functional profiling for all samples in the project')
        else:
            print('Running functional profiling only for ', args.sample)
        project = Project(config_file=args.config, project_file=args.project)
        # Choose the pipeline matching the project's read layout.
        run = fastq_pe_pipeline if project.is_paired_end() else fastq_pipeline
        run(project,
            sample_identifier=args.sample,
            end_identifier=args.end)
    print('Done!')
示例#7
0
    def createProject(self, name='', path=''):
        """Prompt for any missing fields, then build, scan and store a project."""
        if name == '':
            print('Project name: ')
            name = self.stdin.read()

        if path == '':
            print('Project path: ')
            path = self.stdin.read()

        created = Project(name, path, self.db)
        created.discover()
        created.save(self.db)
        return created
def protein_pipeline(args):
    """Functional profiling pipeline for the entire project.

    Args:
        args: ArgumentParser namespace with defined args.config (path to
            program config ini file) and args.project (path to project
            options ini file)
    """
    project = Project(config_file=args.config, project_file=args.project)

    # Load the requested sample(s); protein mode is single-end with no
    # scaling factors.
    selected = []
    for sid in project.list_samples():
        if args.sample is not None and args.sample != sid:
            continue
        smp = Sample(sid)
        smp.load_sample(project.options)
        project.samples[sid] = smp
        project.samples[sid].is_paired_end = False
        project.samples[sid].rpkg_scaling_factor = None
        project.samples[sid].rpkm_scaling_factor = None
        selected.append(sid)

    for sid in selected:
        # End identifier in protein pipeline is always pe1
        project.samples[sid].reads['pe1'] = functional_profiling_pipeline(
            project, sample=project.samples[sid])
        export_sample(project.samples[sid])
        # Generate output for the sample or delete sample from memory
        generate_protein_sample_report(project, sid, metric='proteincount')
        project.options.set_sample_data(project.samples[sid])

    if args.sample is None:
        # Skip project report if the pipeline is running for only one sample
        generate_protein_project_report(project)
        generate_output(project)

    project.save_project_options()
class TaxonomyProfilingTest(unittest.TestCase):
    """Tests for the TaxonomyData, Node, Tree and TaxonomyProfile classes.

    Fixtures built in setUp: a loaded Project and DiamondParser, a
    TaxonomyData instance, an Escherichia genus Node and an empty Tree.
    """

    def setUp(self):
        self.project = Project(config_file=config_path,
                               project_file=project_path)
        self.project.load_project()
        self.parser = DiamondParser(config=self.project.config,
                                    options=self.project.options,
                                    taxonomy_data=self.project.taxonomy_data,
                                    ref_data=self.project.ref_data,
                                    sample=self.project.samples[sample_id],
                                    end=end)
        self.taxonomy_data = TaxonomyData(self.project.config, '')
        self.node = Node(rank='genus',
                         name='Escherichia',
                         taxid='561',
                         parent='543',
                         children=None)
        self.tree = Tree()

    def _build_function_taxonomy_profile(self):
        """Import reads and build a function/taxonomy profile (fpk metric).

        Shared preamble for the TaxonomyProfile tests below.
        """
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project,
                                              sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(
            scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        return taxonomy_profile

    # Tests of TaxonomyData class

    def test_0010_init_taxonomy_data(self):
        self.assertEqual(len(self.taxonomy_data.data),
                         6568)  # Nitrogen collection

    def test_0020_is_exist(self):
        self.assertTrue(self.taxonomy_data.is_exist('1'))
        self.assertTrue(self.taxonomy_data.is_exist('0'))
        self.assertTrue(self.taxonomy_data.is_exist('562'))
        self.assertFalse(self.taxonomy_data.is_exist('fake_identifier'))

    def test_0030_get_name(self):
        self.assertEqual(self.taxonomy_data.get_name('1'), 'root')
        self.assertEqual(self.taxonomy_data.get_name('0'), 'Unknown')
        self.assertEqual(self.taxonomy_data.get_name('562'),
                         'Escherichia coli')

    def test_0040_get_rank(self):
        self.assertEqual(self.taxonomy_data.get_rank('1'),
                         'norank')  # Test root
        self.assertEqual(self.taxonomy_data.get_rank('0'),
                         'norank')  # Test Unknown
        self.assertEqual(self.taxonomy_data.get_rank('2'),
                         'superkingdom')  # Test Bacteria
        self.assertEqual(self.taxonomy_data.get_rank('562'),
                         'species')  # Test E. coli

    def test_0050_get_parent(self):
        self.assertEqual(self.taxonomy_data.get_parent('1'), '1')  # Test root
        self.assertEqual(self.taxonomy_data.get_parent('0'),
                         '1')  # Test Unknown
        self.assertEqual(self.taxonomy_data.get_parent('2'),
                         '131567')  # Test Bacteria
        self.assertEqual(self.taxonomy_data.get_parent('562'),
                         '561')  # Test E. coli

    def test_0060_get_lca(self):
        self.assertEqual(self.taxonomy_data.get_lca(['1']), '1')  # Test root
        self.assertEqual(self.taxonomy_data.get_lca(['']),
                         '0')  # Test empty string
        self.assertEqual(self.taxonomy_data.get_lca([]),
                         '0')  # Test empty list
        self.assertEqual(self.taxonomy_data.get_lca(['0']),
                         '0')  # Test Unknown
        # Anything with root goes to Unknown
        self.assertEqual(self.taxonomy_data.get_lca(['571', '1']),
                         '0')  # K. oxytoca, root
        # Anything with Unknown is ignored
        self.assertEqual(self.taxonomy_data.get_lca(['571', '0']),
                         '571')  # K. oxytoca, Unknown
        self.assertEqual(self.taxonomy_data.get_lca(['2', '2157']),
                         '131567')  # Bacteria, Archaea
        self.assertEqual(self.taxonomy_data.get_lca(['571', '573']),
                         '570')  # K. oxytoca, K. pneumoniae

    def test_0070_get_upper_level_taxon(self):
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('1'),
                         ('1', 'norank'))  # Test root
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('0'),
                         ('1', 'norank'))  # Test Unknown
        # For Bacteria, returns 1 (root), not 131567 ('cellular organisms')
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('2'),
                         ('1', 'norank'))  # Test Bacteria
        # For 651137 (Thaumarchaeota), returns 2157 (Archaea), not 1783275 ('TACK group')
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('651137'),
                         ('2157', 'superkingdom'))
        # Test 44260 (Moorella): skip parent taxon 42857 and report 186814 (Thermoanaerobacteriaceae family)
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('44260'),
                         ('186814', 'family'))

    def test_0080_get_taxonomy_lineage(self):
        # Renamed from test_0080_get_upper_level_taxon: this test exercises
        # get_taxonomy_lineage, not get_upper_level_taxon (see test_0070).
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('1'),
                         '')  # Test root
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('0'),
                         'Unknown')  # Test Unknown
        # For Bacteria, returns 1 (root), not 131567 ('cellular organisms')
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('2'),
                         'Bacteria')  # Test Bacteria
        # Test 651137 (Thaumarchaeota), returns 2157 (Archaea), not 1783275 ('TACK group')
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('651137'),
                         'Archaea_Thaumarchaeota')
        # Test 1525 (Moorella thermoacetica): full lineage down to species
        self.assertEqual(
            self.taxonomy_data.get_taxonomy_lineage('1525'),
            'Bacteria_Firmicutes_Clostridia_Thermoanaerobacterales_Thermoanaerobacteraceae_Moorella_Moorella_thermoacetica'
        )

    # Tests of Node class

    def test_0090_init_node(self):
        node = Node(rank='genus',
                    name='Escherichia',
                    taxid='561',
                    parent='543',
                    children=None)
        self.assertEqual(node.rank, 'genus')
        self.assertEqual(node.name, 'Escherichia')
        self.assertEqual(node.taxid, '561')
        self.assertEqual(node.parent, '543')

    def test_0100_add_child(self):
        self.node.add_child('562')
        self.assertEqual(len(self.node.children), 1)
        self.assertTrue(self.node.has_child('562'))
        self.assertFalse(self.node.has_child('561'))

    def test_0110_set_parent(self):
        node = Node(rank='species')
        node.set_parent('')
        self.assertIsNone(node.parent)
        # Set parent if None
        node.set_parent('561')
        self.assertEqual(node.parent, '561')
        # Change parent
        node.set_parent('2')
        self.assertEqual(node.parent, '2')

    def test_0120_set_taxid(self):
        node = Node(rank='species')
        # Set taxid if None
        node.set_taxid('561')
        self.assertEqual(node.taxid, '561')
        # Changing taxid is not possible
        node.set_taxid('2')
        self.assertEqual(node.taxid, '561')

    def test_0130_set_rank(self):
        self.assertEqual(self.node.rank, 'genus')
        # Change rank
        self.assertTrue(self.node.set_rank('species'))
        self.assertEqual(self.node.rank, 'species')
        # Rank must be defined in RANKS
        self.assertFalse(self.node.set_rank('#^$%@!'))
        self.assertEqual(self.node.rank, 'species')

    def test_0140_set_attribute(self):
        self.node.set_attribute('score', 0.001)
        self.assertEqual(self.node.attributes['score'], 0.001)
        self.node.set_attribute('score', 1)
        self.assertEqual(self.node.attributes['score'], 1)

    def test_0150_add_attribute(self):
        self.node.set_attribute('score_float', 0.5)
        self.node.add_attribute('score_float', 0.2)
        self.assertEqual(self.node.attributes['score_float'], 0.7)
        self.node.set_attribute('score_int', 1)
        self.node.add_attribute('score_int', 999)
        self.assertEqual(self.node.attributes['score_int'], 1000)

    def test_0160_get_attribute(self):
        self.node.set_attribute('score_float', 0.5)
        self.assertEqual(self.node.get_attribute('score_float'), 0.5)
        self.assertIsNone(self.node.get_attribute('nonexisting_key'))

    def test_0170_is_in_children(self):
        self.node.add_child('562')
        self.assertEqual(len(self.node.children), 1)
        self.assertTrue(self.node.has_child('562'))
        self.assertFalse(self.node.has_child('561'))

    # Tests of Tree class

    def test_0180_init_tree(self):
        tree = Tree()
        self.assertEqual(tree.root.taxid, '1')
        self.assertEqual(len(tree.data), 1)
        self.assertEqual(tree.data[ROOT_TAXONOMY_ID].taxid, ROOT_TAXONOMY_ID)

    def test_0190_add_node(self):
        # Adding node with non-existing parent must fail
        self.assertFalse(
            self.tree.add_node(
                Node(rank='genus',
                     name='Escherichia',
                     taxid='561',
                     parent='543',
                     children=None)))
        # Adding empty node must fail
        self.assertFalse(self.tree.add_node(Node(rank='species')))
        # Adding second root must fail
        self.assertFalse(
            self.tree.add_node(
                Node(rank='norank',
                     name='root',
                     taxid='1',
                     parent='1',
                     children=None)))
        # Adding node with existing parent must succeed
        self.assertTrue(
            self.tree.add_node(
                Node(rank='superkingdom',
                     name='Bacteria',
                     taxid='2',
                     parent='1',
                     children=None)))
        self.assertEqual(len(self.tree.get_node('1').children), 1)
        # Adding node if its parent has children must succeed
        self.assertTrue(
            self.tree.add_node(
                Node(rank='superkingdom',
                     name='Archaea',
                     taxid='2157',
                     parent='1',
                     children=None)))
        self.assertEqual(len(self.tree.get_node('1').children), 2)

    def test_0200_get_node(self):
        self.tree.add_node(
            Node(rank='superkingdom',
                 name='Bacteria',
                 taxid='2',
                 parent='1',
                 children=None))
        # Getting existing node must succeed
        self.assertEqual(self.tree.get_node('2').parent, '1')
        # Getting non-existing node must fail
        self.assertIsNone(self.tree.get_node('561'))

    def test_0210_is_in_tree(self):
        self.tree.add_node(
            Node(rank='superkingdom',
                 name='Bacteria',
                 taxid='2',
                 parent='1',
                 children=None))
        self.assertTrue(self.tree.is_in_tree('2'))
        self.assertFalse(self.tree.is_in_tree('562'))
        self.assertFalse(self.tree.is_in_tree(True))

    def test_0220_add_node_recursively(self):
        self.assertFalse(self.tree.is_in_tree('562'))
        self.assertTrue(
            self.tree.add_node_recursively(
                Node(rank='species',
                     name='Escherichia coli',
                     taxid='562',
                     parent='561',
                     children=None), self.taxonomy_data))
        self.assertTrue(self.tree.is_in_tree('562'))
        self.assertTrue(self.tree.is_in_tree('561'))
        self.assertTrue(self.tree.is_in_tree('2'))
        # Adding second root must fail
        self.assertFalse(
            self.tree.add_node_recursively(
                Node(rank='norank',
                     name='root',
                     taxid='1',
                     parent='1',
                     children=None), self.taxonomy_data))
        # Adding node with existing parent must succeed
        self.assertFalse(self.tree.is_in_tree('564'))
        self.assertTrue(
            self.tree.add_node_recursively(
                Node(rank='species',
                     name='Escherichia fergusonii',
                     taxid='564',
                     parent='561',
                     children=None), self.taxonomy_data))
        self.assertTrue(self.tree.is_in_tree('564'))

    def test_0230_add_attribute(self):
        self.tree.add_node(
            Node(rank='superkingdom',
                 name='Bacteria',
                 taxid='2',
                 parent='1',
                 children=None))
        # Add attribute to existing node
        self.assertFalse(self.tree.get_node('2').attributes)
        self.tree.add_attribute('2', 'score_float', 0.42, self.taxonomy_data)
        self.assertTrue(self.tree.get_node('2').attributes)
        self.assertEqual(
            self.tree.get_node('2').attributes['score_float'], 0.42)
        # Add attribute to non-existing node
        self.assertFalse(self.tree.get_node('1').attributes)
        self.tree.add_attribute('2157', 'score_float', 0.42,
                                self.taxonomy_data)
        self.assertTrue(self.tree.get_node('1').attributes)
        self.assertEqual(
            self.tree.get_node('1').attributes['score_float'], 0.42)

    def test_0240_add_attribute_recursively(self):
        self.tree.add_node(
            Node(rank='superkingdom',
                 name='Bacteria',
                 taxid='2',
                 parent='1',
                 children=None))
        # Add attribute to existing node
        self.assertFalse(self.tree.get_node('2').attributes)
        self.tree.add_attribute_recursively('2', 'score_float', 0.42,
                                            self.taxonomy_data)
        self.assertTrue(self.tree.get_node('2').attributes)
        self.assertEqual(
            self.tree.get_node('2').attributes['score_float'], 0.42)
        self.assertEqual(
            self.tree.get_node('1').attributes['score_float'], 0.42)
        # Add attribute to non-existing node
        self.tree.add_attribute_recursively('2157', 'score_float', 0.42,
                                            self.taxonomy_data)
        self.assertEqual(
            self.tree.get_node('1').attributes['score_float'], 0.84)

    def test_0250_get_parent(self):
        self.assertTrue(
            self.tree.add_node_recursively(
                Node(rank='species',
                     name='Escherichia coli',
                     taxid='562',
                     parent='561',
                     children=None), self.taxonomy_data))
        # Getting parent of existing node must succeed
        node = self.tree.get_node('561')
        self.assertEqual(node.parent, '543')
        parent_node = self.tree.get_parent(node, self.taxonomy_data)
        self.assertEqual(parent_node.taxid, '543')
        # Getting parent of non-existing node must fail
        node = Node(rank='species')
        parent_node = self.tree.get_parent(node, self.taxonomy_data)
        self.assertIsNone(parent_node)

    # Tests of TaxonomyProfile class

    def test_0260_init_taxonomy_profile(self):
        taxonomy_profile = TaxonomyProfile()
        self.assertIsNotNone(taxonomy_profile.tree)
        self.assertEqual(len(taxonomy_profile.tree.data), 1)
        self.assertEqual(taxonomy_profile.tree.root.taxid, ROOT_TAXONOMY_ID)
        self.assertEqual(taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].taxid,
                         ROOT_TAXONOMY_ID)

    def test_0270_make_function_taxonomy_profile(self):
        taxonomy_profile = self._build_function_taxonomy_profile()
        self.assertEqual(len(taxonomy_profile.tree.data), 22)
        self.assertEqual(
            taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].attributes['NirK']
            ['count'], 1.0)
        self.assertEqual(
            taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].attributes['UreA']
            ['count'], 3.0)
        self.assertEqual(
            taxonomy_profile.tree.data['118883'].attributes['UreC']['count'],
            1.0)

    def test_0280_str(self):
        taxonomy_profile = TaxonomyProfile()
        self.assertEqual(
            str(taxonomy_profile),
            '1\tnorank\troot\tParent:None\tChildren:None\tScore:N/A\tIdentity:N/A\tRead count:N/A\n'
        )

    def test_0290_stringify_node(self):
        taxonomy_profile = self._build_function_taxonomy_profile()
        self.assertEqual(
            taxonomy_profile.stringify_node('118883', 0),
            "118883\tfamily\tSulfolobaceae\tParent:2281\tChildren:None\tUreC:{'count': 1.0, 'hit_count': 1.0"
            ", 'identity': 68.8, 'fpk': 0.5984440454817475}\n")

    def test_0300_convert_profile_into_df(self):
        taxonomy_profile = self._build_function_taxonomy_profile()
        result = taxonomy_profile.convert_profile_into_df(metric='fpk')
        self.assertEqual(result.iloc[0][1], 'root')
        self.assertEqual(result.iloc[1][0], 'superkingdom')

    def test_0310_convert_node_into_dict(self):
        taxonomy_profile = self._build_function_taxonomy_profile()
        result, attributes = taxonomy_profile.convert_node_into_dict(
            '118883', ['UreC', 'UreA'], 1, metric='fpk')
        self.assertEqual(result[1][('', 'Taxon name')], 'Sulfolobaceae')
        self.assertEqual(result[1][('UreC', '1.Score')], 0.5984440454817475)
        self.assertEqual(result[1][('UreA', '1.Score')], 0.0)
        self.assertEqual(attributes['UreC']['fpk'], 0.5984440454817475)
        self.assertEqual(attributes['UreA']['fpk'], 0.0)

    def test_0320_convert_profile_into_score_df(self):
        taxonomy_profile = self._build_function_taxonomy_profile()
        result = taxonomy_profile.convert_profile_into_score_df(metric='fpk')
        self.assertEqual(result.iloc[0][1], 'root')
        self.assertEqual(result.iloc[1][0], 'superkingdom')

    def test_0330_convert_node_into_values_dict(self):
        # Renamed from test_0330_convert_node_into_dict: this test exercises
        # convert_node_into_values_dict (see test_0310 for the dict variant).
        taxonomy_profile = self._build_function_taxonomy_profile()
        result, attributes = taxonomy_profile.convert_node_into_values_dict(
            '118883', ['UreC', 'UreA'], 1, metric='fpk')
        self.assertEqual(result[1][('', 'Taxon name')], 'Sulfolobaceae')
        self.assertEqual(result[1][('UreC', 'fpk')], 0.5984440454817475)
        self.assertEqual(result[1][('UreA', 'fpk')], 0.0)
        self.assertEqual(attributes['UreC']['fpk'], 0.5984440454817475)
        self.assertNotIn('UreA', attributes.keys())

    def tearDown(self):
        self.parser = None
示例#10
0
class FamaReportTest(unittest.TestCase):
    """Tests for Fama report generators (PDF, XLSX, markdown, STAMP input).

    Operates on a real test project loaded from config_path / project_path;
    most tests are skipped by default to keep the suite fast.
    """

    def setUp(self):
        # Build a DiamondParser over one end of one sample of the test project.
        self.project = Project(config_file=config_path, project_file=project_path)
        self.project.load_project()
        self.parser = DiamondParser(config = self.project.config, 
                            options=self.project.options, 
                            taxonomy_data=self.project.taxonomy_data,
                            ref_data=self.project.ref_data,
                            sample=self.project.samples[sample_id], 
                            end=end)


    @unittest.skip("for faster testing")
    def test_1_collection_pdf_output(self):
        # Renders the collection's function list into a PDF using the system
        # DejaVu fonts (hard-coded paths; requires fonts to be installed).
        outfile = os.path.join(data_dir,'collection_list.pdf')
        if (os.path.exists(outfile)):
            os.remove(outfile)
        pdf = FPDF('P', 'mm', 'Letter')
        pdf.add_page()
        pdf.add_font('DejaVu', '', '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed.ttf', uni=True)
        pdf.add_font('DejaVu', 'B', '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed-Bold.ttf', uni=True)
        pdf.set_font('DejaVu', 'B', 16)
        pdf.cell(40, 10, 'List of functions in collection ' + self.parser.collection)
        pdf.ln(h = '')
        pdf.set_font('DejaVu', '', 10)

        v_limit = 40
        for function in self.parser.ref_data.functions_dict:
            v_limit += 10
            pdf.cell(v_limit, 10, function + '  ' + self.parser.ref_data.functions_dict[function]['name'] 
                    + ': group ' + self.parser.ref_data.functions_dict[function]['group'] )
            pdf.ln(h = 10)

        outfile = os.path.join(data_dir,'collection_list.pdf')
        pdf.output(outfile, 'F')
        self.assertTrue(os.path.exists(outfile))
        # If function names contain 'bad' symbols, this test fails

    @unittest.skip("for faster testing")
    def test_2_get_functions_in_group(self):
        # The Urease group in the reference data has three subunits; UreA sorts first.
        urease_list = self.parser.ref_data.get_functions_in_group('Urease')
        self.assertEqual(len(urease_list), 3)
        self.assertEqual(sorted(urease_list)[0], 'UreA')

#    @unittest.skip("for faster testing")
    def test_3_generate_pdf_report(self):
        # Smoke test: parse background DIAMOND output, then render the PDF report.
        self.parser.parse_background_output()
        generate_pdf_report(self.parser)

    @unittest.skip("for faster testing")
    def test_4_generate_functions_chart(self):
        # Smoke test for the functions chart generator.
        self.parser.parse_background_output()
        generate_functions_chart(self.parser)

    @unittest.skip("for faster testing")
    def test_5_generate_functions_xlsx(self):
        # NOTE(review): other tests use the module-level ENDS constant;
        # confirm Project really exposes an ENDS attribute.
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        scores = get_function_scores(self.project, sample_id=None, metrics=metrics)
        generate_function_sample_xlsx(self.project, 
                            scores, 
                            metrics=metrics, 
                            sample_id = None)
        self.assertTrue(len(scores) > 0)


    @unittest.skip("for faster testing")
    def test_6_generate_functions_markdown(self):
        # Smoke test for the project-wide functions markdown document.
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        create_functions_markdown_document(self.project)

    @unittest.skip("for faster testing")
    def test_7_generate_function_table(self):
        # Dumps all hits annotated with the AmoA function to a plain-text table.
        function = 'AmoA'
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        outfile = 'test_output_20181130.txt'
        with open(outfile, 'w') as of:
            for read_id,read in self.project.samples[sample_id].reads[end].items():
#                if function in read.get_functions():
                for hit in read.get_hit_list().get_hits():
                    if function in hit.get_functions():
                        print (read_id, function, read.status, hit.get_subject_id())
                        of.write(('\t').join([read_id, function, read.status, hit.get_subject_id()]) + '\n')
            of.closed  # no-op attribute access; the 'with' block closes the file

    @unittest.skip("for faster testing")
    def test_8_predict_insert_size(self):
        # Estimates the library insert size from read pairs whose two ends map
        # to the same reference protein; overhangs outside the alignment are
        # counted into the insert length (see the branch below).

        #Should take into account https://github.com/ksahlin/GetDistr/blob/master/getdistr/model.py

        sample_id = 'sample1'
#        for sample_id in self.project.list_samples():
        self.project.import_reads_json(sample_id, self.project.ENDS)
        outfile = sample_id + '_insert_size_data.txt'
        fragment_list = []
        fragment_weights = defaultdict(float)
        gene_length_threshold = 150
        alignment_length_threshold = 15
        read_data = defaultdict(dict)
        print ('pe1 reads', str(len(self.project.samples[sample_id].reads['pe1'])))
        print ('pe2 reads', str(len(self.project.samples[sample_id].reads['pe2'])))
        for read_id,read1 in self.project.samples[sample_id].reads['pe1'].items():
            if read1.get_status() != 'function':
                continue
            if read_id not in self.project.samples[sample_id].reads['pe2']:
                continue
#            print ('Found read with two mapped ends')
            read2 =self.project.samples[sample_id].reads['pe2'][read_id]
            if read2.get_status() != 'function':
                continue
            for hit in read1.get_hit_list().get_hits():
                if hit.get_subject_id() not in [h.get_subject_id() for h in read2.get_hit_list().get_hits()]:
#                    print ('Different target proteins: skipped')
                    continue
                if hit.s_len*3 < gene_length_threshold:
#                    print ('Target protein shorter than threshold: skipped')
                    continue
                if hit.s_end - hit.s_start < alignment_length_threshold:
                    continue
                for hit2 in read2.get_hit_list().get_hits():
                    if hit.get_subject_id() != hit2.get_subject_id():
                        continue
#                    print ('Found read with two hits in one protein')
                    if hit2.s_end - hit2.s_start < alignment_length_threshold:
                        continue
#                    print ('Found read with two hits in one protein longer than alignment cutoff')
                    # Pick the orientation that spans the larger protein interval;
                    # subject coordinates are in amino acids, hence the factor 3.
                    if (hit.s_end - hit2.s_start) > (hit2.s_end - hit.s_start):
                        # Do not count overhangs
                        #fragment_length = 3 * (hit.s_end - hit2.s_start)
                        # Count overhangs
                        insert_size = 3 * (hit.s_end - hit2.s_start) + hit2.q_start - 1 + len(read2.sequence)  - hit.q_end
                    else:
                        # Do not count overhangs
                        #fragment_length = 3 * (hit2.s_end - hit.s_start)
                        # Count overhangs
                        insert_size = 3 * (hit2.s_end - hit.s_start) + hit.q_start - 1 + len(read1.sequence)  - hit2.q_end


                    #fragment_weight = (gene_length_threshold - fragment_length + 1)/(3*hit.s_len - 3*alignment_length_threshold + 1)
                    #fragment_weights[insert_size] += fragment_weight
                    #fragment_list.append([fragment_length, fragment_weight])
                    read_data[read_id]['tlen'] = insert_size
                    read_data[read_id]['rlen'] = (len(read1.sequence) + len(read2.sequence)) / 2
                    read_data[read_id]['ref_len'] = hit.s_len*3
                    read_data[read_id]['ref_name'] = hit.get_subject_id()
                    if not read_data[read_id]['tlen'] > 0:
                        print(read_id, str(read_data[read_id]['rlen']), str(insert_size), str(read_data[read_id]['ref_len']), read_data[read_id]['ref_name'])
                        print(hit)
                        print(hit2)

                    break
        #~ if len(fragment_list) > 0:
            #~ return int(sum(fragment_list) / len(fragment_list))
        #~ else:
            #~ return 0
#        print (fragment_list)
        avg_fragment_length = get_lib_est(read_data, self.project.options.get_work_dir())
        with open(outfile, 'w') as of:
            for read_id in read_data:
                of.write(read_data[read_id]['ref_name'] + '\t' + str(read_data[read_id]['ref_len'])  + '\t' + str(read_data[read_id]['rlen']) + '\t' + str(read_data[read_id]['tlen'])+ '\n')
                #of.write(str(fragment[0]) + '\t' + str(fragment[1]) + '\n')
            #~ for length in sorted(fragment_weights.keys()):
                #~ of.write(str(length) + '\t' + str(fragment_weights[length]) + '\n')
            of.closed  # no-op; the 'with' block closes the file

        self.assertTrue(int(avg_fragment_length) > 0)


    @unittest.skip("for faster testing")
    def test_9_find_fragment_length(self):
        # Estimates average insert size per sample via the Sample method.
        # NOTE: the final assertion only checks the last sample in the loop.
        for sample_id in self.project.list_samples():
        #sample_id = 'sample1'
            self.project.import_reads_json(sample_id, self.project.ENDS)
            #avg_fragment_length = self.project.find_fragment_length(self.project.samples[sample_id])
            avg_fragment_length = self.project.samples[sample_id].estimate_average_insert_size(self.project.config.get_length_cutoff(self.project.options.get_collection(sample_id)))
            print('Insert size for',sample_id,'is',str(avg_fragment_length))

        self.assertTrue(int(avg_fragment_length) > 0)

    @unittest.skip("for faster testing")
    def test_10_generate_markdown(self):
        # Generates the project markdown and checks the title line of index.md.
        #for sample_id in self.project.list_samples():
        sample_id = 'sample1'
        metrics = 'efpkg'
        self.project.import_reads_json(sample_id, self.project.ENDS)
        scores = get_function_scores(self.project, sample_id=sample_id, metrics=metrics)
        generate_project_markdown_document(self.project, scores, sample_id = sample_id, metrics = metrics)
        outfile = sanitize_file_name(os.path.join(self.project.options.get_work_dir(), 'index.md'))
        with open (outfile, 'r') as f:
            line = f.readline()
            f.close()
        self.assertEqual(line, '# ' + self.project.options.get_name() + '\n')

    @unittest.skip("for faster testing")
    def test_11_generate_functions_stamp_input(self):
        # Smoke test for STAMP input generation over function scores.
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
#        metrics = 'fragmentcount'
        scores = get_function_scores(self.project, metrics=metrics)
        generate_functions_stamp_input(self.project, scores, metrics)

    @unittest.skip("for faster testing")
    def test_12_generate_functions_taxonomy_stamp_input(self):
        # Smoke test for STAMP input generation over function/taxonomy scores.
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
#        metrics = 'fragmentcount'
        scores = get_function_taxonomy_scores(self.project,metrics=metrics)
        generate_functions_taxonomy_stamp_input(self.project, scores, metrics)

    @unittest.skip("for faster testing")
    def test_5_generate_functions_samples_xlsx(self):
        # Smoke test for the per-rank function/taxonomy XLSX generator.
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        rank = 'phylum'
        scores = get_function_taxonomy_scores(self.project, metrics=metrics)
        generate_sample_taxonomy_function_xlsx(self.project, 
                            scores, 
                            metrics=metrics, 
                            rank = rank)
        self.assertTrue(len(scores) > 0)

    @unittest.skip("for faster testing")
    def test_collect_taxonomic_data(self):
        # Writes a TSV of taxonomy/function scores for one hard-coded sample.
        print ('Load project from JSON')
        sample_id = 'FW306-ZV-1'
        metrics = 'efpkg'
        self.project.import_reads_json(sample_id, ENDS)

        sample_scores = get_function_taxonomy_scores(self.project, sample_id = sample_id, metrics = metrics)# autovivify(2, float)


        with open('sample1_taxonomy_scores.tsv', 'w') as of:
            of.write('Taxonomy_ID\tFunction_ID\tScore\n')
            for taxonomy_id in sorted(sample_scores.keys()):
                for function_id in sorted(sample_scores[taxonomy_id].keys()):
                    of.write('\t'.join([taxonomy_id,function_id,str(sample_scores[taxonomy_id][function_id][sample_id][metrics])]) + '\n')
            of.close()  # redundant: the 'with' block closes the file anyway

        self.assertEqual(len(self.project.samples), 6)


    def tearDown(self):
        # Release the parser fixture after each test.
        self.parser = None
示例#11
0
def rename_sample(config_file, project_file, old_id, new_id):
    """This function is doing all the work to rename a sample

    Args:
        config_file (str): path to program config ini file
        project_file (str): path to project options ini file
        old_id (str): existing sample identifier
        new_id (str): new sample identifier

    """
    project = Project(config_file=config_file, project_file=project_file)
    project.load_project()
    check_id(project, old_id, new_id)

    # Rename all files that embed the old sample identifier.
    rename_files(project, old_id, new_id)

    # Re-label the in-memory sample object.
    renamed_sample = project.samples[old_id]
    renamed_sample.sample_id = new_id
    # Delete sample output directory
    shutil.rmtree(
        os.path.join(project.options.get_project_dir(old_id),
                     project.options.get_output_subdir(old_id)))

    # Move the sample's section in the options parser to the new identifier,
    # preserving every non-default option value.
    saved_options = [
        (name, project.options.parser.get(old_id, name))
        for name in project.options.parser.options(old_id)
        if name not in project.options.parser.defaults()
    ]
    project.options.parser.add_section(new_id)
    for name, value in saved_options:
        project.options.parser.set(new_id, name, value)
    project.options.parser.remove_section(old_id)

    project.samples.pop(old_id, None)
    project.samples[new_id] = renamed_sample
    project.options.set_sample_data(project.samples[new_id])
    project.save_project_options()

    # Re-open project with new version of project file
    project = Project(config_file=config_file, project_file=project_file)
    project.load_project()
    os.mkdir(
        os.path.join(project.options.get_project_dir(new_id),
                     project.options.get_output_subdir(new_id)))
    project.import_reads_json(new_id, ENDS)

    # Regenerate per-end output files for the renamed sample.
    for end in ENDS:
        if end == 'pe2' and not project.samples[new_id].is_paired_end:
            continue

        parser = DiamondParser(config=project.config,
                               options=project.options,
                               taxonomy_data=project.taxonomy_data,
                               ref_data=project.ref_data,
                               sample=project.samples[new_id],
                               end=end)
        parser.reads = project.samples[new_id].reads[end]
        generate_fastq_report(parser)
        generate_pdf_report(parser)
        make_functions_chart(parser)
示例#12
0
def main():
    """Loads annotated reads and calls report generators"""
    args = get_args()
    if args.prot:
        # Protein project: per-sample protein reports, then (optionally) a
        # project-wide report and the combined output.
        project = Project(config_file=args.config, project_file=args.project)
        project.load_project()
        for sample_id in project.list_samples():
            if args.sample is not None and args.sample != sample_id:
                continue
            project.options.set_sample_data(project.samples[sample_id])
            project.import_reads_json(sample_id, ENDS)
            fama_report.generate_protein_sample_report(project,
                                                       sample_id,
                                                       metrics='readcount')

        if args.sample is None:
            # Skip project report generation if the pipeline is running for only one sample
            fama_report.generate_protein_project_report(project)
        generate_output(project)

    else:
        project = Project(config_file=args.config, project_file=args.project)
        project.load_project()
        for sample_id in project.list_samples():
            if args.sample is not None and args.sample != sample_id:
                continue
            sample = project.samples[sample_id]
            # Ensure an average-genome-size scaling factor exists: try the
            # stored value first, then fall back to running MicrobeCensus.
            if sample.rpkg_scaling_factor is None or sample.rpkg_scaling_factor == 0.0:
                sample.import_rpkg_scaling_factor()
                if sample.rpkg_scaling_factor is None or sample.rpkg_scaling_factor == 0.0:
                    run_microbecensus(sample=sample, config=project.config)
                    sample.import_rpkg_scaling_factor()
            project.options.set_sample_data(sample)
            project.import_reads_json(sample_id, ENDS)
            print('Generating report for', sample_id)
            fama_report.generate_sample_report(project,
                                               sample_id,
                                               metrics=args.metrics)
        print('Generating report for project')
        project.generate_report(metrics=args.metrics)
        project.save_project_options()
        print('Done!')
示例#13
0
class DiamondParserTest(unittest.TestCase):
    """Tests for DIAMOND output parsing and protein taxonomy assignment.

    Relies on module-level fixtures: config_path, project_path, and the
    'sample' / 'end' identifiers used to address reads in the test project.
    """

    def setUp(self):
        #self.parser = DiamondParser(config_file=config_path, project_file=project_path, sample=sample, end=end)
        self.project = Project(config_file=config_path, project_file=project_path)
        for sample_id in self.project.list_samples():
            sample = Sample(sample_id=sample_id)
            sample.load_sample(self.project.options)
            self.project.samples[sample_id] = sample

    def test_1_test_pipeline(self):
        # Placeholder: full pipeline run is disabled.
        #parser = functional_profiling_pipeline(config_file=config_path, project_file=project_path, sample=sample, end=end)
        pass

    def test_2_load_proteins(self):
        # Imported JSON must contain this known protein for the test sample.
        self.project.import_reads_json(sample, ENDS)
        protein = 'D16-4706_contig_11213_8'
        self.assertTrue(protein in self.project.samples[sample].reads[end])
        print(self.project.samples[sample].reads[end][protein].taxonomy)


    def test_3_protein_taxonomy(self):
        # Re-parses reference-database output for one protein, then checks
        # that LCA comparison with a synthetic hit assigns taxid '408672'.
        self.project.import_reads_json(sample, ENDS)
        protein = 'D16-4706_contig_11213_7'
        print('D16-4706_contig_11213_7 taxonomy')
        print(self.project.samples[sample].reads[end][protein].taxonomy)

        parser = DiamondParser(config=self.project.config,
                       options=self.project.options,
                       taxonomy_data=self.project.taxonomy_data,
                       ref_data=self.project.ref_data,
                       sample=self.project.samples[sample],
                       end=end)
        parser.parse_reference_output()
        print(str(parser.reads[protein]))

#        parse_background_output(parser)
        # Tab-separated DIAMOND tabular output line used as a synthetic hit.
        hit_line = 'D16-4706_contig_11213_7|4|257	fig|408672.3.peg.2637	63.0	254	94	256	1	254	2	255	1.1e-97	362.1'
        hit = DiamondHit()
        hit.create_hit(tabular_output_fields=hit_line.split('\t'))
        hit_list = DiamondHitList('D16-4706_contig_11213_7|4|257')
        hit_list.add_hit(hit)
        hit_list.annotate_hits(self.project.ref_data)
        hit_list.filter_list_by_identity(self.project.ref_data)
        print('hit_list')
        print(hit_list)

        compare_protein_hits_lca(parser.reads[protein], 4, 257, hit_list, 0.03, 1.0, 1.0, self.project.taxonomy_data, self.project.ref_data)
        print(parser.reads[protein].taxonomy)
        self.assertEqual(parser.reads[protein].taxonomy, '408672')


    def test_4_sample_taxonomy(self):
        # Writes the per-sample LCA of all protein taxonomy assignments.
        with open('samples_taxonomy2.txt', 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                taxonomy_ids = []
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    taxonomy_ids.append(protein.taxonomy)
                lca_taxonomy = self.project.taxonomy_data.get_lca(taxonomy_ids)
                outfile.write('\t'.join([sample_id, lca_taxonomy, self.project.taxonomy_data.get_name(lca_taxonomy)]) + '\n')

        #~ self.project.import_reads_json(sample, ENDS)
        #~ for protein_id, protein in self.project.samples[sample].reads[end].items():
            #~ taxonomy_ids.append(protein.taxonomy)
        #~ lca_taxonomy = self.project.taxonomy_data.get_lca(taxonomy_ids)
        #~ print(sample, lca_taxonomy, self.project.taxonomy_data.get_name(lca_taxonomy))
        #~ self.assertEqual(lca_taxonomy, '28216')


    def test_5_export_proteins(self):
        # Exports all function-annotated proteins as FASTA.
        with open('proteins.faa', 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    if protein.status == 'function':
                        outfile.write('>' + protein_id + '|' + sample_id + '|' + ';'.join(protein.functions.keys()) +  '|' + self.project.taxonomy_data.get_name(protein.taxonomy) + '\n')
                        outfile.write(protein.sequence + '\n\n')

    def test_6_export_protein_table(self):
        # Exports a TSV of function-annotated proteins with the ratio of
        # protein length to its top hit's reference length.
        out_file = os.path.join(self.project.options.work_dir, 'proteins.list.txt')
        with open(out_file, 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    if protein.status == 'function':
                        protein_length = len(protein.sequence)
                        ref_length = int(protein.hit_list.hits[0].s_len)
                        outfile.write('\t'.join([
                            sample_id,
                            protein_id,
                            ';'.join(sorted(protein.functions.keys())),
                            '{0:.4f}'.format(protein_length/ref_length),
                            protein.taxonomy,
                            self.project.taxonomy_data.data[protein.taxonomy]['name']
                            ]) + '\n')
    def tearDown(self):
        # NOTE(review): this class sets self.project in setUp, not self.parser.
        self.parser = None
示例#14
0
 def test_load_project(self):
     """Reload the project from its INI file and expect six samples."""
     print ('Load project from INI file')
     self.project = None
     self.project = Project(config_file=config_path, project_file=project_path)
     self.project.load_project()
     self.assertEqual(len(self.project.samples), 6)
示例#15
0
 def setUp(self):
     """Create a Project and populate it with freshly loaded Sample objects."""
     self.project = Project(config_file=config_path, project_file=project_path)
     for sample_id in self.project.list_samples():
         sample = Sample(sample_id=sample_id)
         sample.load_sample(self.project.options)
         self.project.samples[sample_id] = sample
示例#16
0
class ProjectTest(unittest.TestCase):

    def setUp(self):
        """Build a Project and attach a loaded Sample object per sample id."""
        self.project = Project(config_file=config_path, project_file=project_path)
        for sid in self.project.list_samples():
            loaded_sample = Sample(sample_id=sid)
            loaded_sample.load_sample(self.project.options)
            self.project.samples[sid] = loaded_sample
    
    @unittest.skip("for faster testing")
    def test_project_options(self):
        """The project options file lists six sample sections, 'sample1' first."""
        print ('Print list of samples1')
        opts = ProjectOptions(project_path)
        section_names = opts.parser.sections()
        print (section_names)
        self.assertEqual(len(section_names), 6)
        self.assertEqual(section_names[0], 'sample1')
    
    @unittest.skip("for faster testing")
    def test_list_samples(self):
        """list_samples returns six identifiers, starting with 'sample1'."""
        print ('Print list of samples2')
        sample_ids = self.project.list_samples()
        print (sample_ids)
        self.assertEqual(len(sample_ids), 6)
        self.assertEqual(sample_ids[0], 'sample1')
        
    @unittest.skip("for faster testing")
    def test_check_project(self):
        """Run the project consistency checker; problems are printed, not asserted."""
        print ('Print problems found in test project: ')
        self.project.check_project()

    @unittest.skip("for faster testing")
    def test_check_config(self):
        """Config cutoffs for the project's collection match expected values."""
        print ('Print problems found in test project: ')
        self.project.check_project()
        # Hoist the repeated lookups (assumed side-effect-free accessors).
        collection = self.project.options.get_collection()
        config = self.project.config
        self.assertEqual(config.get_biscore_range_cutoff(collection), 0.2)
        self.assertEqual(config.get_identity_cutoff(collection), 40.0)
        rank_cutoffs = config.get_ranks_cutoffs(collection)

        print(rank_cutoffs)
        print(rank_cutoffs['species'])
    
    @unittest.skip("for faster testing")
    def test_load_project(self):
        """A full reload from the INI file yields six samples."""
        print ('Load project from INI file')
        # Drop the fixture project before rebuilding it from disk.
        self.project = None
        self.project = Project(config_file=config_path, project_file=project_path)
        self.project.load_project()
        self.assertEqual(len(self.project.samples), 6)

    @unittest.skip("for faster testing")
    def test_collect_fragment_stats(self):
        # Tallies read-mapping statistics for every paired-end sample:
        # which ends mapped, how many functions and hits per read, and the
        # same broken down for reads with both ends mapped. Results go to
        # outfile.tsv (one column per sample).
        print ('Load project from JSON')
        #self.project.load_functional_profile()

        sample_stats = autovivify(2, int)
        for sample_id in self.project.list_samples():
            if not self.project.samples[sample_id].is_paired_end:
                continue
            self.project.import_reads_json(sample_id, self.project.ENDS)
            print('Mapping data loaded for sample', sample_id)
            both_ends_mapped_reads = {}
            pe1_multiple_functions = {}
            pe2_multiple_functions = {}
            sample_stats['reads_pe1_total'][sample_id] = len(self.project.samples[sample_id].reads['pe1'])
            sample_stats['reads_pe2_total'][sample_id] = len(self.project.samples[sample_id].reads['pe2'])
            pe1_reads = self.project.samples[sample_id].reads['pe1']
            pe2_reads = self.project.samples[sample_id].reads['pe2']
            # A read counts as mapped when its status is 'function' or
            # 'function,besthit'.
            for read in pe1_reads:
                if pe1_reads[read].get_status() == 'function,besthit' or pe1_reads[read].get_status() == 'function':
                    if read in pe2_reads and (pe2_reads[read].get_status() == 'function,besthit' or pe2_reads[read].get_status() == 'function'):
                        sample_stats['both ends mapped'][sample_id] += 1
                        both_ends_mapped_reads[read] = 1
                    else:
                        sample_stats['pe1 mapped only'][sample_id] += 1
            for read in pe2_reads:
                if read not in both_ends_mapped_reads and (pe2_reads[read].get_status() == 'function,besthit' or pe2_reads[read].get_status() == 'function'):
                        sample_stats['pe2 mapped only'][sample_id] += 1

            # Per-end function multiplicity.
            for read in pe1_reads:
                if pe1_reads[read].get_status() == 'function,besthit' or pe1_reads[read].get_status() == 'function':
                    sample_stats['reads_pe1_mapped'][sample_id] += 1
                    if len(pe1_reads[read].get_functions()) == 1:
                        sample_stats['pe1 single function'][sample_id] += 1
                    elif len(pe1_reads[read].get_functions()) > 1:
                        sample_stats['pe1 multiple functions'][sample_id] += 1
                        pe1_multiple_functions[read] = 1

            for read in pe2_reads:
                if pe2_reads[read].get_status() == 'function,besthit' or pe2_reads[read].get_status() == 'function':
                    sample_stats['reads_pe2_mapped'][sample_id] += 1
                    if len(pe2_reads[read].get_functions()) == 1:
                        sample_stats['pe2 single function'][sample_id] += 1
                    elif len(pe2_reads[read].get_functions()) > 1:
                        sample_stats['pe2 multiple functions'][sample_id] += 1
                        pe2_multiple_functions[read] = 1

            for read in both_ends_mapped_reads:
                if len(pe1_reads[read].get_functions()) == 1:
                    sample_stats['pe1 single function, both ends mapped'][sample_id] += 1
                elif len(pe1_reads[read].get_functions()) > 1:
                    sample_stats['pe1 multiple functions, both ends mapped'][sample_id] += 1
                if len(pe2_reads[read].get_functions()) == 1:
                    sample_stats['pe2 single function, both ends mapped'][sample_id] += 1
                elif len(pe2_reads[read].get_functions()) > 1:
                    sample_stats['pe2 multiple functions, both ends mapped'][sample_id] += 1

            # Per-end hit multiplicity.
            for read in pe1_reads:
                if pe1_reads[read].get_status() == 'function,besthit' or pe1_reads[read].get_status() == 'function':
                    hits = pe1_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe1 single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe1 multiple hits'][sample_id] += 1

            for read in pe2_reads:
                if pe2_reads[read].get_status() == 'function,besthit' or pe2_reads[read].get_status() == 'function':
                    hits = pe2_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe2 single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe2 multiple hits'][sample_id] += 1

            for read in pe1_multiple_functions:
                if pe1_reads[read].get_status() == 'function,besthit' or pe1_reads[read].get_status() == 'function':
                    hits = pe1_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe1 multiple functions, single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe1 multiple functions, multiple hits'][sample_id] += 1

            for read in pe2_multiple_functions:
                if pe2_reads[read].get_status() == 'function,besthit' or pe2_reads[read].get_status() == 'function':
                    hits = pe2_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe2 multiple functions, single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe2 multiple functions, multiple hits'][sample_id] += 1

            for read in both_ends_mapped_reads:
                hits = pe1_reads[read].get_hit_list()
                if len(hits.get_hits()) == 1:
                    sample_stats['pe1 single hit, both ends mapped'][sample_id] += 1
                elif len(hits.get_hits()) > 1:
                    sample_stats['pe1 multiple hits, both ends mapped'][sample_id] += 1
                hits = pe2_reads[read].get_hit_list()
                if len(hits.get_hits()) == 1:
                    sample_stats['pe2 single hit, both ends mapped'][sample_id] += 1
                elif len(hits.get_hits()) > 1:
                    sample_stats['pe2 multiple hits, both ends mapped'][sample_id] += 1

            # Free the per-sample read data before loading the next sample.
            self.project.samples[sample_id].reads = None

        with open('outfile.tsv', 'w') as of:
            for sample_id in self.project.list_samples():
                of.write('\t' + sample_id)
            of.write('\n')
            for item in sorted(sample_stats.keys()):
                of.write(item)
                for sample_id in self.project.list_samples():
                    if sample_id in sample_stats[item]:
                        of.write('\t' + str(sample_stats[item][sample_id]))
                    else:
                        of.write('\t0')
                of.write('\n')
            of.close()  # redundant: the 'with' block closes the file anyway

        self.assertEqual(len(self.project.samples), 6)

    @unittest.skip("for faster testing")
    def test_top_size(self):
        # Counts, per query, how many hits fall within the bitscore range of
        # the query's first hit, after identity/length filtering, and writes
        # one line per query with the read's mapping status.
        # NOTE(review): depends on a hard-coded absolute tsvfile path, and the
        # final query group is never written out when the loop ends — confirm
        # whether that is intended before relying on the output.
        print ('Load reads from JSON')
        sample_id = 'sample3'
        self.project.import_reads_json(sample_id,['pe1',])

        tsvfile = '/mnt/data3/FEBA/4703/nitrogen_v7.1_fama/sample3_pe1_bgr_tabular_output.txt'
        outfile = 'top_hit_count.txt'

        current_query_id = None
        top_size = 0
        identity_cutoff = 50.0
        length_cutoff = 15
        bitscore_range_cutoff = 0.97
        bitscore_cutoff = 0.0
        print ('Identity cutoff: ', identity_cutoff, ', Length cutoff: ', length_cutoff)

        with open(outfile, 'w') as of:
            with open(tsvfile, 'r', newline='') as f:
                tsvin = csv.reader(f, delimiter='\t')
                for row in tsvin:
                    if current_query_id is None:
                        current_query_id = row[0]
                        bitscore_cutoff = float(row[11]) * bitscore_range_cutoff

                    # filtering by identity and length
                    if float(row[2]) < identity_cutoff:
                        continue # skip this line
                    if float(row[3]) < length_cutoff:
                        continue # skip this line

                    # New query: flush the count for the previous query group.
                    if row[0] != current_query_id:
                        read_id = current_query_id.split('|')[0]
                        of.write(read_id + '\t' + str(top_size) + '\t' + self.project.samples[sample_id].reads['pe1'][read_id].get_status() + '\n')
                        current_query_id = row[0]
                        top_size = 0
                        bitscore_cutoff = float(row[11]) * bitscore_range_cutoff

                    if float(row[11]) >= bitscore_cutoff:
                        top_size += 1
                f.closed  # no-op; the 'with' blocks close both files
            of.closed

        
    def tearDown(self):
        """Release the project fixture after each test."""
        self.project = None