def setUp(self):
    project = Project(config_file=config_path, project_file=project_path)
    sample = Sample(sample_id=sample_id)
    sample.load_sample(project.options)
    self.parser = DiamondParser(config=project.config,
                                options=project.options,
                                taxonomy_data=project.taxonomy_data,
                                ref_data=project.ref_data,
                                sample=sample,
                                end=end)
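These fixtures reference module-level names (config_path, project_path, sample_id, sample, end, ENDS) that are defined elsewhere in the test modules. A minimal sketch of what they might look like, with hypothetical paths and identifiers:

# Assumed module-level test fixtures; actual values are project-specific
# and are not shown in these snippets.
config_path = 'config.ini'    # program configuration file (hypothetical path)
project_path = 'project.ini'  # project options file (hypothetical path)
sample_id = 'sample1'         # sample identifier used by the fixtures
sample = 'sample1'            # some snippets use the name "sample" instead
end = 'pe1'                   # end identifier: 'pe1' or 'pe2'
ENDS = ('pe1', 'pe2')         # both read ends for paired-end data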
def findProject(self, path):
    query = 'select name, path from projects where path = "{}"'.format(path)
    result = self.db.fetch(query)
    if len(result) > 0:
        name, path = result[0]
        project = Project(name, path, self.db)
        return project
    else:
        raise Exception
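Building the query with str.format leaves this lookup open to SQL injection if path is untrusted. A safer sketch using a parameterized query, assuming a standard DB-API connection such as sqlite3 (whether the custom self.db.fetch wrapper supports parameters is unknown):

# Hedged alternative: parameterized lookup against a raw DB-API connection.
import sqlite3

def find_project(conn, path):
    cur = conn.execute('SELECT name, path FROM projects WHERE path = ?', (path,))
    row = cur.fetchone()
    if row is None:
        raise LookupError('no project found at {}'.format(path))
    return row  # (name, path)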
def main():
    """Main function calling functional profiling module"""
    args = get_args()
    if args.prot:
        protein_pipeline(args)
    else:
        if args.sample is None:
            print('Running functional profiling for all samples in the project')
        else:
            print('Running functional profiling only for', args.sample)
        project = Project(config_file=args.config, project_file=args.project)
        if project.is_paired_end():
            fastq_pe_pipeline(project,
                              sample_identifier=args.sample,
                              end_identifier=args.end)
        else:
            fastq_pipeline(project,
                           sample_identifier=args.sample,
                           end_identifier=args.end)
    print('Done!')
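get_args() is not shown in these snippets; judging from the attributes main() reads (config, project, sample, end, prot), it is presumably a thin argparse wrapper along these lines (flag names and help text are assumptions, not the project's actual definitions):

import argparse

def get_args():
    # Sketch only: the real flag names in the project may differ.
    parser = argparse.ArgumentParser(description='Functional profiling pipeline')
    parser.add_argument('--config', help='Path to program config ini file')
    parser.add_argument('--project', help='Path to project options ini file')
    parser.add_argument('--sample', default=None, help='Sample identifier (optional)')
    parser.add_argument('--end', default=None, help="End identifier: 'pe1' or 'pe2'")
    parser.add_argument('--prot', action='store_true',
                        help='Run the protein pipeline instead of the FASTQ pipeline')
    return parser.parse_args()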
def createProject(self, name='', path=''):
    if name == '':
        print('Project name: ')
        name = self.stdin.read()
    if path == '':
        print('Project path: ')
        path = self.stdin.read()
    project = Project(name, path, self.db)
    # project.create()
    project.discover()
    project.save(self.db)
    return project
def protein_pipeline(args):
    """Functional profiling pipeline for the entire project.

    Args:
        args: ArgumentParser namespace with defined args.config (path to
            program config ini file) and args.project (path to project
            options ini file)
    """
    project = Project(config_file=args.config, project_file=args.project)
    sample_ids = []
    for sample_id in project.list_samples():
        if args.sample is not None:
            if args.sample != sample_id:
                continue
        sample = Sample(sample_id)
        sample.load_sample(project.options)
        project.samples[sample_id] = sample
        project.samples[sample_id].is_paired_end = False
        project.samples[sample_id].rpkg_scaling_factor = None
        project.samples[sample_id].rpkm_scaling_factor = None
        sample_ids.append(sample_id)
    for sample_id in sample_ids:
        # End identifier in protein pipeline is always pe1
        project.samples[sample_id].reads['pe1'] = functional_profiling_pipeline(
            project, sample=project.samples[sample_id])
        export_sample(project.samples[sample_id])
        # Generate output for the sample or delete sample from memory
        generate_protein_sample_report(project, sample_id, metric='proteincount')
        project.options.set_sample_data(project.samples[sample_id])
    # Generate output for the project
    if args.sample is None:
        # Skip project report if the pipeline is running for only one sample
        generate_protein_project_report(project)
        generate_output(project)
    project.save_project_options()
class TaxonomyProfilingTest(unittest.TestCase):
    def setUp(self):
        self.project = Project(config_file=config_path, project_file=project_path)
        self.project.load_project()
        self.parser = DiamondParser(config=self.project.config,
                                    options=self.project.options,
                                    taxonomy_data=self.project.taxonomy_data,
                                    ref_data=self.project.ref_data,
                                    sample=self.project.samples[sample_id],
                                    end=end)
        self.taxonomy_data = TaxonomyData(self.project.config, '')
        self.node = Node(rank='genus', name='Escherichia', taxid='561',
                         parent='543', children=None)
        self.tree = Tree()

    # Tests of TaxonomyData class
    def test_0010_init_taxonomy_data(self):
        self.assertEqual(len(self.taxonomy_data.data), 6568)  # Nitrogen collection

    def test_0020_is_exist(self):
        self.assertTrue(self.taxonomy_data.is_exist('1'))
        self.assertTrue(self.taxonomy_data.is_exist('0'))
        self.assertTrue(self.taxonomy_data.is_exist('562'))
        self.assertFalse(self.taxonomy_data.is_exist('fake_identifier'))

    def test_0030_get_name(self):
        self.assertEqual(self.taxonomy_data.get_name('1'), 'root')
        self.assertEqual(self.taxonomy_data.get_name('0'), 'Unknown')
        self.assertEqual(self.taxonomy_data.get_name('562'), 'Escherichia coli')

    def test_0040_get_rank(self):
        self.assertEqual(self.taxonomy_data.get_rank('1'), 'norank')  # Test root
        self.assertEqual(self.taxonomy_data.get_rank('0'), 'norank')  # Test Unknown
        self.assertEqual(self.taxonomy_data.get_rank('2'), 'superkingdom')  # Test Bacteria
        self.assertEqual(self.taxonomy_data.get_rank('562'), 'species')  # Test E. coli

    def test_0050_get_parent(self):
        self.assertEqual(self.taxonomy_data.get_parent('1'), '1')  # Test root
        self.assertEqual(self.taxonomy_data.get_parent('0'), '1')  # Test Unknown
        self.assertEqual(self.taxonomy_data.get_parent('2'), '131567')  # Test Bacteria
        self.assertEqual(self.taxonomy_data.get_parent('562'), '561')  # Test E. coli

    def test_0060_get_lca(self):
        self.assertEqual(self.taxonomy_data.get_lca(['1']), '1')  # Test root
        self.assertEqual(self.taxonomy_data.get_lca(['']), '0')  # Test empty string
        self.assertEqual(self.taxonomy_data.get_lca([]), '0')  # Test empty list
        self.assertEqual(self.taxonomy_data.get_lca(['0']), '0')  # Test Unknown
        # Anything with root goes to Unknown
        self.assertEqual(self.taxonomy_data.get_lca(['571', '1']), '0')  # K. oxytoca, root
        # Anything with Unknown is ignored
        self.assertEqual(self.taxonomy_data.get_lca(['571', '0']), '571')  # K. oxytoca, Unknown
        self.assertEqual(self.taxonomy_data.get_lca(['2', '2157']), '131567')  # Bacteria, Archaea
        self.assertEqual(self.taxonomy_data.get_lca(['571', '573']), '570')  # K. oxytoca, K. pneumoniae

    def test_0070_get_upper_level_taxon(self):
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('1'),
                         ('1', 'norank'))  # Test root
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('0'),
                         ('1', 'norank'))  # Test Unknown
        # For Bacteria, returns 1 (root), not 131567 ('cellular organisms')
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('2'),
                         ('1', 'norank'))  # Test Bacteria
        # For 651137 (Thaumarchaeota), returns 2157 (Archaea), not 1783275 ('TACK group')
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('651137'),
                         ('2157', 'superkingdom'))
        # Test 44260 (Moorella): skip parent taxon 42857 and report
        # 186814 (Thermoanaerobacteriaceae family)
        self.assertEqual(self.taxonomy_data.get_upper_level_taxon('44260'),
                         ('186814', 'family'))

    def test_0080_get_taxonomy_lineage(self):
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('1'), '')  # Test root
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('0'), 'Unknown')  # Test Unknown
        # For Bacteria, the lineage skips 131567 ('cellular organisms')
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('2'), 'Bacteria')  # Test Bacteria
        # Test 651137 (Thaumarchaeota): reports 2157 (Archaea), not 1783275 ('TACK group')
        self.assertEqual(self.taxonomy_data.get_taxonomy_lineage('651137'),
                         'Archaea_Thaumarchaeota')
        # Test 1525 (Moorella thermoacetica): full lineage down to species
        self.assertEqual(
            self.taxonomy_data.get_taxonomy_lineage('1525'),
            'Bacteria_Firmicutes_Clostridia_Thermoanaerobacterales_'
            'Thermoanaerobacteraceae_Moorella_Moorella_thermoacetica'
        )

    # Tests of Node class
    def test_0090_init_node(self):
        node = Node(rank='genus', name='Escherichia', taxid='561', parent='543',
                    children=None)
        self.assertEqual(node.rank, 'genus')
        self.assertEqual(node.name, 'Escherichia')
        self.assertEqual(node.taxid, '561')
        self.assertEqual(node.parent, '543')

    def test_0100_add_child(self):
        self.node.add_child('562')
        self.assertEqual(len(self.node.children), 1)
        self.assertTrue(self.node.has_child('562'))
        self.assertFalse(self.node.has_child('561'))

    def test_0110_set_parent(self):
        node = Node(rank='species')
        node.set_parent('')
        self.assertIsNone(node.parent)
        # Set parent if None
        node.set_parent('561')
        self.assertEqual(node.parent, '561')
        # Change parent
        node.set_parent('2')
        self.assertEqual(node.parent, '2')

    def test_0120_set_taxid(self):
        node = Node(rank='species')
        # Set taxid if None
        node.set_taxid('561')
        self.assertEqual(node.taxid, '561')
        # Changing taxid is not possible
        node.set_taxid('2')
        self.assertEqual(node.taxid, '561')

    def test_0130_set_rank(self):
        self.assertEqual(self.node.rank, 'genus')
        # Change rank
        self.assertTrue(self.node.set_rank('species'))
        self.assertEqual(self.node.rank, 'species')
        # Rank must be defined in RANKS
        self.assertFalse(self.node.set_rank('#^$%@!'))
        self.assertEqual(self.node.rank, 'species')

    def test_0140_set_attribute(self):
        self.node.set_attribute('score', 0.001)
        self.assertEqual(self.node.attributes['score'], 0.001)
        self.node.set_attribute('score', 1)
        self.assertEqual(self.node.attributes['score'], 1)

    def test_0150_add_attribute(self):
        self.node.set_attribute('score_float', 0.5)
        self.node.add_attribute('score_float', 0.2)
        self.assertEqual(self.node.attributes['score_float'], 0.7)
        self.node.set_attribute('score_int', 1)
        self.node.add_attribute('score_int', 999)
        self.assertEqual(self.node.attributes['score_int'], 1000)

    def test_0160_get_attribute(self):
        self.node.set_attribute('score_float', 0.5)
        self.assertEqual(self.node.get_attribute('score_float'), 0.5)
        self.assertIsNone(self.node.get_attribute('nonexisting_key'))

    def test_0170_is_in_children(self):
        self.node.add_child('562')
        self.assertEqual(len(self.node.children), 1)
        self.assertTrue(self.node.has_child('562'))
        self.assertFalse(self.node.has_child('561'))

    # Tests of Tree class
    def test_0180_init_tree(self):
        tree = Tree()
        self.assertEqual(tree.root.taxid, '1')
        self.assertEqual(len(tree.data), 1)
        self.assertEqual(tree.data[ROOT_TAXONOMY_ID].taxid, ROOT_TAXONOMY_ID)

    def test_0190_add_node(self):
        # Adding node with non-existing parent must fail
        self.assertFalse(self.tree.add_node(
            Node(rank='genus', name='Escherichia', taxid='561', parent='543',
                 children=None)))
        # Adding empty node must fail
        self.assertFalse(self.tree.add_node(Node(rank='species')))
        # Adding second root must fail
        self.assertFalse(self.tree.add_node(
            Node(rank='norank', name='root', taxid='1', parent='1', children=None)))
        # Adding node with existing parent must succeed
        self.assertTrue(self.tree.add_node(
            Node(rank='superkingdom', name='Bacteria', taxid='2', parent='1',
                 children=None)))
        self.assertEqual(len(self.tree.get_node('1').children), 1)
        # Adding node if its parent has children must succeed
        self.assertTrue(self.tree.add_node(
            Node(rank='superkingdom', name='Archaea', taxid='2157', parent='1',
                 children=None)))
        self.assertEqual(len(self.tree.get_node('1').children), 2)

    def test_0200_get_node(self):
        self.tree.add_node(
            Node(rank='superkingdom', name='Bacteria', taxid='2', parent='1',
                 children=None))
        # Getting existing node must succeed
        self.assertEqual(self.tree.get_node('2').parent, '1')
        # Getting non-existing node must fail
        self.assertIsNone(self.tree.get_node('561'))

    def test_0210_is_in_tree(self):
        self.tree.add_node(
            Node(rank='superkingdom', name='Bacteria', taxid='2', parent='1',
                 children=None))
        self.assertTrue(self.tree.is_in_tree('2'))
        self.assertFalse(self.tree.is_in_tree('562'))
        self.assertFalse(self.tree.is_in_tree(True))

    def test_0220_add_node_recursively(self):
        self.assertFalse(self.tree.is_in_tree('562'))
        self.assertTrue(self.tree.add_node_recursively(
            Node(rank='species', name='Escherichia coli', taxid='562',
                 parent='561', children=None),
            self.taxonomy_data))
        self.assertTrue(self.tree.is_in_tree('562'))
        self.assertTrue(self.tree.is_in_tree('561'))
        self.assertTrue(self.tree.is_in_tree('2'))
        # Adding second root must fail
        self.assertFalse(self.tree.add_node_recursively(
            Node(rank='norank', name='root', taxid='1', parent='1', children=None),
            self.taxonomy_data))
        # Adding node with existing parent must succeed
        self.assertFalse(self.tree.is_in_tree('564'))
        self.assertTrue(self.tree.add_node_recursively(
            Node(rank='species', name='Escherichia fergusonii', taxid='564',
                 parent='561', children=None),
            self.taxonomy_data))
        self.assertTrue(self.tree.is_in_tree('564'))

    def test_0230_add_attribute(self):
        self.tree.add_node(
            Node(rank='superkingdom', name='Bacteria', taxid='2', parent='1',
                 children=None))
        # Add attribute to existing node
        self.assertFalse(self.tree.get_node('2').attributes)
        self.tree.add_attribute('2', 'score_float', 0.42, self.taxonomy_data)
        self.assertTrue(self.tree.get_node('2').attributes)
        self.assertEqual(self.tree.get_node('2').attributes['score_float'], 0.42)
        # Add attribute to non-existing node
        self.assertFalse(self.tree.get_node('1').attributes)
        self.tree.add_attribute('2157', 'score_float', 0.42, self.taxonomy_data)
        self.assertTrue(self.tree.get_node('1').attributes)
        self.assertEqual(self.tree.get_node('1').attributes['score_float'], 0.42)

    def test_0240_add_attribute_recursively(self):
        self.tree.add_node(
            Node(rank='superkingdom', name='Bacteria', taxid='2', parent='1',
                 children=None))
        # Add attribute to existing node
        self.assertFalse(self.tree.get_node('2').attributes)
        self.tree.add_attribute_recursively('2', 'score_float', 0.42,
                                            self.taxonomy_data)
        self.assertTrue(self.tree.get_node('2').attributes)
        self.assertEqual(self.tree.get_node('2').attributes['score_float'], 0.42)
        self.assertEqual(self.tree.get_node('1').attributes['score_float'], 0.42)
        # Add attribute to non-existing node
        self.tree.add_attribute_recursively('2157', 'score_float', 0.42,
                                            self.taxonomy_data)
        self.assertEqual(self.tree.get_node('1').attributes['score_float'], 0.84)

    def test_0250_get_parent(self):
        self.assertTrue(self.tree.add_node_recursively(
            Node(rank='species', name='Escherichia coli', taxid='562',
                 parent='561', children=None),
            self.taxonomy_data))
        # Getting parent of existing node must succeed
        node = self.tree.get_node('561')
        self.assertEqual(node.parent, '543')
        parent_node = self.tree.get_parent(node, self.taxonomy_data)
        self.assertEqual(parent_node.taxid, '543')
        # Getting parent of non-existing node must fail
        node = Node(rank='species')
        parent_node = self.tree.get_parent(node, self.taxonomy_data)
        self.assertIsNone(parent_node)

    # Tests of TaxonomyProfile class
    def test_0260_init_taxonomy_profile(self):
        taxonomy_profile = TaxonomyProfile()
        self.assertIsNotNone(taxonomy_profile.tree)
        self.assertEqual(len(taxonomy_profile.tree.data), 1)
        self.assertEqual(taxonomy_profile.tree.root.taxid, ROOT_TAXONOMY_ID)
        self.assertEqual(taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].taxid,
                         ROOT_TAXONOMY_ID)

    def test_0270_make_function_taxonomy_profile(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        print(taxonomy_profile)
        self.assertEqual(len(taxonomy_profile.tree.data), 22)
        self.assertEqual(
            taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].attributes['NirK']['count'],
            1.0)
        self.assertEqual(
            taxonomy_profile.tree.data[ROOT_TAXONOMY_ID].attributes['UreA']['count'],
            3.0)
        self.assertEqual(
            taxonomy_profile.tree.data['118883'].attributes['UreC']['count'], 1.0)

    def test_0280_str(self):
        taxonomy_profile = TaxonomyProfile()
        self.assertEqual(
            str(taxonomy_profile),
            '1\tnorank\troot\tParent:None\tChildren:None\tScore:N/A'
            '\tIdentity:N/A\tRead count:N/A\n')

    def test_0290_stringify_node(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        self.assertEqual(
            taxonomy_profile.stringify_node('118883', 0),
            "118883\tfamily\tSulfolobaceae\tParent:2281\tChildren:None"
            "\tUreC:{'count': 1.0, 'hit_count': 1.0, 'identity': 68.8,"
            " 'fpk': 0.5984440454817475}\n")

    def test_0300_convert_profile_into_df(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        result = taxonomy_profile.convert_profile_into_df(metric='fpk')
        self.assertEqual(result.iloc[0][1], 'root')
        self.assertEqual(result.iloc[1][0], 'superkingdom')

    def test_0310_convert_node_into_dict(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        print(sample_scores_taxonomy)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        result, attributes = taxonomy_profile.convert_node_into_dict(
            '118883', ['UreC', 'UreA'], 1, metric='fpk')
        self.assertEqual(result[1][('', 'Taxon name')], 'Sulfolobaceae')
        self.assertEqual(result[1][('UreC', '1.Score')], 0.5984440454817475)
        self.assertEqual(result[1][('UreA', '1.Score')], 0.0)
        self.assertEqual(attributes['UreC']['fpk'], 0.5984440454817475)
        self.assertEqual(attributes['UreA']['fpk'], 0.0)

    def test_0320_convert_profile_into_score_df(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        result = taxonomy_profile.convert_profile_into_score_df(metric='fpk')
        self.assertEqual(result.iloc[0][1], 'root')
        self.assertEqual(result.iloc[1][0], 'superkingdom')

    def test_0330_convert_node_into_values_dict(self):
        self.project.import_reads_json(sample_id, ENDS)
        scores = get_function_taxonomy_scores(self.project, sample_id=sample_id,
                                              metric='fpk')
        sample_scores_taxonomy = slice_function_taxonomy_scores(scores, sample_id)
        taxonomy_profile = TaxonomyProfile()
        taxonomy_profile.make_function_taxonomy_profile(
            self.project.taxonomy_data, sample_scores_taxonomy)
        result, attributes = taxonomy_profile.convert_node_into_values_dict(
            '118883', ['UreC', 'UreA'], 1, metric='fpk')
        self.assertEqual(result[1][('', 'Taxon name')], 'Sulfolobaceae')
        self.assertEqual(result[1][('UreC', 'fpk')], 0.5984440454817475)
        self.assertEqual(result[1][('UreA', 'fpk')], 0.0)
        self.assertEqual(attributes['UreC']['fpk'], 0.5984440454817475)
        self.assertNotIn('UreA', attributes.keys())

    def tearDown(self):
        self.parser = None
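The get_lca expectations in test_0060 follow the classic lowest-common-ancestor walk over parent links, with two special cases for root and Unknown. A minimal illustrative sketch of that logic over a plain parent-pointer map (this is not Fama's actual implementation; the parents dict and helper names are assumptions):

# Minimal LCA sketch over a taxid -> parent-taxid map; the root points to itself.
ROOT, UNKNOWN = '1', '0'

def lineage(parents, taxid):
    """Path from taxid up to the root, inclusive."""
    path = [taxid]
    while path[-1] != ROOT:
        path.append(parents[path[-1]])
    return path

def get_lca(parents, taxids):
    taxids = [t for t in taxids if t not in ('', UNKNOWN)]  # Unknown entries are ignored
    if not taxids:
        return UNKNOWN
    if ROOT in taxids:
        # Root alone stays root; root mixed with real taxa collapses to Unknown
        return ROOT if all(t == ROOT for t in taxids) else UNKNOWN
    common = set.intersection(*(set(lineage(parents, t)) for t in taxids))
    # The deepest shared ancestor is the first common node on any lineage walk
    for t in lineage(parents, taxids[0]):
        if t in common:
            return t
    return ROOT

# Example parent map (hypothetical subset of the NCBI taxonomy) and a quick check:
parents = {'1': '1', '131567': '1', '2': '131567', '2157': '131567'}
assert get_lca(parents, ['2', '2157']) == '131567'  # Bacteria + Archaea -> cellular organisms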
class FamaReportTest(unittest.TestCase):
    def setUp(self):
        self.project = Project(config_file=config_path, project_file=project_path)
        self.project.load_project()
        self.parser = DiamondParser(config=self.project.config,
                                    options=self.project.options,
                                    taxonomy_data=self.project.taxonomy_data,
                                    ref_data=self.project.ref_data,
                                    sample=self.project.samples[sample_id],
                                    end=end)

    @unittest.skip("for faster testing")
    def test_1_collection_pdf_output(self):
        outfile = os.path.join(data_dir, 'collection_list.pdf')
        if os.path.exists(outfile):
            os.remove(outfile)
        pdf = FPDF('P', 'mm', 'Letter')
        pdf.add_page()
        pdf.add_font('DejaVu', '',
                     '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed.ttf',
                     uni=True)
        pdf.add_font('DejaVu', 'B',
                     '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed-Bold.ttf',
                     uni=True)
        pdf.set_font('DejaVu', 'B', 16)
        pdf.cell(40, 10, 'List of functions in collection ' + self.parser.collection)
        pdf.ln(h='')
        pdf.set_font('DejaVu', '', 10)
        v_limit = 40
        for function in self.parser.ref_data.functions_dict:
            v_limit += 10
            pdf.cell(v_limit, 10,
                     function + ' '
                     + self.parser.ref_data.functions_dict[function]['name']
                     + ': group '
                     + self.parser.ref_data.functions_dict[function]['group'])
            pdf.ln(h=10)
        pdf.output(outfile, 'F')
        self.assertTrue(os.path.exists(outfile))
        # If function names contain 'bad' symbols, this test fails

    @unittest.skip("for faster testing")
    def test_2_get_functions_in_group(self):
        urease_list = self.parser.ref_data.get_functions_in_group('Urease')
        self.assertEqual(len(urease_list), 3)
        self.assertEqual(sorted(urease_list)[0], 'UreA')

    # @unittest.skip("for faster testing")
    def test_3_generate_pdf_report(self):
        self.parser.parse_background_output()
        generate_pdf_report(self.parser)

    @unittest.skip("for faster testing")
    def test_4_generate_functions_chart(self):
        self.parser.parse_background_output()
        generate_functions_chart(self.parser)

    @unittest.skip("for faster testing")
    def test_5_generate_functions_xlsx(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        scores = get_function_scores(self.project, sample_id=None, metrics=metrics)
        generate_function_sample_xlsx(self.project, scores, metrics=metrics,
                                      sample_id=None)
        self.assertTrue(len(scores) > 0)

    @unittest.skip("for faster testing")
    def test_6_generate_functions_markdown(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        create_functions_markdown_document(self.project)

    @unittest.skip("for faster testing")
    def test_7_generate_function_table(self):
        function = 'AmoA'
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        outfile = 'test_output_20181130.txt'
        with open(outfile, 'w') as of:
            for read_id, read in self.project.samples[sample_id].reads[end].items():
                for hit in read.get_hit_list().get_hits():
                    if function in hit.get_functions():
                        print(read_id, function, read.status, hit.get_subject_id())
                        of.write('\t'.join([read_id, function, read.status,
                                            hit.get_subject_id()]) + '\n')

    @unittest.skip("for faster testing")
    def test_8_predict_insert_size(self):
        # Should take into account
        # https://github.com/ksahlin/GetDistr/blob/master/getdistr/model.py
        sample_id = 'sample1'
        self.project.import_reads_json(sample_id, self.project.ENDS)
        outfile = sample_id + '_insert_size_data.txt'
        gene_length_threshold = 150
        alignment_length_threshold = 15
        read_data = defaultdict(dict)
        print('pe1 reads', str(len(self.project.samples[sample_id].reads['pe1'])))
        print('pe2 reads', str(len(self.project.samples[sample_id].reads['pe2'])))
        for read_id, read1 in self.project.samples[sample_id].reads['pe1'].items():
            if read1.get_status() != 'function':
                continue
            if read_id not in self.project.samples[sample_id].reads['pe2']:
                continue
            read2 = self.project.samples[sample_id].reads['pe2'][read_id]
            if read2.get_status() != 'function':
                continue
            for hit in read1.get_hit_list().get_hits():
                if hit.get_subject_id() not in [
                        h.get_subject_id() for h in read2.get_hit_list().get_hits()]:
                    continue  # different target proteins: skipped
                if hit.s_len * 3 < gene_length_threshold:
                    continue  # target protein shorter than threshold: skipped
                if hit.s_end - hit.s_start < alignment_length_threshold:
                    continue
                for hit2 in read2.get_hit_list().get_hits():
                    if hit.get_subject_id() != hit2.get_subject_id():
                        continue
                    if hit2.s_end - hit2.s_start < alignment_length_threshold:
                        continue
                    # Both ends hit the same protein and pass the length cutoffs:
                    # estimate insert size in nucleotides, counting the
                    # unaligned overhangs of both reads
                    if (hit.s_end - hit2.s_start) > (hit2.s_end - hit.s_start):
                        insert_size = (3 * (hit.s_end - hit2.s_start)
                                       + hit2.q_start - 1
                                       + len(read2.sequence) - hit.q_end)
                    else:
                        insert_size = (3 * (hit2.s_end - hit.s_start)
                                       + hit.q_start - 1
                                       + len(read1.sequence) - hit2.q_end)
                    read_data[read_id]['tlen'] = insert_size
                    read_data[read_id]['rlen'] = (len(read1.sequence)
                                                  + len(read2.sequence)) / 2
                    read_data[read_id]['ref_len'] = hit.s_len * 3
                    read_data[read_id]['ref_name'] = hit.get_subject_id()
                    if not read_data[read_id]['tlen'] > 0:
                        print(read_id, str(read_data[read_id]['rlen']),
                              str(insert_size), str(read_data[read_id]['ref_len']),
                              read_data[read_id]['ref_name'])
                        print(hit)
                        print(hit2)
                    break
        avg_fragment_length = get_lib_est(read_data,
                                          self.project.options.get_work_dir())
        with open(outfile, 'w') as of:
            for read_id in read_data:
                of.write(read_data[read_id]['ref_name'] + '\t'
                         + str(read_data[read_id]['ref_len']) + '\t'
                         + str(read_data[read_id]['rlen']) + '\t'
                         + str(read_data[read_id]['tlen']) + '\n')
        self.assertTrue(int(avg_fragment_length) > 0)

    @unittest.skip("for faster testing")
    def test_9_find_fragment_length(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
            avg_fragment_length = self.project.samples[sample_id].estimate_average_insert_size(
                self.project.config.get_length_cutoff(
                    self.project.options.get_collection(sample_id)))
            print('Insert size for', sample_id, 'is', str(avg_fragment_length))
            self.assertTrue(int(avg_fragment_length) > 0)

    @unittest.skip("for faster testing")
    def test_10_generate_markdown(self):
        sample_id = 'sample1'
        metrics = 'efpkg'
        self.project.import_reads_json(sample_id, self.project.ENDS)
        scores = get_function_scores(self.project, sample_id=sample_id,
                                     metrics=metrics)
        generate_project_markdown_document(self.project, scores,
                                           sample_id=sample_id, metrics=metrics)
        outfile = sanitize_file_name(
            os.path.join(self.project.options.get_work_dir(), 'index.md'))
        with open(outfile, 'r') as f:
            line = f.readline()
        self.assertEqual(line, '# ' + self.project.options.get_name() + '\n')

    @unittest.skip("for faster testing")
    def test_11_generate_functions_stamp_input(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        # metrics = 'fragmentcount'
        scores = get_function_scores(self.project, metrics=metrics)
        generate_functions_stamp_input(self.project, scores, metrics)

    @unittest.skip("for faster testing")
    def test_12_generate_functions_taxonomy_stamp_input(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        # metrics = 'fragmentcount'
        scores = get_function_taxonomy_scores(self.project, metrics=metrics)
        generate_functions_taxonomy_stamp_input(self.project, scores, metrics)

    @unittest.skip("for faster testing")
    def test_5_generate_functions_samples_xlsx(self):
        for sample_id in self.project.list_samples():
            self.project.import_reads_json(sample_id, self.project.ENDS)
        metrics = 'efpkg'
        rank = 'phylum'
        scores = get_function_taxonomy_scores(self.project, metrics=metrics)
        generate_sample_taxonomy_function_xlsx(self.project, scores,
                                               metrics=metrics, rank=rank)
        self.assertTrue(len(scores) > 0)

    @unittest.skip("for faster testing")
    def test_collect_taxonomic_data(self):
        print('Load project from JSON')
        sample_id = 'FW306-ZV-1'
        metrics = 'efpkg'
        self.project.import_reads_json(sample_id, ENDS)
        sample_scores = get_function_taxonomy_scores(self.project,
                                                     sample_id=sample_id,
                                                     metrics=metrics)
        with open('sample1_taxonomy_scores.tsv', 'w') as of:
            of.write('Taxonomy_ID\tFunction_ID\tScore\n')
            for taxonomy_id in sorted(sample_scores.keys()):
                for function_id in sorted(sample_scores[taxonomy_id].keys()):
                    of.write('\t'.join([
                        taxonomy_id, function_id,
                        str(sample_scores[taxonomy_id][function_id][sample_id][metrics])
                    ]) + '\n')
        self.assertEqual(len(self.project.samples), 6)

    def tearDown(self):
        self.parser = None
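The insert-size arithmetic in test_8 converts protein-space alignment coordinates back to nucleotides (multiply by 3) and then adds back the unaligned read overhangs on both sides. A worked example with made-up coordinates (all numbers below are hypothetical, chosen only to illustrate the formula):

# Worked example of the insert-size formula from test_8.
s_end_1, s_start_2 = 120, 80   # subject (protein) coords: end of hit1, start of hit2
q_start_2, q_end_1 = 4, 90     # query coords: start of read2's hit, end of read1's hit
read2_len = 150                # length of read2 in nucleotides
insert_size = 3 * (s_end_1 - s_start_2) + q_start_2 - 1 + read2_len - q_end_1
# = 3*40 + 3 + 60 = 183 nucleotides between the outer ends of the fragment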
def rename_sample(config_file, project_file, old_id, new_id):
    """Does all the work needed to rename a sample.

    Args:
        config_file (str): path to program config ini file
        project_file (str): path to project options ini file
        old_id (str): existing sample identifier
        new_id (str): new sample identifier
    """
    project = Project(config_file=config_file, project_file=project_file)
    project.load_project()
    check_id(project, old_id, new_id)
    # Rename files
    rename_files(project, old_id, new_id)
    # Change samples
    new_sample = project.samples[old_id]
    new_sample.sample_id = new_id
    # Delete sample output directory
    shutil.rmtree(os.path.join(project.options.get_project_dir(old_id),
                               project.options.get_output_subdir(old_id)))
    items = []
    for option in project.options.parser.options(old_id):
        if option not in project.options.parser.defaults():
            items.append([option, project.options.parser.get(old_id, option)])
    project.options.parser.add_section(new_id)
    for item in items:
        project.options.parser.set(new_id, item[0], item[1])
    project.options.parser.remove_section(old_id)
    project.samples.pop(old_id, None)
    project.samples[new_id] = new_sample
    project.options.set_sample_data(project.samples[new_id])
    project.save_project_options()
    # Re-open project with new version of project file
    project = Project(config_file=config_file, project_file=project_file)
    project.load_project()
    os.mkdir(os.path.join(project.options.get_project_dir(new_id),
                          project.options.get_output_subdir(new_id)))
    project.import_reads_json(new_id, ENDS)
    for end in ENDS:
        if not project.samples[new_id].is_paired_end and end == 'pe2':
            continue
        parser = DiamondParser(config=project.config,
                               options=project.options,
                               taxonomy_data=project.taxonomy_data,
                               ref_data=project.ref_data,
                               sample=project.samples[new_id],
                               end=end)
        parser.reads = project.samples[new_id].reads[end]
        # Re-create output files for the renamed sample
        generate_fastq_report(parser)
        generate_pdf_report(parser)
        make_functions_chart(parser)
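A call to this helper might look like the following (paths and identifiers are hypothetical):

# rename_sample('config.ini', 'project.ini', old_id='sample1', new_id='soil_A')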
def main():
    """Loads annotated reads and calls report generators"""
    args = get_args()
    if args.prot:
        project = Project(config_file=args.config, project_file=args.project)
        project.load_project()
        for sample_id in project.list_samples():
            if args.sample is not None:
                if args.sample != sample_id:
                    continue
            project.options.set_sample_data(project.samples[sample_id])
            project.import_reads_json(sample_id, ENDS)
            fama_report.generate_protein_sample_report(project, sample_id,
                                                       metrics='readcount')
        if args.sample is None:
            # Skip project report generation if the pipeline is running
            # for only one sample
            fama_report.generate_protein_project_report(project)
            generate_output(project)
    else:
        project = Project(config_file=args.config, project_file=args.project)
        project.load_project()
        for sample_id in project.list_samples():
            if args.sample is not None and args.sample != sample_id:
                continue
            # Make sure the RPKG scaling factor exists; if it was never
            # calculated for this sample, run MicrobeCensus first
            if project.samples[sample_id].rpkg_scaling_factor is None or \
                    project.samples[sample_id].rpkg_scaling_factor == 0.0:
                project.samples[sample_id].import_rpkg_scaling_factor()
            if project.samples[sample_id].rpkg_scaling_factor is None or \
                    project.samples[sample_id].rpkg_scaling_factor == 0.0:
                run_microbecensus(sample=project.samples[sample_id],
                                  config=project.config)
                project.samples[sample_id].import_rpkg_scaling_factor()
            project.options.set_sample_data(project.samples[sample_id])
            project.import_reads_json(sample_id, ENDS)
            print('Generating report for', sample_id)
            fama_report.generate_sample_report(project, sample_id,
                                               metrics=args.metrics)
        print('Generating report for project')
        project.generate_report(metrics=args.metrics)
        project.save_project_options()
    print('Done!')
class DiamondParserTest(unittest.TestCase):
    def setUp(self):
        self.project = Project(config_file=config_path, project_file=project_path)
        for sample_id in self.project.list_samples():
            sample = Sample(sample_id=sample_id)
            sample.load_sample(self.project.options)
            self.project.samples[sample_id] = sample

    def test_1_test_pipeline(self):
        # parser = functional_profiling_pipeline(config_file=config_path,
        #     project_file=project_path, sample=sample, end=end)
        pass

    def test_2_load_proteins(self):
        self.project.import_reads_json(sample, ENDS)
        protein = 'D16-4706_contig_11213_8'
        self.assertTrue(protein in self.project.samples[sample].reads[end])
        print(self.project.samples[sample].reads[end][protein].taxonomy)

    def test_3_protein_taxonomy(self):
        self.project.import_reads_json(sample, ENDS)
        protein = 'D16-4706_contig_11213_7'
        print('D16-4706_contig_11213_7 taxonomy')
        print(self.project.samples[sample].reads[end][protein].taxonomy)
        parser = DiamondParser(config=self.project.config,
                               options=self.project.options,
                               taxonomy_data=self.project.taxonomy_data,
                               ref_data=self.project.ref_data,
                               sample=self.project.samples[sample],
                               end=end)
        parser.parse_reference_output()
        print(str(parser.reads[protein]))
        # Fields are tab-separated, matching DIAMOND tabular output
        hit_line = ('D16-4706_contig_11213_7|4|257\tfig|408672.3.peg.2637\t63.0'
                    '\t254\t94\t256\t1\t254\t2\t255\t1.1e-97\t362.1')
        hit = DiamondHit()
        hit.create_hit(tabular_output_fields=hit_line.split('\t'))
        hit_list = DiamondHitList('D16-4706_contig_11213_7|4|257')
        hit_list.add_hit(hit)
        hit_list.annotate_hits(self.project.ref_data)
        hit_list.filter_list_by_identity(self.project.ref_data)
        print('hit_list')
        print(hit_list)
        compare_protein_hits_lca(parser.reads[protein], 4, 257, hit_list, 0.03,
                                 1.0, 1.0, self.project.taxonomy_data,
                                 self.project.ref_data)
        print(parser.reads[protein].taxonomy)
        self.assertEqual(parser.reads[protein].taxonomy, '408672')

    def test_4_sample_taxonomy(self):
        with open('samples_taxonomy2.txt', 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                taxonomy_ids = []
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    taxonomy_ids.append(protein.taxonomy)
                lca_taxonomy = self.project.taxonomy_data.get_lca(taxonomy_ids)
                outfile.write('\t'.join([
                    sample_id, lca_taxonomy,
                    self.project.taxonomy_data.get_name(lca_taxonomy)]) + '\n')

    def test_5_export_proteins(self):
        with open('proteins.faa', 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    if protein.status == 'function':
                        outfile.write('>' + protein_id + '|' + sample_id + '|'
                                      + ';'.join(protein.functions.keys()) + '|'
                                      + self.project.taxonomy_data.get_name(protein.taxonomy)
                                      + '\n')
                        outfile.write(protein.sequence + '\n\n')

    def test_6_export_protein_table(self):
        out_file = os.path.join(self.project.options.work_dir, 'proteins.list.txt')
        with open(out_file, 'w') as outfile:
            for sample_id in self.project.list_samples():
                self.project.import_reads_json(sample_id, ENDS)
                for protein_id, protein in self.project.samples[sample_id].reads[end].items():
                    if protein.status == 'function':
                        protein_length = len(protein.sequence)
                        ref_length = int(protein.hit_list.hits[0].s_len)
                        outfile.write('\t'.join([
                            sample_id,
                            protein_id,
                            ';'.join(sorted(protein.functions.keys())),
                            '{0:.4f}'.format(protein_length / ref_length),
                            protein.taxonomy,
                            self.project.taxonomy_data.data[protein.taxonomy]['name']
                        ]) + '\n')

    def tearDown(self):
        self.parser = None
class ProjectTest(unittest.TestCase):
    def setUp(self):
        self.project = Project(config_file=config_path, project_file=project_path)
        for sample_id in self.project.list_samples():
            sample = Sample(sample_id=sample_id)
            sample.load_sample(self.project.options)
            self.project.samples[sample_id] = sample

    @unittest.skip("for faster testing")
    def test_project_options(self):
        print('Print list of samples1')
        options = ProjectOptions(project_path)
        print(options.parser.sections())
        self.assertEqual(len(options.parser.sections()), 6)
        self.assertEqual(options.parser.sections()[0], 'sample1')

    @unittest.skip("for faster testing")
    def test_list_samples(self):
        print('Print list of samples2')
        samples = self.project.list_samples()
        print(samples)
        self.assertEqual(len(samples), 6)
        self.assertEqual(samples[0], 'sample1')

    @unittest.skip("for faster testing")
    def test_check_project(self):
        print('Print problems found in test project: ')
        self.project.check_project()

    @unittest.skip("for faster testing")
    def test_check_config(self):
        print('Print problems found in test project: ')
        self.project.check_project()
        self.assertEqual(
            self.project.config.get_biscore_range_cutoff(
                self.project.options.get_collection()), 0.2)
        self.assertEqual(
            self.project.config.get_identity_cutoff(
                self.project.options.get_collection()), 40.0)
        ranks_cutoff = self.project.config.get_ranks_cutoffs(
            self.project.options.get_collection())
        print(ranks_cutoff)
        print(ranks_cutoff['species'])

    @unittest.skip("for faster testing")
    def test_load_project(self):
        print('Load project from INI file')
        self.project = None
        self.project = Project(config_file=config_path, project_file=project_path)
        self.project.load_project()
        self.assertEqual(len(self.project.samples), 6)

    @unittest.skip("for faster testing")
    def test_collect_fragment_stats(self):
        print('Load project from JSON')
        sample_stats = autovivify(2, int)

        def is_mapped(read_obj):
            # A read counts as mapped if it was assigned any function
            return read_obj.get_status() in ('function,besthit', 'function')

        for sample_id in self.project.list_samples():
            if not self.project.samples[sample_id].is_paired_end:
                continue
            self.project.import_reads_json(sample_id, self.project.ENDS)
            print('Mapping data loaded for sample', sample_id)
            both_ends_mapped_reads = {}
            pe1_multiple_functions = {}
            pe2_multiple_functions = {}
            sample_stats['reads_pe1_total'][sample_id] = len(
                self.project.samples[sample_id].reads['pe1'])
            sample_stats['reads_pe2_total'][sample_id] = len(
                self.project.samples[sample_id].reads['pe2'])
            pe1_reads = self.project.samples[sample_id].reads['pe1']
            pe2_reads = self.project.samples[sample_id].reads['pe2']
            for read in pe1_reads:
                if is_mapped(pe1_reads[read]):
                    if read in pe2_reads and is_mapped(pe2_reads[read]):
                        sample_stats['both ends mapped'][sample_id] += 1
                        both_ends_mapped_reads[read] = 1
                    else:
                        sample_stats['pe1 mapped only'][sample_id] += 1
            for read in pe2_reads:
                if read not in both_ends_mapped_reads and is_mapped(pe2_reads[read]):
                    sample_stats['pe2 mapped only'][sample_id] += 1
            for read in pe1_reads:
                if is_mapped(pe1_reads[read]):
                    sample_stats['reads_pe1_mapped'][sample_id] += 1
                    if len(pe1_reads[read].get_functions()) == 1:
                        sample_stats['pe1 single function'][sample_id] += 1
                    elif len(pe1_reads[read].get_functions()) > 1:
                        sample_stats['pe1 multiple functions'][sample_id] += 1
                        pe1_multiple_functions[read] = 1
            for read in pe2_reads:
                if is_mapped(pe2_reads[read]):
                    sample_stats['reads_pe2_mapped'][sample_id] += 1
                    if len(pe2_reads[read].get_functions()) == 1:
                        sample_stats['pe2 single function'][sample_id] += 1
                    elif len(pe2_reads[read].get_functions()) > 1:
                        sample_stats['pe2 multiple functions'][sample_id] += 1
                        pe2_multiple_functions[read] = 1
            for read in both_ends_mapped_reads:
                if len(pe1_reads[read].get_functions()) == 1:
                    sample_stats['pe1 single function, both ends mapped'][sample_id] += 1
                elif len(pe1_reads[read].get_functions()) > 1:
                    sample_stats['pe1 multiple functions, both ends mapped'][sample_id] += 1
                if len(pe2_reads[read].get_functions()) == 1:
                    sample_stats['pe2 single function, both ends mapped'][sample_id] += 1
                elif len(pe2_reads[read].get_functions()) > 1:
                    sample_stats['pe2 multiple functions, both ends mapped'][sample_id] += 1
            for read in pe1_reads:
                if is_mapped(pe1_reads[read]):
                    hits = pe1_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe1 single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe1 multiple hits'][sample_id] += 1
            for read in pe2_reads:
                if is_mapped(pe2_reads[read]):
                    hits = pe2_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe2 single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe2 multiple hits'][sample_id] += 1
            for read in pe1_multiple_functions:
                if is_mapped(pe1_reads[read]):
                    hits = pe1_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe1 multiple functions, single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe1 multiple functions, multiple hits'][sample_id] += 1
            for read in pe2_multiple_functions:
                if is_mapped(pe2_reads[read]):
                    hits = pe2_reads[read].get_hit_list()
                    if len(hits.get_hits()) == 1:
                        sample_stats['pe2 multiple functions, single hit'][sample_id] += 1
                    elif len(hits.get_hits()) > 1:
                        sample_stats['pe2 multiple functions, multiple hits'][sample_id] += 1
            for read in both_ends_mapped_reads:
                hits = pe1_reads[read].get_hit_list()
                if len(hits.get_hits()) == 1:
                    sample_stats['pe1 single hit, both ends mapped'][sample_id] += 1
                elif len(hits.get_hits()) > 1:
                    sample_stats['pe1 multiple hits, both ends mapped'][sample_id] += 1
                hits = pe2_reads[read].get_hit_list()
                if len(hits.get_hits()) == 1:
                    sample_stats['pe2 single hit, both ends mapped'][sample_id] += 1
                elif len(hits.get_hits()) > 1:
                    sample_stats['pe2 multiple hits, both ends mapped'][sample_id] += 1
            self.project.samples[sample_id].reads = None
        with open('outfile.tsv', 'w') as of:
            for sample_id in self.project.list_samples():
                of.write('\t' + sample_id)
            of.write('\n')
            for item in sorted(sample_stats.keys()):
                of.write(item)
                for sample_id in self.project.list_samples():
                    if sample_id in sample_stats[item]:
                        of.write('\t' + str(sample_stats[item][sample_id]))
                    else:
                        of.write('\t0')
                of.write('\n')
        self.assertEqual(len(self.project.samples), 6)

    @unittest.skip("for faster testing")
    def test_top_size(self):
        print('Load reads from JSON')
        sample_id = 'sample3'
        self.project.import_reads_json(sample_id, ['pe1'])
        tsvfile = '/mnt/data3/FEBA/4703/nitrogen_v7.1_fama/sample3_pe1_bgr_tabular_output.txt'
        outfile = 'top_hit_count.txt'
        current_query_id = None
        top_size = 0
        identity_cutoff = 50.0
        length_cutoff = 15
        bitscore_range_cutoff = 0.97
        bitscore_cutoff = 0.0
        print('Identity cutoff: ', identity_cutoff, ', Length cutoff: ', length_cutoff)
        with open(outfile, 'w') as of:
            with open(tsvfile, 'r', newline='') as f:
                tsvin = csv.reader(f, delimiter='\t')
                for row in tsvin:
                    if current_query_id is None:
                        current_query_id = row[0]
                        bitscore_cutoff = float(row[11]) * bitscore_range_cutoff
                    # Filtering by identity and length
                    if float(row[2]) < identity_cutoff:
                        continue  # skip this line
                    if float(row[3]) < length_cutoff:
                        continue  # skip this line
                    if row[0] != current_query_id:
                        read_id = current_query_id.split('|')[0]
                        of.write(read_id + '\t' + str(top_size) + '\t'
                                 + self.project.samples[sample_id].reads['pe1'][read_id].get_status()
                                 + '\n')
                        current_query_id = row[0]
                        top_size = 0
                        bitscore_cutoff = float(row[11]) * bitscore_range_cutoff
                    if float(row[11]) >= bitscore_cutoff:
                        top_size += 1

    def tearDown(self):
        self.project = None
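test_top_size indexes DIAMOND's default 12-column tabular output by position. For reference, the standard BLAST/DIAMOND tabular (outfmt 6) column order that those indices assume:

# Standard BLAST/DIAMOND tabular columns referenced by index in test_top_size.
TABULAR_COLUMNS = [
    'qseqid',    # row[0]  query sequence id
    'sseqid',    # row[1]  subject sequence id
    'pident',    # row[2]  percent identity  -> identity_cutoff filter
    'length',    # row[3]  alignment length  -> length_cutoff filter
    'mismatch',  # row[4]  number of mismatches
    'gapopen',   # row[5]  number of gap openings
    'qstart',    # row[6]  query alignment start
    'qend',      # row[7]  query alignment end
    'sstart',    # row[8]  subject alignment start
    'send',      # row[9]  subject alignment end
    'evalue',    # row[10] expect value
    'bitscore',  # row[11] bit score -> bitscore_range_cutoff filter
]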