def main(args):
    """Load sample metadata and gene counts into the database.

    Reads the sample info table and the metadata reference table, creates
    the sample sets, samples, time/places and sample properties, commits
    them, and then loads one ``gene_count`` row per (gene, sample) pair with
    ``COPY`` statements, using ``args.tmp_file`` as a scratch CSV file.
    All materialized views are refreshed at the end.

    Args:
        args: object with the attributes ``sample_info``,
            ``metadata_reference``, ``gene_counts`` and ``tmp_file``
            (all used as file paths).

    Raises:
        AssertionError: if a sample with the same scilifelab code is already
            present in the database.
    """
    # connect to database
    session = app.db.session()

    logging.info("Reading sample information")
    # Add all samples from the sample info file
    sample_info = pd.read_table(args.sample_info, sep=',', index_col=0)

    logging.info("Creating sample sets")
    # Maps every sample id to the SampleSet object it belongs to.
    sample_sets = {}
    for sample_set_name, sample_set_df in sample_info.groupby('sample_set'):
        # Reuse a sample set that is already stored instead of leaving
        # `sample_set` unbound (or pointing at the previous group's set).
        existing_sets = SampleSet.query.filter_by(name=sample_set_name).all()
        if existing_sets:
            sample_set = existing_sets[0]
        else:
            sample_set = SampleSet(sample_set_name)
        for sample_id in sample_set_df.index:
            sample_sets[sample_id] = sample_set

    logging.info("Creating individual samples")
    sample_properties = []
    all_samples = {}
    time_places = []
    metadata_reference = pd.read_table(args.metadata_reference, index_col=0)
    meta_categories = list(metadata_reference.index)
    default_units = metadata_reference['Unit'].to_dict()
    default_units['filter_lower'] = 'µm'
    default_units['filter_upper'] = 'µm'

    # These categories end up in the TimePlace, not in a SampleProperty.
    time_place_categories = ['Latitude', 'Longitude',
                             'Collection date', 'Collection time']
    extra_categories = ['filter_lower', 'filter_upper']

    for sample_id, row in sample_info.iterrows():
        samples_with_code = Sample.query.filter_by(scilifelab_code=sample_id).all()
        assert len(samples_with_code) == 0, \
            "Sample {} is already present in the database".format(sample_id)

        # Reset per sample so a missing category can never silently reuse
        # the date/time parsed for the previous sample.
        collection_date = None
        collection_time = None
        meta_data = {}
        for meta_category in meta_categories:
            if meta_category == 'Collection date':
                collection_date = datetime.datetime.strptime(
                    row[meta_category], '%y/%m/%d')
            elif meta_category == 'Collection time':
                collection_time = datetime.datetime.strptime(
                    str(row[meta_category]), '%H:%M').time()
            else:
                meta_data[meta_category] = row[meta_category]

        # NOTE(review): these two values are collected and have default
        # units, but no SampleProperty is created for them below unless they
        # are also listed in the metadata reference -- confirm intent.
        for meta_category in extra_categories:
            meta_data[meta_category] = row[meta_category]

        time_place = TimePlace(
            datetime.datetime.combine(collection_date, collection_time),
            meta_data['Latitude'],
            meta_data['Longitude'])
        time_places.append(time_place)
        all_samples[sample_id] = Sample(sample_id, sample_sets[sample_id], time_place)

        for meta_category in meta_categories:
            if meta_category in time_place_categories:
                continue
            # NOTE(review): pandas usually represents an empty cell as NaN,
            # not None, so this check may not skip missing values -- confirm.
            if meta_data[meta_category] is not None:
                sample_properties.append(
                    SampleProperty(meta_category,
                                   meta_data[meta_category],
                                   default_units[meta_category],
                                   all_samples[sample_id]))

    session.add_all(list(sample_sets.values()) + list(all_samples.values())
                    + time_places + sample_properties)

    logging.info("Commiting everything except gene counts")
    session.commit()

    # Fetch the name -> id mapping of all genes already stored.  Raw SQL is
    # used (like the COPY statement below) so no extra model name is needed.
    commited_genes = {
        gene_name: gene_id
        for gene_name, gene_id in session.execute("SELECT name, id FROM gene")
    }

    # Fetch each gene from the gene count file and create the corresponding gene count
    logging.info("Starting with gene counts")
    gene_counts = pd.read_table(args.gene_counts, index_col=0)
    total_gene_count_len = len(gene_counts)

    filtered_gene_counts = gene_counts[
        gene_counts.index.isin(commited_genes.keys())].copy()
    # Remember the number of genes now: the frame is reshaped to one row per
    # (gene, sample) further down, which would inflate the reported count.
    nr_annotated_genes = len(filtered_gene_counts)
    filtered_gene_counts['gene_name'] = filtered_gene_counts.index
    filtered_gene_counts['gene_id'] = filtered_gene_counts['gene_name'].apply(
        lambda x: commited_genes[x])

    # Column headers are sample names in the file; replace them by sample ids.
    all_sample_ids = dict((sample_name, sample.id)
                          for sample_name, sample in all_samples.items())
    filtered_gene_counts.rename(columns=all_sample_ids, inplace=True)
    sample_id_cols = filtered_gene_counts.columns.tolist()
    sample_id_cols.remove('gene_id')
    sample_id_cols.remove('gene_name')

    # Reshape from wide (one column per sample) to long format.
    filtered_gene_counts.index = filtered_gene_counts['gene_id']
    filtered_gene_counts = pd.DataFrame(filtered_gene_counts[sample_id_cols].stack())
    filtered_gene_counts.reset_index(inplace=True)
    filtered_gene_counts.columns = ['gene_id', 'sample_id', 'rpkm']

    tot_nr_samples = len(all_samples)
    logging.info("Start adding gene counts")
    for i, (_, sample_df) in enumerate(filtered_gene_counts.groupby('sample_id')):
        # The scratch file must be closed (flushed) before COPY reads it.
        with open(args.tmp_file, 'w') as gene_counts_file:
            sample_df.to_csv(gene_counts_file, index=False, header=False)
        logging.info("Adding gene counts from file. Sample {}/{}".format(i+1, tot_nr_samples))
        session.execute("COPY gene_count (gene_id, sample_id, rpkm) FROM '{}' WITH CSV;".format(args.tmp_file))

    logging.info("{} out of {} are annotated genes".format(nr_annotated_genes, total_gene_count_len))
    session.commit()

    logging.info("Refreshing materialized view")
    refresh_all_mat_views()
    session.commit()
    logging.info("Finished!")
def test_annotation_rpkm_table(self):
    """Check Annotation.rpkm_table: limits, ordering and every filter.

    For each of 50 indices two genes and one annotation per annotation type
    are created.  Depending on ``i % 3`` the annotation is attached to
    gene1 only, to both genes, or to gene2 only, which yields the three
    distinct rpkm sums asserted below.
    """
    def rpkm_rows(table):
        # One list of rpkm values per row, in the row's own sample order.
        return [[value for _, value in per_sample.items()]
                for _, per_sample in table.items()]

    def expect_rows(row_slice, expected):
        for row in row_slice:
            assert row == expected

    annotation_types = [
        ("Cog", {'class': Cog}),
        ("Pfam", {'class': Pfam}),
        ("TigrFam", {'class': TigrFam}),
        ("EcNumber", {'class': EcNumber}),
    ]
    nr_annotation_types = len(annotation_types)
    annotation_sources = {
        type_name: AnnotationSource(type_name, "v1.0", "rpsblast", "e_value=0.000001")
        for type_name, _ in annotation_types
    }

    sample1 = Sample("P1993_101", None, None)
    sample2 = Sample("P1993_102", None, None)
    nr_samples = 2

    for i in range(50):
        gene1 = Gene("gene1{}".format(i), None)
        gene2 = Gene("gene2{}".format(i), None)
        gene_count1 = GeneCount(gene1, sample1, 0.001)
        gene_count2 = GeneCount(gene1, sample2, 0.01)
        gene_count3 = GeneCount(gene2, sample1, 0.002)
        gene_count4 = GeneCount(gene2, sample2, 0.02)

        padded_id = str(i).zfill(4)
        annotation_mode = i % 3
        for type_name, type_d in annotation_types:
            model = type_d['class']
            if type_name == 'Cog':
                annotation = model(type_name.upper() + padded_id, "H")
            elif type_name == 'EcNumber':
                ec_template = "0.0.2.{}" if i > 25 else "0.0.0.{}"
                annotation = model(ec_template.format(i))
            else:
                annotation = model(type_name.upper() + padded_id)

            # mode 0 -> gene1 only, mode 1 -> both genes, mode 2 -> gene2 only
            annotated_genes = []
            if annotation_mode != 2:
                annotated_genes.append(gene1)
            if annotation_mode != 0:
                annotated_genes.append(gene2)
            self.session.add_all([
                GeneAnnotation(annotation, gene, annotation_sources[type_name])
                for gene in annotated_genes
            ])

        self.session.add(gene1)
        self.session.add(gene2)

    self.session.commit()
    refresh_all_mat_views()

    samples, rows = Annotation.rpkm_table()
    assert len(samples) == 2
    assert len(rows) == 20  # Default limit

    samples, rows = Annotation.rpkm_table(limit=100)
    assert len(samples) == 2
    assert len(rows) == 100

    samples, rows = Annotation.rpkm_table(limit=None)
    assert len(samples) == 2
    assert len(rows) == nr_annotation_types * 50

    for _, sample_d in rows.items():
        # sample_d should be an ordered dict
        assert ["P1993_101", "P1993_102"] == [s.scilifelab_code for s in sample_d.keys()]

    rpkms = rpkm_rows(rows)
    rpkms_flat = list(itertools.chain.from_iterable(rpkms))
    assert len(rpkms_flat) == nr_annotation_types * nr_samples * 50

    # Annotations sorted by total rpkm over all samples
    # and the rpkm values should be summed over all genes for that annotation
    # there should be roughly equal numbers of the three different counts
    expect_rows(rpkms[:67], [0.003, 0.03])
    expect_rows(rpkms[69:130], [0.002, 0.02])
    expect_rows(rpkms[150:200], [0.001, 0.01])

    # possible to filter on function classes
    for type_name, _ in annotation_types:
        samples, rows = Annotation.rpkm_table(limit=None, function_class=type_name.lower())
        assert len(rows) == 50
        for key in rows.keys():
            assert type_name[:3].lower() == key.annotation_type[:3]

    # possible to filter on samples
    for sample in [sample1, sample2]:
        samples, rows = Annotation.rpkm_table(samples=[sample.scilifelab_code], limit=None)
        assert len(rows) == 200
        assert len(samples) == 1
        assert samples[0] == sample
        for _, sample_d in rows.items():
            assert list(sample_d.keys()) == [sample]

        rpkms = rpkm_rows(rows)
        if sample.scilifelab_code == "P1993_101":
            expect_rows(rpkms[:65], [0.003])
            expect_rows(rpkms[69:130], [0.002])
            expect_rows(rpkms[150:200], [0.001])
        else:
            expect_rows(rpkms[:67], [0.03])
            expect_rows(rpkms[69:130], [0.02])
            expect_rows(rpkms[150:200], [0.01])

    # possible to filter on sample and function class at the same time
    for type_name, _ in annotation_types:
        for sample in [sample1, sample2]:
            samples, rows = Annotation.rpkm_table(
                limit=None,
                function_class=type_name.lower(),
                samples=[sample.scilifelab_code])
            assert len(rows) == 50
            for key in rows.keys():
                assert type_name.lower()[:3] == key.annotation_type[:3]
            assert len(samples) == 1
            assert samples[0] == sample
            for _, sample_d in rows.items():
                assert list(sample_d.keys()) == [sample]

            rpkms = rpkm_rows(rows)
            if sample.scilifelab_code == "P1993_101":
                expect_rows(rpkms[:9], [0.003])
                expect_rows(rpkms[19:29], [0.002])
                expect_rows(rpkms[39:], [0.001])
            else:
                expect_rows(rpkms[:9], [0.03])
                expect_rows(rpkms[19:29], [0.02])
                expect_rows(rpkms[39:], [0.01])

    # possible to filter on individual annotations
    annotation_ids = ["COG0001", "TIGRFAM0004", "COG0003", "PFAM0002", "0.0.2.26"]
    for subset_size in range(1, 6):
        for type_identifiers in itertools.combinations(annotation_ids, subset_size):
            samples, rows = Annotation.rpkm_table(limit=None, type_identifiers=list(type_identifiers))
            assert len(samples) == 2
            assert len(rows) == len(type_identifiers)
            assert set([key.type_identifier for key in rows.keys()]) == set(type_identifiers)
def test_taxon(self):
    """Check the Gene/Taxon relation and the per-sample rpkm sums of Taxon.

    Genes, taxa and gene counts are added step by step; after each step the
    rpkm reported for the affected taxon is compared with the expected
    per-sample values, and finally Taxon.rpkm_table is checked at the
    default level and at phylum level.
    """
    ref_assembly = ReferenceAssembly("Version 1")
    gene1 = Gene("gene1", ref_assembly)
    sample1 = Sample("P1993_101", None, None)
    # Not referenced again below; kept so the same objects are constructed.
    reference_assembly = ReferenceAssembly("version 1")
    gene_count1 = GeneCount(gene1, sample1, 0.001)
    taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
    gene1.taxon = taxon1

    self.session.add_all([gene1, taxon1, sample1, gene_count1])
    self.session.commit()

    gene1 = Gene.query.first()
    taxon1 = Taxon.query.first()

    assert gene1.taxon == taxon1
    assert gene1 in taxon1.genes
    assert taxon1.superkingdom == 'Bacteria'
    assert taxon1.phylum == 'Proteobacteria'
    assert taxon1.taxclass == ''
    assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
    refresh_all_mat_views()

    # Test sample count retrieval
    sample2 = Sample("P1993_102", None, None)
    self.session.add(sample2)
    self.session.commit()
    refresh_all_mat_views()
    assert taxon1.rpkm == {sample1: 0.001}

    gene_count2 = GeneCount(gene1, sample2, 0.2)
    self.session.add(gene_count2)
    self.session.commit()
    refresh_all_mat_views()
    taxon1_two_samples = {sample1: 0.001, sample2: 0.2}
    assert taxon1.rpkm == taxon1_two_samples

    gene2 = Gene("gene2", ref_assembly)
    gene_count3 = GeneCount(gene2, sample2, 0.1)
    self.session.add_all([gene2, gene_count3])
    self.session.commit()
    refresh_all_mat_views()

    # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
    assert taxon1.rpkm == taxon1_two_samples

    taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
    gene2.taxon = taxon2
    self.session.add_all([taxon2, gene2])
    self.session.commit()
    refresh_all_mat_views()

    # Taxon2 should have gene_count3 stats only
    assert taxon2.rpkm == {sample2: 0.1}

    gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
    gene_count4 = GeneCount(gene3, sample1, 1.0)
    self.session.add_all([gene3, gene_count4])
    self.session.commit()

    # Taxon1 should now have the original stats plus gene_count4
    taxon1_with_gene3 = {sample1: 1.001, sample2: 0.2}
    assert taxon1.rpkm == taxon1_with_gene3

    taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
    self.session.add(taxon3)
    self.session.commit()

    gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
    gene_count5 = GeneCount(gene4, sample2, 0.003)
    self.session.add_all([gene4, gene_count5])
    self.session.commit()
    refresh_all_mat_views()

    # theoretical rpkm_table:
    # samples = [sample1, sample2]
    # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
    samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
    assert samples == [sample1, sample2]
    top_level_names = [complete_val_to_val[key] for key in rpkm_table.keys()]
    assert top_level_names == ["Bacteria", "Eukaryota"]  # Sorted by summed rpkm
    assert rpkm_table["Bacteria"] == taxon1_with_gene3
    assert rpkm_table["Eukaryota"] == {sample2: 0.103}

    samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table(level='phylum')
    assert samples == [sample1, sample2]
    phylum_names = [complete_val_to_val[key] for key in rpkm_table.keys()]
    assert phylum_names == ["Proteobacteria", "Chlorophyta", "Unnamed"]  # Sorted by summed rpkm
    assert rpkm_table["Bacteria;Proteobacteria"] == taxon1_with_gene3
    assert rpkm_table["Eukaryota;Chlorophyta"] == {sample2: 0.1}
    assert rpkm_table["Eukaryota;Unnamed"] == {sample2: 0.003}
def test_taxon_large_scale_rpkm_table(self):
    """Check Taxon.rpkm_table on 120 taxa (2 superkingdoms x 3 phyla x 20 classes).

    Every taxon gets two genes; depending on ``i % 3`` gene counts are
    created for gene1 only, for both genes, or for gene2 only, which gives
    the three distinct rpkm sums asserted below.
    """
    def rpkm_rows(table):
        # One list of rpkm values per row, in the row's own sample order.
        return [[value for _, value in per_sample.items()]
                for _, per_sample in table.items()]

    def expect_rows(row_slice, expected):
        for row in row_slice:
            assert row == expected

    def level_values(table, complete_to_val):
        return [complete_to_val[complete_val] for complete_val in table.keys()]

    sample1 = Sample("P1993_101", None, None)
    sample2 = Sample("P1993_102", None, None)
    nr_samples = 2

    taxons = [
        Taxon(superkingdom="sk_{}".format(euk_i),
              phylum="ph_{}".format(ph_i),
              taxclass="tc_{}".format(tc_i))
        for euk_i, ph_i, tc_i in itertools.product(range(2), range(3), range(20))
    ]
    self.session.add_all(taxons)
    self.session.commit()
    refresh_all_mat_views()

    for i, taxon in enumerate(taxons):
        count_mode = i % 3
        gene1 = Gene("gene1{}".format(i), None, taxon_id=taxon.id)
        gene2 = Gene("gene2{}".format(i), None, taxon_id=taxon.id)
        # mode 0 -> gene1 only, mode 1 -> both genes, mode 2 -> gene2 only
        gene_counts = []
        if count_mode != 2:
            gene_counts.append(GeneCount(gene1, sample1, 0.001))
            gene_counts.append(GeneCount(gene1, sample2, 0.01))
        if count_mode != 0:
            gene_counts.append(GeneCount(gene2, sample1, 0.002))
            gene_counts.append(GeneCount(gene2, sample2, 0.02))
        self.session.add_all(gene_counts)
        self.session.add(gene1)
        self.session.add(gene2)

    self.session.commit()
    refresh_all_mat_views()

    samples, rows, complete_val_to_val = Taxon.rpkm_table()
    assert len(samples) == 2
    assert len(rows) == 2  # Number of unique superkingdoms

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="phylum")
    assert len(samples) == 2
    assert len(rows) == 6  # Number of unique down to phylum

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass")
    assert len(samples) == 2
    assert len(rows) == 20  # Default limit

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)
    assert len(samples) == 2
    assert len(rows) == 120  # Number of unique down to taxclass

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)
    for _, sample_d in rows.items():
        # sample_d should be an ordered dict
        assert ["P1993_101", "P1993_102"] == [s.scilifelab_code for s in sample_d.keys()]

    rpkms = rpkm_rows(rows)
    rpkms_flat = list(itertools.chain.from_iterable(rpkms))
    assert len(rpkms_flat) == 2 * 3 * 20 * nr_samples

    # Annotations sorted by total rpkm over all samples
    # and the rpkm values should be summed over all genes for that taxon
    # there should be roughly equal numbers of the three different counts
    expect_rows(rpkms[:40], [0.003, 0.03])
    expect_rows(rpkms[40:80], [0.002, 0.02])
    expect_rows(rpkms[80:120], [0.001, 0.01])

    superkingdoms = ["sk_0", "sk_1"]
    phyla = ["ph_0", "ph_1", "ph_2"]

    # possible to filter on specific level values at superkingdom
    for level_val in superkingdoms:
        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            limit=None, top_level_complete_values=[level_val],
            top_level="superkingdom", level="phylum")
        assert len(rows) == 3
        assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]

        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            limit=None, top_level_complete_values=[level_val],
            top_level="superkingdom", level="taxclass")
        assert len(rows) == 3*20

    # possible to filter on specific level values at phylum
    for sk_level_val in superkingdoms:
        for ph_level_val in phyla:
            top_level_complete_value = "{};{}".format(sk_level_val, ph_level_val)
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[top_level_complete_value],
                top_level="phylum", level="phylum")
            assert len(rows) == 1
            assert level_values(rows, complete_val_to_val) == [ph_level_val]

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[top_level_complete_value],
                top_level="phylum", level="taxclass")
            assert len(rows) == 20

    # possible to filter on multiple specific level values at phylum
    for sk_level_val in superkingdoms:
        for ph_level_vals in itertools.combinations(phyla, 2):
            top_level_complete_values = [
                "{};{}".format(sk_level_val, ph_level_val)
                for ph_level_val in ph_level_vals
            ]
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=top_level_complete_values,
                top_level="phylum", level="phylum")
            assert len(rows) == 2
            assert sorted(level_values(rows, complete_val_to_val)) == sorted(list(ph_level_vals))

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=top_level_complete_values,
                top_level="phylum", level="taxclass")
            assert len(rows) == 40

    # possible to filter on specific level values at taxclass
    for sk_level_val in superkingdoms:
        for ph_level_val in phyla:
            for tc_level_val in ["tc_{}".format(i) for i in range(20)]:
                top_level_complete_value = "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=[top_level_complete_value],
                    top_level="taxclass", level="taxclass")
                assert len(rows) == 1

    # possible to filter on multiple specific level values at taxclass
    for sk_level_val in superkingdoms:
        for ph_level_val in phyla:
            for tc_level_vals in itertools.combinations(["tc_{}".format(i) for i in range(5)], 4):
                top_level_complete_values = [
                    "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                    for tc_level_val in tc_level_vals
                ]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=top_level_complete_values,
                    top_level="taxclass", level="taxclass")
                assert len(rows) == 4

    # possible to filter on samples
    for sample in [sample1, sample2]:
        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            samples=[sample.scilifelab_code], level="taxclass", limit=None)
        assert len(rows) == 120
        assert len(samples) == 1
        assert samples[0] == sample
        for _, sample_d in rows.items():
            assert list(sample_d.keys()) == [sample]

        rpkms = rpkm_rows(rows)
        if sample.scilifelab_code == "P1993_101":
            expect_rows(rpkms[:40], [0.003])
            expect_rows(rpkms[40:80], [0.002])
            expect_rows(rpkms[80:120], [0.001])
        else:
            expect_rows(rpkms[:40], [0.03])
            expect_rows(rpkms[40:80], [0.02])
            expect_rows(rpkms[80:120], [0.01])

    # possible to filter on sample and taxon at the same time
    for sample in [sample1, sample2]:
        for sk_level_val in superkingdoms:
            top_level_complete_value = sk_level_val
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                samples=[sample.scilifelab_code], limit=None,
                top_level_complete_values=[top_level_complete_value],
                top_level="superkingdom", level="phylum")
            assert len(samples) == 1
            assert samples[0] == sample
            for _, sample_d in rows.items():
                assert list(sample_d.keys()) == [sample]
            assert len(rows) == 3
            assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                samples=[sample.scilifelab_code], limit=None,
                top_level_complete_values=[top_level_complete_value],
                top_level="superkingdom", level="taxclass")
            assert len(rows) == 3*20

            rpkms = rpkm_rows(rows)
            if sample.scilifelab_code == "P1993_101":
                expect_rows(rpkms[:20], [0.003])
                expect_rows(rpkms[20:40], [0.002])
                expect_rows(rpkms[40:60], [0.001])
            else:
                expect_rows(rpkms[:20], [0.03])
                expect_rows(rpkms[20:40], [0.02])
                expect_rows(rpkms[40:80], [0.01])
def main(args):
    """Populate the database with samples, annotations, genes and gene counts.

    Steps, in order:

    1. create all tables / views via ``app.db.create_all()``;
    2. create sample sets, samples, time/places and sample properties from
       the sample info and metadata reference tables;
    3. create (or reuse) the reference assembly, bulk insert the annotation
       definitions and create the annotation sources, then commit;
    4. add genes that have a taxonomy, then genes with Cog, Pfam and TigrFam
       annotations (each gene only once), loading rows with ``COPY``;
    5. load one ``gene_count`` row per (gene, sample) pair with ``COPY`` and
       refresh all materialized views.

    Args:
        args: object with the attributes ``sample_info``,
            ``metadata_reference``, ``reference_assembly``,
            ``all_annotations``, ``annotation_source_info``,
            ``taxonomy_per_gene``, ``gene_annotations_cog``,
            ``gene_annotations_pfam``, ``gene_annotations_tigrfam``,
            ``gene_counts`` and ``tmp_file`` (scratch CSV file that is
            overwritten before every ``COPY``).

    Raises:
        AssertionError: if a sample with the same scilifelab code already
            exists, or if more than one reference assembly has the given name.
    """
    # Create materialized view that is not created by manage.py
    app.db.create_all()

    # connect to database
    session = app.db.session()

    logging.info("Reading sample information")
    # Add all samples from the sample info file
    sample_info = pd.read_table(args.sample_info, sep=',', index_col=0)

    logging.info("Creating sample sets")
    # Maps every sample id to the SampleSet object it belongs to.
    sample_sets = {}
    for sample_set_name, sample_set_df in sample_info.groupby('sample_set'):
        # Reuse a sample set that is already stored instead of leaving
        # `sample_set` unbound (or pointing at the previous group's set).
        existing_sets = SampleSet.query.filter_by(name=sample_set_name).all()
        if existing_sets:
            sample_set = existing_sets[0]
        else:
            sample_set = SampleSet(sample_set_name)
        for sample_id in sample_set_df.index:
            sample_sets[sample_id] = sample_set

    logging.info("Creating individual samples")
    sample_properties = []
    all_samples = {}
    time_places = []
    metadata_reference = pd.read_table(args.metadata_reference, index_col=0)
    meta_categories = list(metadata_reference.index)
    default_units = metadata_reference['Unit'].to_dict()
    default_units['filter_lower'] = 'µm'
    default_units['filter_upper'] = 'µm'

    # These categories end up in the TimePlace, not in a SampleProperty.
    time_place_categories = ['Latitude', 'Longitude',
                             'Collection date', 'Collection time']
    extra_categories = ['filter_lower', 'filter_upper']

    for sample_id, row in sample_info.iterrows():
        samples_with_code = Sample.query.filter_by(scilifelab_code=sample_id).all()
        assert len(samples_with_code) == 0, \
            "Sample {} is already present in the database".format(sample_id)

        # Reset per sample so a missing category can never silently reuse
        # the date/time parsed for the previous sample.
        collection_date = None
        collection_time = None
        meta_data = {}
        for meta_category in meta_categories:
            if meta_category == 'Collection date':
                collection_date = datetime.datetime.strptime(
                    row[meta_category], '%y/%m/%d')
            elif meta_category == 'Collection time':
                collection_time = datetime.datetime.strptime(
                    str(row[meta_category]), '%H:%M').time()
            else:
                meta_data[meta_category] = row[meta_category]

        # NOTE(review): these two values are collected and have default
        # units, but no SampleProperty is created for them below unless they
        # are also listed in the metadata reference -- confirm intent.
        for meta_category in extra_categories:
            meta_data[meta_category] = row[meta_category]

        time_place = TimePlace(
            datetime.datetime.combine(collection_date, collection_time),
            meta_data['Latitude'],
            meta_data['Longitude'])
        time_places.append(time_place)
        all_samples[sample_id] = Sample(sample_id, sample_sets[sample_id], time_place)

        for meta_category in meta_categories:
            if meta_category in time_place_categories:
                continue
            # NOTE(review): pandas usually represents an empty cell as NaN,
            # not None, so this check may not skip missing values -- confirm.
            if meta_data[meta_category] is not None:
                sample_properties.append(
                    SampleProperty(meta_category,
                                   meta_data[meta_category],
                                   default_units[meta_category],
                                   all_samples[sample_id]))

    session.add_all(list(sample_sets.values()) + list(all_samples.values())
                    + time_places + sample_properties)

    logging.info("Creating the reference assembly")
    # create the reference assembly
    ref_assemblies = ReferenceAssembly.query.filter_by(name=args.reference_assembly).all()
    if len(ref_assemblies) == 0:
        ref_assembly = ReferenceAssembly(args.reference_assembly)
    else:
        assert len(ref_assemblies) == 1
        # Was misspelled `ref_assemlbies`, raising NameError on reuse.
        ref_assembly = ref_assemblies[0]
    session.add(ref_assembly)

    logging.info("Adding annotation information")
    # Make sure annotations are present
    annotation_info = pd.read_table(args.all_annotations, header=None,
                                    names=["type_identifier", "gene_name", "description"])
    # Both dicts are keyed on the first three characters of the identifier.
    annotation_models = {'COG': Cog, 'TIG': TigrFam, 'pfa': Pfam, 'PFA': Pfam}
    annotation_polymorphic_id = {'COG': 'cog', 'TIG': 'tigrfam', 'pfa': 'pfam', 'PFA': 'pfam'}

    annotation_info['annotation_type'] = annotation_info['type_identifier'].apply(
        lambda x: annotation_polymorphic_id[x[0:3]])
    annotation_info['type_grouping'] = annotation_info['type_identifier'].apply(lambda x: x[0:3])
    annotation_info['id'] = annotation_info.index
    annotation_info['category'] = None

    for type_grouping, annotation_info_subset in annotation_info.groupby('type_grouping'):
        if type_grouping == 'COG':
            # Only the COG mappings carry the extra 'category' field.
            logging.info("Commiting all COG annotation info")
            cog_columns = ['id', 'type_identifier', 'annotation_type', 'category', 'description']
            session.bulk_insert_mappings(
                Cog,
                list(annotation_info_subset[cog_columns].to_dict(orient='index').values()))
        else:
            logging.info("Commiting all {} annotation info".format(type_grouping))
            other_columns = ['id', 'type_identifier', 'annotation_type', 'description']
            session.bulk_insert_mappings(
                annotation_models[type_grouping],
                list(annotation_info_subset[other_columns].to_dict(orient='index').values()))

    all_annotations = dict((annotation.type_identifier, annotation)
                           for annotation in session.query(Annotation).all())

    logging.info("Adding annotation source")
    # Create annotation source
    annotation_source_info = pd.read_table(
        args.annotation_source_info, sep=',', header=None,
        names=["annotation_type", "db_version", "algorithm", "algorithm_parameters"],
        index_col=0)
    all_annotation_sources = {}
    for annotation_type, row in annotation_source_info.iterrows():
        all_annotation_sources[annotation_type] = AnnotationSource(
            annotation_type, row.db_version, row.algorithm, row.algorithm_parameters)
    session.add_all(list(all_annotation_sources.values()))

    logging.info("Commiting everything except genes and gene counts")
    session.commit()

    def add_genes_with_taxonomy(taxonomy_per_gene, commited_genes):
        """Create all taxa and the genes that have a taxonomy assigned.

        Updates and returns ``commited_genes`` (gene name -> gene id) with
        every gene present in the database afterwards.
        """
        gene_annotations = pd.read_table(taxonomy_per_gene, index_col=0)
        gene_annotations['taxclass'] = gene_annotations['class']
        taxonomy_columns = ["superkingdom", "phylum", "taxclass", "order",
                            "family", "genus", "species"]

        # Only add genes with taxonomy given
        annotated_genes = gene_annotations[
            ~gene_annotations[taxonomy_columns].isnull().all(axis=1)]

        def add_taxa(full_taxonomy, taxonomy_columns):
            """Build a Taxon from a ';'-separated taxonomy string."""
            # We want to make a difference between unnamed phyla and unset phyla:
            # trailing empty levels become None, empty levels that still have
            # a named level below them become "Unnamed".
            first = True
            rev_new_taxonomy = []
            for tax_val in reversed(full_taxonomy.split(';')):
                # Compare by value; `is ""` only worked through interning.
                if tax_val == "":
                    if first:
                        tax_val = None
                    else:
                        tax_val = "Unnamed"
                else:
                    first = False
                rev_new_taxonomy.append(tax_val)
            # The string ends with ';', so the split has one extra trailing
            # element; zip() drops it because taxonomy_columns is shorter.
            new_taxa_d = dict(zip(taxonomy_columns, reversed(rev_new_taxonomy)))
            return Taxon(**new_taxa_d)

        annotated_genes = annotated_genes.fillna("")

        # The number of taxa is lower than the genes with taxonomic annotation
        annotated_genes['full_taxonomy'] = annotated_genes["superkingdom"] + ';' + \
            annotated_genes['phylum'] + ';' + \
            annotated_genes['taxclass'] + ';' + \
            annotated_genes['order'] + ';' + \
            annotated_genes['family'] + ';' + \
            annotated_genes['genus'] + ';' + \
            annotated_genes['species'] + ';'

        all_taxas = annotated_genes['full_taxonomy'].unique()
        all_taxas_to_be_created = {}
        # Maps the raw string from the file to the Taxon's own full_taxonomy.
        first_full_taxa_to_real_full_taxa = {}
        for full_taxonomy in all_taxas:
            taxa = add_taxa(full_taxonomy, taxonomy_columns)
            first_full_taxa_to_real_full_taxa[full_taxonomy] = taxa.full_taxonomy
            all_taxas_to_be_created[taxa.full_taxonomy] = taxa

        annotated_genes['real_full_taxonomy'] = annotated_genes['full_taxonomy'].apply(
            lambda x: first_full_taxa_to_real_full_taxa[x])

        logging.info("Commiting all taxa")
        session.add_all(all_taxas_to_be_created.values())
        session.commit()

        logging.info("Creating genes with taxon information")
        all_created_taxa = dict(session.query(Taxon.full_taxonomy, Taxon.id).all())
        annotated_genes['taxon_id'] = annotated_genes['real_full_taxonomy'].apply(
            lambda x: all_created_taxa[x])
        annotated_genes['name'] = annotated_genes.index
        annotated_genes["reference_assembly_id"] = ref_assembly.id

        # The scratch file must be closed (flushed) before COPY reads it.
        with open(args.tmp_file, 'w') as gene_file:
            annotated_genes[['name', 'reference_assembly_id', 'taxon_id']].to_csv(
                gene_file, index=False, header=False)
        session.execute("COPY gene (name, reference_assembly_id, taxon_id) FROM '{}' WITH CSV;".format(args.tmp_file))

        commited_genes.update(dict(session.query(Gene.name, Gene.id).all()))
        logging.info("{} genes present in database".format(len(commited_genes.keys())))
        return commited_genes

    commited_genes = {}
    commited_genes = add_genes_with_taxonomy(args.taxonomy_per_gene, commited_genes)
    logging.info("Processed {} genes for gene taxonomy, moving on to functional annotation".format(len(commited_genes.keys())))

    def add_genes_with_annotation(annotation_type, gene_annotation_arg, commited_genes,
                                  all_annotations, annotation_source):
        """Add not yet stored genes and all gene annotations of one type.

        ``annotation_source`` is the AnnotationSource whose id is stored on
        every gene annotation row.  Updates and returns ``commited_genes``.
        """
        logging.info("Adding genes with {} annotations".format(annotation_type))
        gene_annotations = pd.read_table(gene_annotation_arg, header=None,
                                         names=["name", "type_identifier", "e_value"])

        # Only add genes once
        new_genes = gene_annotations[~gene_annotations['name'].isin(commited_genes.keys())]
        new_genes_uniq = pd.DataFrame({'name': new_genes['name'].unique()})
        new_genes_uniq["reference_assembly_id"] = ref_assembly.id

        logging.info("Commiting all {} genes.".format(annotation_type))
        with open(args.tmp_file, 'w') as gene_file:
            new_genes_uniq[['name', 'reference_assembly_id']].to_csv(
                gene_file, index=False, header=False)
        session.execute("COPY gene (name, reference_assembly_id) FROM '{}' WITH CSV;".format(args.tmp_file))

        commited_genes.update(dict(session.query(Gene.name, Gene.id).all()))
        logging.info("{} genes present in database".format(len(commited_genes.keys())))

        gene_annotations['gene_id'] = gene_annotations['name'].apply(lambda x: commited_genes[x])
        gene_annotations['annotation_id'] = gene_annotations['type_identifier'].apply(
            lambda x: all_annotations[x].id)
        # Use the source handed in by the caller; it used to be silently
        # replaced by a lookup in the enclosing scope.
        gene_annotations['annotation_source_id'] = annotation_source.id

        logging.info("Commiting all {} gene anntations".format(annotation_type))
        with open(args.tmp_file, 'w') as gene_file:
            gene_annotations[['gene_id', 'annotation_id', 'annotation_source_id', 'e_value']].to_csv(
                gene_file, index=False, header=False)
        session.execute("COPY gene_annotation (gene_id, annotation_id, annotation_source_id, e_value) FROM '{}' WITH CSV;".format(args.tmp_file))
        session.commit()
        return commited_genes

    commited_genes = add_genes_with_annotation(
        "Cog", args.gene_annotations_cog, commited_genes,
        all_annotations, all_annotation_sources["Cog"])
    commited_genes = add_genes_with_annotation(
        "Pfam", args.gene_annotations_pfam, commited_genes,
        all_annotations, all_annotation_sources["Pfam"])
    commited_genes = add_genes_with_annotation(
        "TigrFam", args.gene_annotations_tigrfam, commited_genes,
        all_annotations, all_annotation_sources["TigrFam"])
    logging.info("Processed {} genes in total, moving on to gene counts".format(len(commited_genes.keys())))

    # Fetch each gene from the gene count file and create the corresponding gene count
    logging.info("Starting with gene counts")
    gene_counts = pd.read_table(args.gene_counts, index_col=0)
    total_gene_count_len = len(gene_counts)

    filtered_gene_counts = gene_counts[
        gene_counts.index.isin(commited_genes.keys())].copy()
    # Remember the number of genes now: the frame is reshaped to one row per
    # (gene, sample) further down, which would inflate the reported count.
    nr_annotated_genes = len(filtered_gene_counts)
    filtered_gene_counts['gene_name'] = filtered_gene_counts.index
    filtered_gene_counts['gene_id'] = filtered_gene_counts['gene_name'].apply(
        lambda x: commited_genes[x])

    # Column headers are sample names in the file; replace them by sample ids.
    all_sample_ids = dict((sample_name, sample.id)
                          for sample_name, sample in all_samples.items())
    filtered_gene_counts.rename(columns=all_sample_ids, inplace=True)
    sample_id_cols = filtered_gene_counts.columns.tolist()
    sample_id_cols.remove('gene_id')
    sample_id_cols.remove('gene_name')

    # Reshape from wide (one column per sample) to long format.
    filtered_gene_counts.index = filtered_gene_counts['gene_id']
    filtered_gene_counts = pd.DataFrame(filtered_gene_counts[sample_id_cols].stack())
    filtered_gene_counts.reset_index(inplace=True)
    filtered_gene_counts.columns = ['gene_id', 'sample_id', 'rpkm']

    tot_nr_samples = len(all_samples)
    logging.info("Start adding gene counts")
    for i, (_, sample_df) in enumerate(filtered_gene_counts.groupby('sample_id')):
        with open(args.tmp_file, 'w') as gene_counts_file:
            sample_df.to_csv(gene_counts_file, index=False, header=False)
        logging.info("Adding gene counts from file. Sample {}/{}".format(i+1, tot_nr_samples))
        session.execute("COPY gene_count (gene_id, sample_id, rpkm) FROM '{}' WITH CSV;".format(args.tmp_file))

    logging.info("{} out of {} are annotated genes".format(nr_annotated_genes, total_gene_count_len))
    session.commit()

    logging.info("Refreshing materialized view")
    refresh_all_mat_views()
    session.commit()
    logging.info("Finished!")