def get_dge_tbl(session, contrast_name, design_name, dataset_name, method_name) -> DGEmodel:
    """Load the DGE table of one contrast column, joined with its feature data.

    The contrast column is looked up by contrast column name, design name,
    data set name and DGE method name. All rows of ``DGEtable`` referencing
    that column are loaded and three constant columns (``Contrast``,
    ``Design``, ``DGEMethod``) are added before the feature data of the
    owning data set is joined on ``FeatureIndex``.

    NOTE(review): the signature is annotated with ``DGEmodel``, but the value
    returned is the result of ``.join(...)`` on the loaded table (presumably
    a pandas DataFrame) - confirm and adjust the annotation.

    Args:
        session: Database session used for all queries.
        contrast_name: Name of the contrast column to look up.
        design_name: Name of the design the contrast belongs to.
        dataset_name: Name of the data set the design belongs to.
        method_name: Name of the DGE method.

    Returns:
        The DGE table joined with the feature data; overlapping feature
        columns receive the suffix ``Feature``.

    Raises:
        ROGERUsageError: If no matching contrast column exists.
    """
    contrast_column = (
        session.query(ContrastColumn)
        .filter(
            DGEmodel.DGEmethodID == DGEmethod.ID,
            DGEmodel.ContrastID == Contrast.ID,
            Contrast.DesignID == Design.ID,
            Design.DataSetID == DataSet.ID,
            ContrastColumn.ContrastID == Contrast.ID,
            Design.Name == design_name,
            DataSet.Name == dataset_name,
            DGEmethod.Name == method_name,
            ContrastColumn.Name == contrast_name,
        )
        .one_or_none()
    )
    if contrast_column is None:
        raise ROGERUsageError(
            "Model for %s:%s:%s does not exists" % (dataset_name, design_name, contrast_name)
        )

    table_query = session.query(DGEtable).filter(
        DGEtable.ContrastColumnID == contrast_column.ID
    )
    dge_table = as_data_frame(table_query)

    design = contrast_column.Design
    indexed_features = design.DataSet.feature_data.set_index('FeatureIndex')

    # Constant annotation columns identifying where the rows came from.
    dge_table['Contrast'] = contrast_column.Name
    dge_table['Design'] = design.Name
    dge_table['DGEMethod'] = method_name

    return dge_table.join(indexed_features, on='FeatureIndex', rsuffix="Feature")
def list_gse_tables(session, contrast, design, dataset, dge_method, gse_method):
    """List GSE result tables together with their number of entries.

    One row is produced per ``GSEtable.GSEresultID`` group, carrying the
    data set, design, contrast, DGE method and GSE method names plus the
    count of ``GSEtable.GeneSetID`` values in that group.

    Args:
        session: Database session used for the query.
        contrast: Contrast name to restrict to, or ``None`` for no restriction.
        design: Design name to restrict to, or ``None``.
        dataset: Data set name to restrict to, or ``None``.
        dge_method: DGE method name to restrict to, or ``None``.
        gse_method: GSE method name to restrict to, or ``None``.

    Returns:
        Whatever ``as_data_frame`` produces for the query.
    """
    selected = (
        DataSet.Name.label("Data Set"),
        Design.Name.label("Design"),
        Contrast.Name.label("Contrast"),
        DGEmethod.Name.label("DGE Method"),
        GSEmethod.Name.label("GSE Method"),
        func.count(GSEtable.GeneSetID).label("Entry Count"),
    )
    q = (
        session.query(*selected)
        .filter(
            Contrast.DesignID == Design.ID,
            Design.DataSetID == DataSet.ID,
            DGEmodel.ContrastID == Contrast.ID,
            DGEmodel.DGEmethodID == DGEmethod.ID,
            GSEresult.ContrastID == Contrast.ID,
            GSEresult.DGEmethodID == DGEmethod.ID,
            GSEresult.GSEmethodID == GSEmethod.ID,
            GSEtable.GSEresultID == GSEresult.ID,
        )
        .group_by(GSEtable.GSEresultID)
    )

    # Each optional argument narrows the result by name when it is given.
    name_restrictions = (
        (Contrast.Name, contrast),
        (Design.Name, design),
        (DataSet.Name, dataset),
        (DGEmethod.Name, dge_method),
        (GSEmethod.Name, gse_method),
    )
    for name_column, wanted in name_restrictions:
        if wanted is not None:
            q = q.filter(name_column == wanted)

    return as_data_frame(q)
def list_dge_models(session, contrast_name, design_name, dataset_name, method_name):
    """List DGE models as a table of data set, design, contrast and method names.

    The query itself is built by ``query_dge_models``; this function only
    supplies the labelled name columns to select and converts the result.

    Args:
        session: Database session used for the query.
        contrast_name: Forwarded unchanged to ``query_dge_models``.
        design_name: Forwarded unchanged to ``query_dge_models``.
        dataset_name: Forwarded unchanged to ``query_dge_models``.
        method_name: Forwarded unchanged to ``query_dge_models``.

    Returns:
        Whatever ``as_data_frame`` produces for the query.
    """
    name_columns = (
        DataSet.Name.label("DataSet"),
        Design.Name.label("Design"),
        Contrast.Name.label("Contrast"),
        DGEmethod.Name.label("Method"),
    )
    models_query = query_dge_models(
        session, contrast_name, design_name, dataset_name, method_name, *name_columns
    )
    return as_data_frame(models_query)
def list_design(session, ds_name=None):
    """List designs together with the name of the data set they belong to.

    Args:
        session: Database session used for the query.
        ds_name: Data set name to restrict to; ``None`` lists the designs of
            all data sets.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with the columns
        name, description, data set name (labelled ``DataSet``), variable
        count and creator.
    """
    criteria = [Design.DataSetID == DataSet.ID]
    if ds_name is not None:
        criteria.append(DataSet.Name == ds_name)

    design_query = session.query(
        Design.Name,
        Design.Description,
        DataSet.Name.label("DataSet"),
        Design.VariableCount,
        Design.CreatedBy,
    ).filter(*criteria)
    return as_data_frame(design_query)
def get_gmt_locations(session: Session, gene_set_category_filter: List[str] = None):
    """List gene sets with the name and working-copy file of their category.

    Args:
        session: Database session used for the query.
        gene_set_category_filter: Category names to restrict to. ``None`` or
            an empty list applies no restriction.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with the columns
        ``Category``, ``FileWC``, ``ID`` and ``Name``.
    """
    locations = session.query(
        GeneSetCategory.Name.label("Category"),
        GeneSetCategory.FileWC,
        GeneSet.ID,
        GeneSet.Name,
    ).filter(GeneSetCategory.ID == GeneSet.CategoryID)

    if not gene_set_category_filter:
        return as_data_frame(locations)

    restricted = locations.filter(GeneSetCategory.Name.in_(gene_set_category_filter))
    return as_data_frame(restricted)
def list_contrast(session, design_name=None, ds_name=None):
    """List contrasts together with the names of their design and data set.

    Args:
        session: Database session used for the query.
        design_name: Design name to restrict to, or ``None`` for all designs.
        ds_name: Data set name to restrict to, or ``None`` for all data sets.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with the columns
        name, description, ``Design``, ``DataSet`` and creator.
    """
    criteria = [
        Contrast.DesignID == Design.ID,
        Design.DataSetID == DataSet.ID,
    ]
    if design_name is not None:
        criteria.append(Design.Name == design_name)
    if ds_name is not None:
        criteria.append(DataSet.Name == ds_name)

    contrast_query = session.query(
        Contrast.Name,
        Contrast.Description,
        Design.Name.label("Design"),
        DataSet.Name.label("DataSet"),
        Contrast.CreatedBy,
    ).filter(*criteria)
    return as_data_frame(contrast_query)
def add_gmt(session, roger_wd_dir, category_name, file, tax_id, description=None):
    """Import a GMT file as a new gene set category and persist its gene sets.

    The file is parsed with ``gsea_gmt_parser``, copied into the gene set
    sub-folder of the working directory, registered as a ``GeneSetCategory``
    and every parsed gene set is stored as a ``GeneSet``. Gene symbols are
    then matched against the gene annotation of ``tax_id`` and the matching
    rows are bulk-inserted into the ``GeneSetGene`` table. The session is
    committed at the end.

    Args:
        session: Database session used for queries and inserts.
        roger_wd_dir: Working directory that holds the gene set sub-folder.
        category_name: Name of the new gene set category.
        file: Path to the GMT file to import.
        tax_id: Taxon ID whose gene annotation is used to resolve symbols;
            also stored on every created gene set.
        description: Optional description stored on every created gene set.

    Returns:
        Fraction (0..1) of gene rows from the GMT file that were not
        inserted, i.e. rows without a matching annotation or rows dropped as
        duplicates. ``0.0`` when the file yielded no gene rows at all.
    """
    gene_anno = as_data_frame(
        session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == tax_id)
    )
    # TODO Make min_size configurable?
    gmt = gsea_gmt_parser(file, min_size=1, max_size=sys.maxsize)

    gene_sets_path = os.path.join(roger_wd_dir, GENE_SET_SUB_FOLDER)
    file_copy_path = os.path.join(gene_sets_path, os.path.basename(file))

    category = GeneSetCategory(
        Name=category_name,
        FileWC=file_copy_path,
        FileSrc=os.path.abspath(file),
    )
    session.add(category)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(gene_sets_path, exist_ok=True)
    shutil.copy(file, file_copy_path)
    session.flush()

    gene_sets = [
        GeneSet(
            Category=category,
            Name=gene_set_name,
            TaxID=tax_id,
            Description=description,
            GeneCount=len(genes),
            IsPrivate=False,
        )
        for gene_set_name, genes in gmt.items()
    ]
    session.add_all(gene_sets)
    # Flush so that the database-generated gene set IDs are available below.
    session.flush()

    gene_set_dict = {gene_set.Name: gene_set for gene_set in gene_sets}

    # Long-format table: one row per (gene set, gene symbol) pair.
    gene_set_data = {'GeneSetID': [], 'GeneSymbol': []}
    for gene_set_name, genes in gmt.items():
        gene_set_data['GeneSetID'].extend([gene_set_dict[gene_set_name].ID] * len(genes))
        gene_set_data['GeneSymbol'].extend(genes)
    genes_table = pd.DataFrame.from_dict(gene_set_data)

    annotated_genes = genes_table.join(gene_anno.set_index('GeneSymbol'), on='GeneSymbol')

    # Filter out non-matching genes.
    # NOTE(review): keep=False removes *every* row of a duplicated
    # (RogerGeneIndex, GeneSetID) pair instead of keeping one of them -
    # confirm this is intended.
    matched_genes = annotated_genes[annotated_genes.RogerGeneIndex.notna()] \
        .drop_duplicates(subset=['RogerGeneIndex', 'GeneSetID'], keep=False)

    # Bulk insert all gene set genes
    insert_data_frame(session, matched_genes, GeneSetGene.__table__, chunk_size=100000)
    session.commit()

    # Report the share of gene rows that could not be matched with the gene
    # annotation; guard against a GMT file that produced no gene rows.
    total_rows = annotated_genes.shape[0]
    if total_rows == 0:
        return 0.0
    return (total_rows - matched_genes.shape[0]) / float(total_rows)
def list_gmt(session):
    """List gene set categories with the number of gene sets in each.

    Args:
        session: Database session used for the query.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with the category
        name and a ``GeneSetCount`` column.
    """
    gene_set_count = func.count(GeneSet.ID).label("GeneSetCount")
    per_category = (
        session.query(GeneSetCategory.Name, gene_set_count)
        .filter(GeneSetCategory.ID == GeneSet.CategoryID)
        .group_by(GeneSetCategory.Name)
    )
    return as_data_frame(per_category)
def list_methods(session):
    """List GSE methods together with the DGE method each one references.

    Args:
        session: Database session used for the query.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with the GSE
        method name and description and a ``Reference DGE Method`` column.
    """
    reference_method = DGEmethod.Name.label("Reference DGE Method")
    gse_methods = session.query(
        GSEmethod.Name,
        GSEmethod.Description,
        reference_method,
    ).filter(GSEmethod.DGEmethodID == DGEmethod.ID)
    return as_data_frame(gse_methods)
def list_ds(session):
    """List all data sets with their summary columns.

    Args:
        session: Database session used for the query.

    Returns:
        Whatever ``as_data_frame`` produces for the query, with name, type,
        feature count, sample count, creator and cross reference.
    """
    summary_columns = (
        DataSet.Name,
        DataSet.Type,
        DataSet.FeatureCount,
        DataSet.SampleCount,
        DataSet.CreatedBy,
        DataSet.Xref,
    )
    return as_data_frame(session.query(*summary_columns))
def list_methods(session):
    """List all DGE methods with their name and description.

    Args:
        session: Database session used for the query.

    Returns:
        Whatever ``as_data_frame`` produces for the query.
    """
    dge_methods = session.query(DGEmethod.Name, DGEmethod.Description)
    return as_data_frame(dge_methods)
def add_species(session, dataset_name, tax_id):
    """Import the gene annotation of a species and its orthologs to human.

    Gene annotation is fetched from the annotation service for
    ``dataset_name`` and inserted into the ``GeneAnnotation`` table with
    freshly allocated ``RogerGeneIndex`` values. Afterwards ortholog pairs
    are inserted into the ``Ortholog`` table: for the human species each gene
    is paired with itself, for any other species the pairs come from a
    homolog query against the human data set. The session is committed at
    the end.

    Args:
        session: Database session used for queries and inserts.
        dataset_name: Name of the annotation service data set; the homolog
            attribute and filter names are derived from it by replacing the
            ``_gene_ensembl`` suffix pattern.
        tax_id: Taxon ID stored with every imported gene.

    Raises:
        ROGERUsageError: If a non-human species is added before human
            annotation exists, or if ``tax_id`` is already present.
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()

    # Check which species are already present in the database.
    species_table = list_species(session)
    human_missing = species_table[species_table.TaxID == human_tax_id].empty
    if human_missing and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    already_imported = not species_table[species_table.TaxID == tax_id].empty
    if already_imported:
        raise ROGERUsageError('Species already exists in database: %s' % dataset_name)

    dataset_pattern = re.compile(r'(\w+)_gene_ensembl')
    homolog_attr = dataset_pattern.sub(r'\1_homolog_ensembl_gene', dataset_name)
    homolog_filter = dataset_pattern.sub(r'with_\1_homolog', dataset_name)

    # Insert gene annotation.
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    # The version label is the data set name plus the parenthesised part of
    # the data set's display name.
    release = re.search(r'[^(]+\(([^)]+)\)', dataset.display_name).group(1)
    version = "%s %s" % (dataset_name, release)

    requested_attributes = [
        "ensembl_gene_id",
        "entrezgene",
        "gene_biotype",
        "external_gene_name",
    ]
    gene_anno = dataset.get_bulk_query(params={'attributes': requested_attributes})

    first_index = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)
    gene_count = gene_anno.shape[0]
    genes = DataFrame({
        'RogerGeneIndex': range(first_index, first_index + gene_count),
        'Version': version,
        'TaxID': tax_id,
        'EnsemblGeneID': gene_anno["ensembl_gene_id"],
        'EntrezGeneID': gene_anno["entrezgene"],
        'GeneType': gene_anno["gene_biotype"],
        'GeneSymbol': gene_anno["external_gene_name"],
        'IsObsolete': False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)

    # Insert orthologs.
    if tax_id == human_tax_id:
        # Human genes are their own orthologs.
        species_index = genes["RogerGeneIndex"]
        human_index = genes["RogerGeneIndex"]
    else:
        human_anno = as_data_frame(
            session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == human_tax_id)
        )
        ortho = annotation_service.get_bulk_query(
            human_dataset,
            params={
                'attributes': ["ensembl_gene_id", homolog_attr],
                'filters': {homolog_filter: True}
            },
        )
        # Resolve the human gene first, then the homolog of the new species;
        # clashing columns of the two annotation tables are told apart by
        # the suffixes "Human" and "Other".
        with_human = ortho.join(human_anno.set_index('EnsemblGeneID'), on='ensembl_gene_id')
        merged_ortho = with_human.join(
            genes.set_index('EnsemblGeneID'),
            on=homolog_attr,
            lsuffix='Human',
            rsuffix='Other',
        )
        species_index = merged_ortho["RogerGeneIndexOther"]
        human_index = merged_ortho["RogerGeneIndexHuman"]

    orthologs = DataFrame({
        'RogerGeneIndex': species_index,
        'HumanRogerGeneIndex': human_index
    })
    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
def list_species(session):
    """List the distinct (TaxID, Version) combinations of the gene annotation.

    Args:
        session: Database session used for the query.

    Returns:
        Whatever ``as_data_frame`` produces for the query, one row per
        ``TaxID``/``Version`` group.
    """
    species_key = (GeneAnnotation.TaxID, GeneAnnotation.Version)
    species_query = session.query(*species_key).group_by(*species_key)
    return as_data_frame(species_query)