Пример #1
0
def get_dge_tbl(session, contrast_name, design_name, dataset_name,
                method_name):
    """Load the DGE table of one contrast column, joined with feature data.

    The contrast column is looked up by its own name together with the
    names of the design, data set and DGE method it belongs to.

    No return annotation is given on purpose: the value returned is the
    result of ``as_data_frame(...).join(...)``, not a ``DGEmodel`` instance.

    Args:
        session: Database session used for both queries.
        contrast_name: Name of the contrast column to load.
        design_name: Name of the design the contrast belongs to.
        dataset_name: Name of the data set the design belongs to.
        method_name: Name of the DGE method of the model.

    Returns:
        The table built by ``as_data_frame`` from the matching ``DGEtable``
        rows, extended by the columns ``Contrast``, ``Design`` and
        ``DGEMethod`` and joined with the data set's feature data on
        ``FeatureIndex``. Overlapping feature columns get the suffix
        ``Feature``.

    Raises:
        ROGERUsageError: If no contrast column matches the given names.
    """
    # NOTE(review): `one_or_none()` presumably raises when more than one
    # contrast column matches - confirm against the SQLAlchemy version used.
    contrast = session.query(ContrastColumn) \
        .filter(DGEmodel.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmodel.ContrastID == Contrast.ID) \
        .filter(Contrast.DesignID == Design.ID) \
        .filter(Design.DataSetID == DataSet.ID) \
        .filter(ContrastColumn.ContrastID == Contrast.ID) \
        .filter(Design.Name == design_name) \
        .filter(DataSet.Name == dataset_name) \
        .filter(DGEmethod.Name == method_name) \
        .filter(ContrastColumn.Name == contrast_name).one_or_none()
    if contrast is None:
        raise ROGERUsageError("Model for %s:%s:%s does not exists" %
                              (dataset_name, design_name, contrast_name))

    dge_table = as_data_frame(
        session.query(DGEtable).filter(
            DGEtable.ContrastColumnID == contrast.ID))
    feature_data = contrast.Design.DataSet.feature_data

    # Annotate every row with the identifiers it was selected by, so the
    # table stays self-describing once it leaves this function.
    dge_table['Contrast'] = contrast.Name
    dge_table['Design'] = contrast.Design.Name
    dge_table['DGEMethod'] = method_name
    return dge_table.join(feature_data.set_index('FeatureIndex'),
                          on='FeatureIndex',
                          rsuffix="Feature")
Пример #2
0
def list_gse_tables(session, contrast, design, dataset, dge_method,
                    gse_method):
    """List GSE result tables together with their number of entries.

    Each of ``contrast``, ``design``, ``dataset``, ``dge_method`` and
    ``gse_method`` is an optional name filter; passing ``None`` leaves the
    corresponding column unrestricted.

    Returns:
        ``as_data_frame`` applied to a query with the columns ``Data Set``,
        ``Design``, ``Contrast``, ``DGE Method``, ``GSE Method`` and
        ``Entry Count``, grouped by GSE result.
    """
    selected_columns = (
        DataSet.Name.label("Data Set"),
        Design.Name.label("Design"),
        Contrast.Name.label("Contrast"),
        DGEmethod.Name.label("DGE Method"),
        GSEmethod.Name.label("GSE Method"),
        func.count(GSEtable.GeneSetID).label("Entry Count"),
    )
    # Conditions tying the involved tables together.
    join_conditions = (
        Contrast.DesignID == Design.ID,
        Design.DataSetID == DataSet.ID,
        DGEmodel.ContrastID == Contrast.ID,
        DGEmodel.DGEmethodID == DGEmethod.ID,
        GSEresult.ContrastID == Contrast.ID,
        GSEresult.DGEmethodID == DGEmethod.ID,
        GSEresult.GSEmethodID == GSEmethod.ID,
        GSEtable.GSEresultID == GSEresult.ID,
    )

    q = session.query(*selected_columns)
    for condition in join_conditions:
        q = q.filter(condition)
    q = q.group_by(GSEtable.GSEresultID)

    # Optional name filters, applied only for arguments that were given.
    name_filters = (
        (Contrast.Name, contrast),
        (Design.Name, design),
        (DataSet.Name, dataset),
        (DGEmethod.Name, dge_method),
        (GSEmethod.Name, gse_method),
    )
    for name_column, wanted in name_filters:
        if wanted is not None:
            q = q.filter(name_column == wanted)
    return as_data_frame(q)
Пример #3
0
def list_dge_models(session, contrast_name, design_name, dataset_name,
                    method_name):
    """List DGE models as a table of data set, design, contrast and method.

    The four name arguments are handed unchanged to ``query_dge_models``
    together with the labelled columns to select; how they restrict the
    result is decided by that helper.

    Returns:
        ``as_data_frame`` applied to the query built by ``query_dge_models``,
        with the columns ``DataSet``, ``Design``, ``Contrast`` and ``Method``.
    """
    labelled_columns = (
        DataSet.Name.label("DataSet"),
        Design.Name.label("Design"),
        Contrast.Name.label("Contrast"),
        DGEmethod.Name.label("Method"),
    )
    models_query = query_dge_models(session, contrast_name, design_name,
                                    dataset_name, method_name,
                                    *labelled_columns)
    return as_data_frame(models_query)
Пример #4
0
def list_design(session, ds_name=None):
    """List designs together with the name of the data set they belong to.

    Args:
        session: Database session to run the query on.
        ds_name: Optional data set name; when given, only designs of that
            data set are listed.

    Returns:
        ``as_data_frame`` applied to a query selecting the design name,
        description, data set name (labelled ``DataSet``), variable count
        and creator.
    """
    columns = (
        Design.Name,
        Design.Description,
        DataSet.Name.label("DataSet"),
        Design.VariableCount,
        Design.CreatedBy,
    )
    designs = session.query(*columns).filter(Design.DataSetID == DataSet.ID)
    if ds_name is None:
        return as_data_frame(designs)
    return as_data_frame(designs.filter(DataSet.Name == ds_name))
Пример #5
0
def get_gmt_locations(session: Session,
                      gene_set_category_filter: List[str] = None):
    """List gene sets together with their category and the category's file.

    Args:
        session: Database session to run the query on.
        gene_set_category_filter: Category names to restrict the result to.
            ``None`` and an empty list both mean "no restriction".

    Returns:
        ``as_data_frame`` applied to a query selecting the category name
        (labelled ``Category``), the category's ``FileWC`` value, and the
        gene set ID and name.
    """
    columns = (
        GeneSetCategory.Name.label("Category"),
        GeneSetCategory.FileWC,
        GeneSet.ID,
        GeneSet.Name,
    )
    locations = session.query(*columns) \
        .filter(GeneSetCategory.ID == GeneSet.CategoryID)

    if not gene_set_category_filter:
        return as_data_frame(locations)

    wanted_categories = GeneSetCategory.Name.in_(gene_set_category_filter)
    return as_data_frame(locations.filter(wanted_categories))
Пример #6
0
def list_contrast(session, design_name=None, ds_name=None):
    """List contrasts together with their design and data set names.

    Args:
        session: Database session to run the query on.
        design_name: Optional design name to restrict the listing to.
        ds_name: Optional data set name to restrict the listing to.

    Returns:
        ``as_data_frame`` applied to a query selecting the contrast name,
        description, design name (labelled ``Design``), data set name
        (labelled ``DataSet``) and creator.
    """
    columns = (
        Contrast.Name,
        Contrast.Description,
        Design.Name.label("Design"),
        DataSet.Name.label("DataSet"),
        Contrast.CreatedBy,
    )
    contrasts = session.query(*columns) \
        .filter(Contrast.DesignID == Design.ID) \
        .filter(Design.DataSetID == DataSet.ID)

    # Only narrow the listing by the names that were actually supplied.
    for name_column, wanted in ((Design.Name, design_name),
                                (DataSet.Name, ds_name)):
        if wanted is not None:
            contrasts = contrasts.filter(name_column == wanted)
    return as_data_frame(contrasts)
Пример #7
0
def add_gmt(session,
            roger_wd_dir,
            category_name,
            file,
            tax_id,
            description=None):
    """Import the gene sets of a GMT file as a new gene set category.

    The file is parsed, copied into the gene set sub folder of the working
    directory, and stored as one ``GeneSetCategory`` with one ``GeneSet``
    per parsed gene set. Gene symbols are matched against the gene
    annotation of ``tax_id``; the matched rows are bulk inserted and the
    session is committed.

    Args:
        session: Database session used for all reads and writes.
        roger_wd_dir: Working directory that holds the gene set sub folder.
        category_name: Name of the gene set category to create.
        file: Path of the GMT file to import.
        tax_id: Taxonomy ID whose gene annotation is used for matching.
        description: Optional description stored on every created gene set.

    Returns:
        The fraction (float) of gene rows that were not inserted, i.e. rows
        without a matching gene annotation or dropped as duplicates.
        ``0.0`` is returned when the file yields no gene rows at all.
    """
    gene_anno = as_data_frame(
        session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == tax_id))
    # TODO Make min_size configurable?
    gmt = gsea_gmt_parser(file, min_size=1, max_size=sys.maxsize)

    gene_sets_path = os.path.join(roger_wd_dir, GENE_SET_SUB_FOLDER)
    file_copy_path = os.path.join(gene_sets_path, os.path.basename(file))

    category = GeneSetCategory(Name=category_name,
                               FileWC=file_copy_path,
                               FileSrc=os.path.abspath(file))
    session.add(category)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(gene_sets_path, exist_ok=True)
    shutil.copy(file, file_copy_path)

    session.flush()

    gene_sets = [
        GeneSet(Category=category,
                Name=gene_set_name,
                TaxID=tax_id,
                Description=description,
                GeneCount=len(genes),
                IsPrivate=False) for gene_set_name, genes in gmt.items()
    ]
    session.add_all(gene_sets)
    # Flush so the ID of every new gene set can be read below.
    session.flush()
    gene_set_dict = {gene_set.Name: gene_set for gene_set in gene_sets}

    # Build one (GeneSetID, GeneSymbol) row per gene of every gene set.
    gene_set_data = {'GeneSetID': [], 'GeneSymbol': []}
    for gene_set_name, genes in gmt.items():
        gene_set_data['GeneSetID'] += [gene_set_dict[gene_set_name].ID
                                       ] * len(genes)
        gene_set_data['GeneSymbol'] += genes

    genes_table = pd.DataFrame.from_dict(gene_set_data)
    annotated_genes = genes_table.join(gene_anno.set_index('GeneSymbol'),
                                       on='GeneSymbol')
    # Filter out non-matching genes
    # NOTE(review): keep=False drops every row of a duplicated
    # (RogerGeneIndex, GeneSetID) pair, not just the repeats - confirm that
    # this is intended.
    matched_genes = annotated_genes[annotated_genes.RogerGeneIndex.notna()] \
        .drop_duplicates(subset=['RogerGeneIndex', 'GeneSetID'], keep=False)

    # Bulk insert all gene set genes
    insert_data_frame(session,
                      matched_genes,
                      GeneSetGene.__table__,
                      chunk_size=100000)
    session.commit()

    # Report number of gene symbols that could not be matched with gene annotation
    total_rows = annotated_genes.shape[0]
    if total_rows == 0:
        # Nothing to match; avoid dividing by zero after the commit.
        return 0.0
    return (total_rows - matched_genes.shape[0]) / float(total_rows)
Пример #8
0
def list_gmt(session):
    """List gene set categories with the number of gene sets in each.

    Only categories that have at least one gene set appear, because the
    query requires a gene set whose ``CategoryID`` equals the category ID.

    Returns:
        ``as_data_frame`` applied to a query selecting the category name and
        the count of gene set IDs (labelled ``GeneSetCount``).
    """
    gene_set_count = func.count(GeneSet.ID).label("GeneSetCount")
    categories = session.query(GeneSetCategory.Name, gene_set_count)
    categories = categories.filter(GeneSetCategory.ID == GeneSet.CategoryID)
    categories = categories.group_by(GeneSetCategory.Name)
    return as_data_frame(categories)
Пример #9
0
def list_methods(session):
    """List GSE methods together with the DGE method each one refers to.

    Returns:
        ``as_data_frame`` applied to a query selecting the GSE method name
        and description plus the name of the linked DGE method (labelled
        ``Reference DGE Method``).
    """
    reference_method = DGEmethod.Name.label("Reference DGE Method")
    methods = session.query(GSEmethod.Name, GSEmethod.Description,
                            reference_method)
    methods = methods.filter(GSEmethod.DGEmethodID == DGEmethod.ID)
    return as_data_frame(methods)
Пример #10
0
def list_ds(session):
    """List all data sets.

    Returns:
        ``as_data_frame`` applied to a query selecting each data set's name,
        type, feature count, sample count, creator and cross reference.
    """
    columns = (
        DataSet.Name,
        DataSet.Type,
        DataSet.FeatureCount,
        DataSet.SampleCount,
        DataSet.CreatedBy,
        DataSet.Xref,
    )
    data_sets = session.query(*columns)
    return as_data_frame(data_sets)
Пример #11
0
def list_methods(session):
    """List all DGE methods.

    Returns:
        ``as_data_frame`` applied to a query selecting the name and
        description of every DGE method.
    """
    methods = session.query(DGEmethod.Name, DGEmethod.Description)
    return as_data_frame(methods)
Пример #12
0
def add_species(session, dataset_name, tax_id):
    """Import gene annotation and human ortholog mappings for one species.

    Gene annotation is fetched from the annotation service data set named
    ``dataset_name`` and inserted with freshly assigned ``RogerGeneIndex``
    values. For the human species every gene is recorded as its own
    ortholog; for any other species the ortholog pairs are obtained from
    the human data set of the annotation service. The session is committed
    once both inserts were issued.

    Args:
        session: Database session used for all reads and writes.
        dataset_name: Name of the annotation service data set to import.
            The homolog attribute and filter names are derived from it by
            rewriting its ``<prefix>_gene_ensembl`` part.
        tax_id: Taxonomy ID stored with the imported annotation.

    Raises:
        ROGERUsageError: If a non-human species is imported while no human
            annotation is present, or if annotation for ``tax_id`` already
            exists.
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()

    # Check what is already present in the database before importing.
    known_species = list_species(session)

    human_rows = known_species[known_species.TaxID == human_tax_id]
    if human_rows.empty and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    species_rows = known_species[known_species.TaxID == tax_id]
    if not species_rows.empty:
        raise ROGERUsageError('Species already exists in database: %s' %
                              dataset_name)

    homolog_attr = re.sub(r'(\w+)_gene_ensembl', r'\1_homolog_ensembl_gene',
                          dataset_name)
    homolog_filter = re.sub(r'(\w+)_gene_ensembl', r'with_\1_homolog',
                            dataset_name)

    # Insert gene annotation
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    # The version label is the text inside the first parentheses of the
    # display name; a display name without parentheses makes this fail.
    release = re.search(r'[^(]+\(([^)]+)\)', dataset.display_name).group(1)
    version = "%s %s" % (dataset_name, release)

    gene_anno = dataset.get_bulk_query(
        params={
            'attributes': [
                "ensembl_gene_id", "entrezgene", "gene_biotype",
                "external_gene_name"
            ]
        })

    first_index = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)
    gene_count = gene_anno.shape[0]

    genes = DataFrame({
        'RogerGeneIndex': range(first_index, first_index + gene_count),
        'Version': version,
        'TaxID': tax_id,
        'EnsemblGeneID': gene_anno["ensembl_gene_id"],
        'EntrezGeneID': gene_anno["entrezgene"],
        'GeneType': gene_anno["gene_biotype"],
        'GeneSymbol': gene_anno["external_gene_name"],
        'IsObsolete': False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)

    # Insert orthologs
    if tax_id == human_tax_id:
        # Human genes map onto themselves.
        orthologs = DataFrame({
            'RogerGeneIndex': genes["RogerGeneIndex"],
            'HumanRogerGeneIndex': genes["RogerGeneIndex"]
        })
    else:
        human_anno = as_data_frame(
            session.query(GeneAnnotation).filter(
                GeneAnnotation.TaxID == human_tax_id))
        ortho = annotation_service.get_bulk_query(
            human_dataset,
            params={
                'attributes': ["ensembl_gene_id", homolog_attr],
                'filters': {
                    homolog_filter: True
                }
            })
        # Attach the stored human annotation first, then the genes imported
        # above; the suffixes tell the two RogerGeneIndex columns apart.
        with_human = ortho.join(human_anno.set_index('EnsemblGeneID'),
                                on='ensembl_gene_id')
        merged_ortho = with_human.join(genes.set_index('EnsemblGeneID'),
                                       on=homolog_attr,
                                       lsuffix='Human',
                                       rsuffix='Other')
        orthologs = DataFrame({
            'RogerGeneIndex': merged_ortho["RogerGeneIndexOther"],
            'HumanRogerGeneIndex': merged_ortho["RogerGeneIndexHuman"]
        })

    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
Пример #13
0
def list_species(session):
    """List the distinct taxonomy ID / annotation version pairs in the database.

    Returns:
        ``as_data_frame`` applied to a query selecting ``TaxID`` and
        ``Version`` of the gene annotation, grouped by both columns.
    """
    species_columns = (GeneAnnotation.TaxID, GeneAnnotation.Version)
    species = session.query(*species_columns).group_by(*species_columns)
    return as_data_frame(species)