Exemplo n.º 1
0
def get_ident(form_field):
    """Return the request value submitted as ``form_field`` once validated.

    Args:
        form_field: Name of the request parameter to read.

    Returns:
        The submitted value, unchanged.

    Raises:
        ROGERUsageError: If the parameter is absent from ``request.values``
            or its value is not matched by ``IDENT_PATTERN``.
    """
    submitted = request.values
    if form_field not in submitted:
        raise ROGERUsageError('No \'%s\' part' % form_field)

    ident = submitted[form_field]
    # Accept the value only when it matches the identifier pattern
    if IDENT_PATTERN.match(ident):
        return ident
    raise ROGERUsageError("Invalid value for `%s': `%s`" % (form_field, ident))
Exemplo n.º 2
0
def remove_species(session, tax_id):
    """Delete all gene annotation rows of the species ``tax_id`` and commit.

    Args:
        session: Database session used for the lookup and the deletion.
        tax_id: Taxon id of the species to remove.

    Raises:
        ROGERUsageError: If ``tax_id`` equals ``human_tax_id`` or is not
            listed by ``list_species``.
    """
    known_species = list_species(session)
    if tax_id == human_tax_id:
        raise ROGERUsageError(
            'Cannot delete gene annotation from human species')

    # The species must be present before anything is deleted
    matching_rows = known_species[known_species.TaxID == tax_id]
    if matching_rows.empty:
        raise ROGERUsageError('Species does not exist in database: %s' %
                              tax_id)

    annotations = session.query(GeneAnnotation).filter(
        GeneAnnotation.TaxID == tax_id)
    annotations.delete()
    session.commit()
Exemplo n.º 3
0
Arquivo: dge.py Projeto: bedapub/roger
def __check_matrix(ref_columns, matrix, matrix_name, ref_list_name):
    """Validate ``matrix`` against a list of reference names.

    Three checks are performed, in this order:

    1. the matrix has exactly one row per entry of ``ref_columns``;
    2. if the matrix index has ``object`` dtype, the set of its labels
       equals the set of reference names;
    3. every column of the matrix has an integer dtype.

    Args:
        ref_columns: Sized iterable with the reference names.
        matrix: pandas DataFrame to validate.
        matrix_name: Name of the matrix, used in error messages.
        ref_list_name: Name of the reference list, used in error messages.

    Raises:
        ROGERUsageError: If any of the checks fails.
    """
    n_rows = matrix.shape[0]
    n_refs = len(ref_columns)
    if n_refs != n_rows:
        # Report the counts in the same order as the names in the message:
        # first the rows of the matrix, then the number of reference entries
        raise ROGERUsageError(
            "Number of rows in %s does not match the number of %s: %d vs %d" %
            (matrix_name, ref_list_name, n_rows, n_refs))
    if matrix.index.dtype.name == "object" and set(
            matrix.index) != set(ref_columns):
        raise ROGERUsageError("Row names of %s and %s do not match" %
                              (matrix_name, ref_list_name))

    # Iterating over ``dtypes`` also works when column names are duplicated,
    # where ``matrix[col_name]`` would be a DataFrame without ``.dtype``
    for col_name, col_dtype in matrix.dtypes.items():
        if not np.issubdtype(col_dtype, np.integer):
            raise ROGERUsageError("Column '%s' is not an integer type" %
                                  col_name)
Exemplo n.º 4
0
def get_file(form_field):
    """Validate the upload sent as ``form_field``, store it and return it.

    The upload is saved below the ``ROGER_DATA_FOLDER`` configured on the
    current application, under the name produced by ``secure_filename``.

    Args:
        form_field: Name of the file part in the request.

    Returns:
        The upload object taken from ``request.files``.

    Raises:
        ROGERUsageError: If the part is missing, carries no file name, or
            its file name is rejected by ``allowed_file``.
    """
    uploads = request.files
    if form_field not in uploads:
        raise ROGERUsageError('No \'%s\' part' % form_field)

    upload = uploads[form_field]
    original_name = upload.filename
    # Browsers submit an empty part without a file name when nothing is chosen
    if original_name == '':
        raise ROGERUsageError('No file specified for `%s`' % form_field)
    if not allowed_file(original_name):
        raise ROGERUsageError("Invalid file name in `%s`: %s" % (form_field, original_name))

    safe_name = secure_filename(original_name)
    data_folder = current_app().config['ROGER_DATA_FOLDER']
    upload.save(os.path.join(data_folder, safe_name))
    return upload
Exemplo n.º 5
0
Arquivo: dge.py Projeto: bedapub/roger
def add_design(session,
               design_file,
               dataset_name,
               name=None,
               description=None,
               sample_groups_file=None,
               sample_group_levels_file=None,
               sample_group_pheno_column=None):
    """Read a design matrix from file and persist it for a data set.

    Args:
        session: Database session.
        design_file: Tab-separated design matrix file; the first column is
            used as index.
        dataset_name: Name of the data set the design belongs to.
        name: Design name; resolved through ``get_or_guess_name`` together
            with ``design_file``.
        description: Optional description stored with the design.
        sample_groups_file: Optional file read with ``read_array``.
        sample_group_levels_file: Optional file read with ``read_array``.
        sample_group_pheno_column: Optional pheno data column name passed on
            to ``create_design_data``.

    Returns:
        The name under which the design was stored.

    Raises:
        ROGERUsageError: If the data set already has a design of that name.
    """
    dataset = get_ds(session, dataset_name)
    name = get_or_guess_name(name, design_file)

    already_stored = query_design(session, name, dataset_name).one_or_none()
    if already_stored is not None:
        raise ROGERUsageError(
            "Design of data set '%s' with name '%s' already exist" %
            (dataset_name, name))

    design_matrix = read_table(design_file, sep='\t', index_col=0)
    pheno_data = dataset.pheno_data
    sample_groups = read_array(sample_groups_file, nullable=True)
    sample_group_levels = read_array(sample_group_levels_file, nullable=True)
    design_data = create_design_data(design_matrix, pheno_data, name,
                                     description, sample_groups,
                                     sample_group_levels,
                                     sample_group_pheno_column)

    # Flush so that the design receives its ID before the subset rows use it
    design_entry = design_data.design
    design_entry.DataSetID = dataset.ID
    session.add(design_entry)
    session.flush()

    subset = design_data.sample_subset
    subset["DesignID"] = design_entry.ID
    insert_data_frame(session, subset, SampleSubset.__table__)

    session.commit()
    return name
Exemplo n.º 6
0
Arquivo: dge.py Projeto: bedapub/roger
def get_dge_tbl(session, contrast_name, design_name, dataset_name,
                method_name) -> DataFrame:
    """Return the DGE result table of one contrast column, with feature data.

    The contrast column is looked up by contrast column name, design name,
    data set name and DGE method name.  Its ``DGEtable`` rows are loaded into
    a data frame, the columns ``Contrast``, ``Design`` and ``DGEMethod`` are
    added, and the feature data of the data set is joined on
    ``FeatureIndex``.

    Args:
        session: Database session.
        contrast_name: Name of the contrast column.
        design_name: Name of the design.
        dataset_name: Name of the data set.
        method_name: Name of the DGE method.

    Returns:
        A data frame (not a ``DGEmodel``) holding the DGE table joined with
        the feature annotation; overlapping feature columns get the suffix
        ``Feature``.

    Raises:
        ROGERUsageError: If no matching contrast column exists.
    """
    contrast = session.query(ContrastColumn) \
        .filter(DGEmodel.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmodel.ContrastID == Contrast.ID) \
        .filter(Contrast.DesignID == Design.ID) \
        .filter(Design.DataSetID == DataSet.ID) \
        .filter(ContrastColumn.ContrastID == Contrast.ID) \
        .filter(Design.Name == design_name) \
        .filter(DataSet.Name == dataset_name) \
        .filter(DGEmethod.Name == method_name) \
        .filter(ContrastColumn.Name == contrast_name).one_or_none()
    if contrast is None:
        raise ROGERUsageError("Model for %s:%s:%s does not exists" %
                              (dataset_name, design_name, contrast_name))

    dge_table = as_data_frame(
        session.query(DGEtable).filter(
            DGEtable.ContrastColumnID == contrast.ID))
    feature_data = contrast.Design.DataSet.feature_data
    dge_table['Contrast'] = contrast.Name
    dge_table['Design'] = contrast.Design.Name
    dge_table['DGEMethod'] = method_name
    return dge_table.join(feature_data.set_index('FeatureIndex'),
                          on='FeatureIndex',
                          rsuffix="Feature")
Exemplo n.º 7
0
Arquivo: dge.py Projeto: bedapub/roger
def get_design(session, design_name, ds_name) -> Design:
    """Fetch the design ``design_name`` of data set ``ds_name``.

    Raises:
        ROGERUsageError: If no such design exists.
    """
    found = query_design(session, design_name, ds_name).one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError(
        "Design of data set '%s' with name '%s' does not exist" %
        (ds_name, design_name))
Exemplo n.º 8
0
Arquivo: dge.py Projeto: bedapub/roger
def annotate_ds_pheno_data(gct_data, pheno_data=None):
    """Make sure the pheno data carries the sample names of ``gct_data``.

    If the pheno data has no ``ROGER_SAMPLE_NAME`` column, one is inserted as
    first column, filled with the column names of ``gct_data``.  If the
    column already exists, its values must match the sample names as a set.

    Args:
        gct_data: Expression data frame with one column per sample.
        pheno_data: Optional pheno data frame.  A data frame that is passed
            in is modified in place.  When omitted, a new empty data frame
            is created for each call.

    Returns:
        The pheno data frame including the sample name column.

    Raises:
        ROGERUsageError: If a non-empty pheno data frame has a different
            number of rows than there are samples, or if its existing sample
            names differ from the expression data.
    """
    # A ``pd.DataFrame()`` default would be created once and then mutated by
    # ``insert`` below, leaking sample names from one call into the next
    if pheno_data is None:
        pheno_data = pd.DataFrame()

    n_samples = len(gct_data.columns)
    n_pheno_rows = pheno_data.shape[0]
    if n_pheno_rows > 0 and n_pheno_rows != n_samples:
        raise ROGERUsageError(
            "Number of rows in pheno data and number of samples don't match: %d vs %d"
            % (n_pheno_rows, n_samples))

    if ROGER_SAMPLE_NAME not in pheno_data:
        pheno_data.insert(0, ROGER_SAMPLE_NAME, list(gct_data))
    elif set(pheno_data[ROGER_SAMPLE_NAME]) != set(gct_data):
        # Only a pre-existing column can disagree with the expression data
        raise ROGERUsageError(
            "Sample names given by column '%s' don't match the sample names in expression data"
            % ROGER_SAMPLE_NAME)

    return pheno_data
Exemplo n.º 9
0
Arquivo: dge.py Projeto: bedapub/roger
def get_dge_model(session, contrast_name, design_name, dataset_name,
                  method_name) -> DGEmodel:
    """Fetch the DGE model of a contrast computed with ``method_name``.

    Raises:
        ROGERUsageError: If no such model exists.
    """
    candidates = query_dge_models(session, contrast_name, design_name,
                                  dataset_name, method_name, DGEmodel)
    found = candidates.one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError("Model for %s:%s:%s does not exists" %
                          (dataset_name, design_name, contrast_name))
Exemplo n.º 10
0
Arquivo: gse.py Projeto: bedapub/roger
def delete_method(session, name):
    """Delete the GSE method called ``name`` and commit.

    Raises:
        ROGERUsageError: If ``list_methods`` reports no method of that name.
    """
    known_methods = list_methods(session)
    matching_rows = known_methods[known_methods.Name == name]
    if matching_rows.empty:
        raise ROGERUsageError('GSE does not exist in database: %s' % name)

    stored_methods = session.query(GSEmethod).filter(GSEmethod.Name == name)
    stored_methods.delete()
    session.commit()
Exemplo n.º 11
0
Arquivo: dge.py Projeto: bedapub/roger
def get_algorithm(name) -> DGEAlgorithm:
    """Return an instance of the ``DGEAlgorithm`` subclass called ``name``.

    Every subclass found by ``all_subclasses`` is instantiated and registered
    under its ``name`` attribute.

    Raises:
        ROGERUsageError: If no algorithm carries that name.
    """
    instances = [algo_cls() for algo_cls in all_subclasses(DGEAlgorithm)]
    registry = {}
    for instance in instances:
        registry[instance.name] = instance

    if name in registry:
        return registry[name]
    raise ROGERUsageError("Algorithm '%s' does not exist" % name)
Exemplo n.º 12
0
Arquivo: dge.py Projeto: bedapub/roger
def get_contrast(session, contrast_name, design_name, ds_name) -> Contrast:
    """Fetch the contrast ``contrast_name`` of a design of a data set.

    Raises:
        ROGERUsageError: If no such contrast exists.
    """
    candidates = query_contrast(session, contrast_name, design_name, ds_name)
    found = candidates.one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError(
        "Contrast of design '%s' with name '%s' does not exist" %
        (design_name, contrast_name))
Exemplo n.º 13
0
Arquivo: gse.py Projeto: bedapub/roger
def delete_gmt(session, category_name):
    """Delete the gene set category ``category_name`` and commit.

    Raises:
        ROGERUsageError: If ``list_gmt`` reports no category of that name.
    """
    known_categories = list_gmt(session)
    matching_rows = known_categories[known_categories.Name == category_name]
    if matching_rows.empty:
        raise ROGERUsageError('GMT does not exist in database: %s' %
                              category_name)

    stored_categories = session.query(GeneSetCategory).filter(
        GeneSetCategory.Name == category_name)
    stored_categories.delete()
    session.commit()
Exemplo n.º 14
0
Arquivo: gse.py Projeto: bedapub/roger
def get_gse_table(session, contrast, design, dataset, dge_method,
                  gse_method) -> DataFrame:
    """Return the result table of a stored GSE result.

    Raises:
        ROGERUsageError: If ``get_gse_result`` yields nothing truthy for the
            given combination.
    """
    stored_result = get_gse_result(session, contrast, design, dataset,
                                   dge_method, gse_method)
    if stored_result:
        return stored_result.result_table

    raise ROGERUsageError(
        "GSE results for %s:%s:%s:%s:%s do not exist" %
        (contrast, design, dataset, dge_method, gse_method))
Exemplo n.º 15
0
def get_dataset_of(session, tax_id):
    """Return the annotation service dataset registered for ``tax_id``.

    The dataset name is the part before the first space of the ``Version``
    value stored for the species.

    Raises:
        ROGERUsageError: If the species table has no row for ``tax_id``.
    """
    annotation_service = get_annotation_service()

    species_list = roger.persistence.geneanno.list_species(session)
    is_requested = species_list.TaxID == tax_id
    if species_list[is_requested].empty:
        raise ROGERUsageError('Unknown taxon id: %s' % tax_id)

    version = species_list.loc[is_requested, "Version"].values[0]
    dataset_name = version.split(' ')[0]

    return annotation_service.get_dataset(dataset_name)
Exemplo n.º 16
0
def annotate(session, gct_data, tax_id, symbol_type):
    """Annotate the rows of ``gct_data`` with Ensembl and ROGER gene indices.

    Steps:

    1. Resolve the annotation dataset of ``tax_id`` and bulk-query the pairs
       of ``symbol_type`` and ``ensembl_gene_id``.
    2. Join these pairs onto a frame holding one row per row of ``gct_data``
       (columns ``Name`` and ``FeatureIndex``).
    3. Join the gene indices read from the database via the Ensembl gene id.

    Args:
        session: Database session.
        gct_data: Expression data frame; its index holds the feature names.
        tax_id: Taxon id of the species the features belong to.
        symbol_type: Name of the annotation attribute the feature names are
            expressed in.

    Returns:
        Tuple of the feature annotation data frame (sorted by
        ``FeatureIndex``, at most one row per feature) and the
        ``display_name`` of the annotation dataset.

    Raises:
        ROGERUsageError: If every feature row contains a missing value after
            joining the annotation, i.e. not a single feature was annotated.
    """
    ensembl_dataset = get_dataset_of(session, tax_id)

    attributes = ensembl_dataset.attributes

    params = {
        "attributes": [symbol_type, "ensembl_gene_id"],
    }

    # Add the "with_<symbol_type>" filter only if the dataset lists it
    # NOTE(review): if attributes["name"] is a pandas Series, `in` tests the
    # index labels rather than the values — confirm this is intended
    filter_attr = "with_%s" % symbol_type
    if filter_attr in attributes["name"]:
        params["filters"] = {filter_attr: True}

    # Rows with any missing value are discarded from the query result
    all_sym = ensembl_dataset.get_bulk_query(params).dropna()

    # One row per feature, numbered in the order of gct_data
    feature_anno = pd.DataFrame(data={
        "Name": gct_data.index,
        "FeatureIndex": range(0, gct_data.shape[0])
    },
                                index=gct_data.index)
    feature_anno = feature_anno.join(all_sym.set_index(symbol_type))

    # Fail only if each and every row has at least one missing value
    if feature_anno[feature_anno.isnull().any(
            axis=1)].shape[0] == feature_anno.shape[0]:
        raise ROGERUsageError("Unable to annotate features in expression file")

    # TODO Find a better heuristic to drop multiple Ensembl ID association
    # Keep the first row of every feature name
    feature_anno = feature_anno[~feature_anno.index.duplicated(keep='first')]
    feature_anno = feature_anno.set_index("ensembl_gene_id")

    # TODO include origin tax id and origin roger gene index
    # Default query: the gene index doubles as origin gene index and the
    # origin taxon is the literal human_tax_id
    query = session.query(GeneAnnotation.RogerGeneIndex,
                          GeneAnnotation.RogerGeneIndex.label("OriRogerGeneIndex"),
                          literal(human_tax_id).label("OriTaxID"),
                          GeneAnnotation.EnsemblGeneID) \
        .filter_by(TaxID=tax_id)
    # For other species the gene index is the human ortholog's index and the
    # species' own index is kept as origin gene index
    if tax_id != human_tax_id:
        query = session \
            .query(Ortholog.HumanRogerGeneIndex.label("RogerGeneIndex"),
                   Ortholog.RogerGeneIndex.label("OriRogerGeneIndex"),
                   literal(tax_id).label("OriTaxID"),
                   GeneAnnotation.EnsemblGeneID) \
            .filter(GeneAnnotation.RogerGeneIndex == Ortholog.RogerGeneIndex) \
            .filter(GeneAnnotation.TaxID == tax_id)
    roger_gene_indices = roger.logic.util.data.as_data_frame(query)
    # TODO Find a better heuristic to drop multiple Ensembl ID association
    # Keep the first row per FeatureIndex and restore the feature order
    # NOTE(review): dropping the "index" column relies on the joined index
    # being unnamed after the join — confirm with the pandas version in use
    feature_anno = feature_anno.join(roger_gene_indices.set_index("EnsemblGeneID")). \
        drop_duplicates("FeatureIndex"). \
        sort_values('FeatureIndex'). \
        reset_index().drop(columns="index")
    return feature_anno, ensembl_dataset.display_name
Exemplo n.º 17
0
Arquivo: dge.py Projeto: bedapub/roger
def __get_sample_groups(design_data,
                        pheno_data,
                        sample_groups=None,
                        sample_group_pheno_column=None):
    """Determine the sample group of every sample.

    Exactly one source is used: the explicit ``sample_groups`` list, the
    pheno data column ``sample_group_pheno_column``, or, if neither is given,
    a label built from each row of the design matrix by joining
    ``<column>.<value>`` pairs with underscores.

    Raises:
        ROGERUsageError: If both an explicit list and a pheno column are
            given, or the pheno column does not exist in ``pheno_data``.
    """
    has_explicit_groups = sample_groups is not None
    has_pheno_column = sample_group_pheno_column is not None

    if has_explicit_groups and has_pheno_column:
        raise ROGERUsageError(
            "You cannot give a list of sample groups and specify a "
            "sample group column within the pheno data at the same time")

    if has_explicit_groups:
        return sample_groups

    if has_pheno_column:
        if sample_group_pheno_column not in pheno_data:
            raise ROGERUsageError(
                "Column '%s' does not exist in the pheno matrix of the given study"
                % sample_group_pheno_column)
        return pheno_data[sample_group_pheno_column].tolist()

    def label_of(row):
        # One "<column>.<value>" token per design matrix column
        return "_".join("%s.%d" % (key, value) for (key, value) in row.items())

    # Nothing was specified: infer the groups from the design matrix rows
    inferred = design_data.apply(label_of, axis=1)
    return inferred.tolist()
Exemplo n.º 18
0
Arquivo: dge.py Projeto: bedapub/roger
def create_ds(session,
              ds_type: Type[DataSet],
              exprs_file,
              tax_id,
              symbol_type,
              pheno_file=None,
              name=None,
              normalization_method=None,
              description=None,
              xref=None):
    """Parse, annotate and bundle the inputs of a new data set.

    Nothing is written to the database here; the collected values are
    returned as ``DataSetProperties``.

    Args:
        session: Database session used for the validity checks and the
            feature annotation.
        ds_type: Data set class passed through to the result.
        exprs_file: Path of the GCT expression file.
        tax_id: Taxon id of the species.
        symbol_type: Annotation attribute the feature names are expressed in.
        pheno_file: Optional pheno data file read with ``read_df``.
        name: Data set name; resolved through ``get_or_guess_name`` together
            with ``exprs_file``.
        normalization_method: Passed through to the result.
        description: Passed through to the result.
        xref: Passed through to the result.

    Raises:
        ROGERUsageError: If the taxon id is unknown or a data set with the
            resolved name already exists.
    """
    name = get_or_guess_name(name, exprs_file)

    # Validate the species and the uniqueness of the name first
    known_species = list_species(session)
    if known_species[known_species.TaxID == tax_id].empty:
        raise ROGERUsageError('Unknown taxon id: %s' % tax_id)

    same_name = session.query(DataSet).filter(DataSet.Name == name)
    if same_name.one_or_none() is not None:
        raise ROGERUsageError("Data set with name '%s' already exists" % name)

    exprs_data = parse_gct(file_path=exprs_file)
    annotation_data, annotation_version = annotate(session, exprs_data,
                                                   tax_id, symbol_type)

    if pheno_file is not None:
        pheno_data = read_df(pheno_file)
    else:
        pheno_data = pd.DataFrame()
    annotated_pheno_data = annotate_ds_pheno_data(exprs_data, pheno_data)

    return DataSetProperties(ds_type, tax_id, exprs_file, pheno_file,
                             exprs_data, annotated_pheno_data, annotation_data,
                             annotation_version, name, normalization_method,
                             description, xref)
Exemplo n.º 19
0
Arquivo: dge.py Projeto: bedapub/roger
def add_contrast(session,
                 contrast_file,
                 design_name,
                 dataset_name,
                 name=None,
                 description=None):
    """Read a contrast matrix from file and persist it for a design.

    One ``ContrastColumn`` row is stored per column of the contrast matrix;
    the column name is used both as name and as description.

    Args:
        session: Database session.
        contrast_file: Tab-separated contrast matrix file; the first column
            is used as index.
        design_name: Name of the design the contrast belongs to.
        dataset_name: Name of the data set of that design.
        name: Contrast name; resolved through ``get_or_guess_name`` together
            with ``contrast_file``.
        description: Optional description stored with the contrast.

    Returns:
        The name under which the contrast was stored.

    Raises:
        ROGERUsageError: If the design already has a contrast of that name.
    """
    design = get_design(session, design_name, dataset_name)
    name = get_or_guess_name(name, contrast_file)

    already_stored = query_contrast(session, name, design_name,
                                    dataset_name).one_or_none()
    if already_stored is not None:
        raise ROGERUsageError("Contrast '%s' already exist in '%s'" %
                              (name, design_name))

    # Flush so that the contrast receives its ID before the columns use it
    contrast = Contrast(DesignID=design.ID,
                        Name=name,
                        Description=description,
                        CreatedBy=get_current_user_name(),
                        CreationTime=get_current_datetime())
    session.add(contrast)
    session.flush()

    contrast_data = read_table(contrast_file, sep='\t', index_col=0)
    check_contrast_matrix(design.design_matrix.columns, contrast_data)

    column_names = contrast_data.columns
    column_values = [
        contrast_data[column_name].values.tolist()
        for column_name in column_names
    ]
    contrast_table = DataFrame({
        "ContrastID": contrast.ID,
        "DesignID": design.ID,
        "Name": column_names,
        "Description": column_names,
        "ColumnData": column_values
    })

    insert_data_frame(session, contrast_table, ContrastColumn.__table__)

    session.commit()
    return name
Exemplo n.º 20
0
Arquivo: dge.py Projeto: bedapub/roger
def create_design_data(design_data,
                       pheno_data,
                       name=None,
                       description=None,
                       sample_groups=None,
                       sample_group_levels=None,
                       sample_group_pheno_column=None) -> DesignData:
    """Build the ``Design`` entry and the sample subset of a design matrix.

    Args:
        design_data: Design matrix as data frame.
        pheno_data: Pheno data of the data set; must contain the
            ``ROGER_SAMPLE_NAME`` column.
        name: Name stored on the design.
        description: Description stored on the design.
        sample_groups: Optional explicit sample groups.
        sample_group_levels: Optional list of allowed sample groups; defaults
            to the distinct values of the sample groups.
        sample_group_pheno_column: Optional pheno data column holding the
            sample groups.

    Returns:
        ``DesignData`` holding the new design entry and a sample subset that
        marks every sample as used.

    Raises:
        ROGERUsageError: If a sample group is not among the group levels.
    """
    check_design_matrix(pheno_data[ROGER_SAMPLE_NAME], design_data)

    sample_groups = __get_sample_groups(design_data, pheno_data, sample_groups,
                                        sample_group_pheno_column)

    if sample_group_levels is None:
        sample_group_levels = list(set(sample_groups))

    has_unknown_group = any(group not in sample_group_levels
                            for group in sample_groups)
    if has_unknown_group:
        raise ROGERUsageError(
            "Sample group list contains groups that are not part of sample group levels: %s vs %s"
            % (sample_groups, sample_group_levels))

    # TODO make this customizable by user
    n_samples = pheno_data.shape[0]
    sample_subset = DataFrame({
        "SampleIndex": range(0, n_samples),
        "IsUsed": True,
        "Description": "No filtering"
    })

    # TODO make this customizable by user
    design_columns = []
    for column_name in design_data.columns:
        design_columns.append({
            "columnName": column_name,
            "isCovariate": False,
            "values": design_data[column_name].values.tolist()
        })

    design_entry = Design(VariableCount=design_data.shape[1],
                          Name=name,
                          Description=description,
                          DesignMatrix=design_columns,
                          SampleGroups=sample_groups,
                          SampleGroupLevels=sample_group_levels,
                          CreatedBy=get_current_user_name(),
                          CreationTime=get_current_datetime())

    return DesignData(design_entry, sample_subset)
Exemplo n.º 21
0
Arquivo: gse.py Projeto: bedapub/roger
def perform_gse(session: Session,
                roger_wd_dir: str,
                dge_model: DGEmodel,
                algorithm: GSEAlgorithm,
                gene_set_category_filter: List[str] = None):
    """Run a gene set enrichment analysis for a DGE model and persist it.

    The enrichment table produced by ``algorithm`` is written to
    ``<roger_wd_dir>/<GSE_RESULT_SUB_FOLDER>/<contrast id>_<algorithm name>/gse_table.txt``
    and the entries that map to a known gene set are stored in the database.

    Args:
        session: Database session.
        roger_wd_dir: Working directory below which result files are written.
        dge_model: DGE model whose results are tested for enrichment.
        algorithm: GSE algorithm to execute.
        gene_set_category_filter: Optional filter passed on to
            ``get_gmt_locations``.

    Raises:
        ROGERUsageError: If a result for this model and algorithm already
            exists, or if no gene sets are available.
    """
    # Refuse to compute the same result twice
    existing_results = get_gse_result(session, dge_model.Contrast.Name,
                                      dge_model.Contrast.Design.Name,
                                      dge_model.Contrast.Design.DataSet.Name,
                                      dge_model.Method.Name, algorithm.name)
    if existing_results:
        raise ROGERUsageError(
            "Result for %s:%s:%s:%s:%s already exists" %
            (dge_model.Contrast.Name, dge_model.Contrast.Design.Name,
             dge_model.Contrast.Design.DataSet.Name, dge_model.Method.Name,
             algorithm.name))

    # Map every gene set category to its FileWC value
    gene_sets = get_gmt_locations(session, gene_set_category_filter)
    gscs_list = {
        gene_set.Category: gene_set.FileWC
        for index, gene_set in gene_sets.iterrows()
    }

    if len(gscs_list) == 0:
        raise ROGERUsageError(
            "Cannot perform GSE without preexisting gene sets (did you import GMT files?)"
        )

    gscs = ribios_gsea.readGmt(ListVector(gscs_list))

    contrast_columns = dge_model.Contrast.contrast_columns

    # ID of the GSE method registered for this DGE method and algorithm name
    gse_method_id = session.query(GSEmethod.ID) \
        .filter(GSEmethod.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmethod.ID == dge_model.Method.ID) \
        .filter(GSEmethod.Name == algorithm.name).scalar()

    gse_algo_result = algorithm.exec_gse(dge_model, gscs)
    enrich_tbl = gse_algo_result.raw_gse_table

    # Lower-case the join keys on both sides so that matching ignores case;
    # both frames are modified in place, so the file written below holds the
    # lower-cased names as well
    gene_sets.Category = gene_sets.Category.str.lower()
    gene_sets.Name = gene_sets.Name.str.lower()
    enrich_tbl.Category = enrich_tbl.Category.str.lower()
    enrich_tbl.GeneSet = enrich_tbl.GeneSet.str.lower()
    # Attach the gene set row and the contrast column row to every entry.
    # Columns of the first join that clash with contrast column fields get
    # the suffix "_GENE_SET": "ID_GENE_SET" is the gene set ID, "ID" the
    # contrast column ID (see their use in gse_tbl below)
    merged_enrich_tbl = enrich_tbl.join(
        gene_sets.set_index(['Category', 'Name']),
        on=['Category', "GeneSet"]).join(contrast_columns.set_index("Name"),
                                         on="Contrast",
                                         lsuffix="_GENE_SET")

    # Result directory: <wd>/<GSE_RESULT_SUB_FOLDER>/<contrast id>_<algorithm>
    gse_method_sub_dir = "%d_%s" % (dge_model.Contrast.ID, algorithm.name)
    gse_models_path = os.path.join(roger_wd_dir, GSE_RESULT_SUB_FOLDER)
    gse_model_path = os.path.join(gse_models_path, gse_method_sub_dir)
    if not os.path.exists(gse_model_path):
        os.makedirs(gse_model_path)

    gse_result_file = os.path.join(gse_model_path, "gse_table.txt")
    write_df(enrich_tbl, gse_result_file)

    gse_result = GSEresult(ContrastID=dge_model.ContrastID,
                           DGEmethodID=dge_model.DGEmethodID,
                           GSEmethodID=gse_method_id,
                           OutputFile=gse_result_file,
                           MethodDescription=gse_algo_result.method_desc)
    session.add(gse_result)
    # Flush so that gse_result.ID is available for the table rows
    session.flush()

    # Direction is stored as +1 ("Up") / -1 ("Down"); the enrichment score is
    # the direction multiplied by |log10(PValue)|
    gse_tbl = DataFrame({
        "GSEresultID":
        gse_result.ID,
        "ContrastColumnID":
        merged_enrich_tbl.ID,
        "GeneSetID":
        merged_enrich_tbl.ID_GENE_SET,
        "Correlation":
        merged_enrich_tbl.Correlation,
        "Direction":
        merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }),
        "PValue":
        merged_enrich_tbl.PValue,
        "FDR":
        merged_enrich_tbl.FDR,
        "EnrichmentScore":
        merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }) * abs(log10(merged_enrich_tbl.PValue)),
        "EffGeneCount":
        merged_enrich_tbl.NGenes
    })
    # Entries without a gene set ID found no gene set in the join above;
    # they are reported and not stored
    unmapped = gse_tbl[gse_tbl.GeneSetID.isnull()]
    mapped = gse_tbl[~gse_tbl.GeneSetID.isnull()]
    if unmapped.shape[0] > 0:
        print("Warning: unable to map %d of %d entries to gene sets " %
              (unmapped.shape[0], merged_enrich_tbl.shape[0]))

    # Despite its name this frame holds the de-duplicated entries: one row
    # per (contrast column, gene set) pair
    mapped_duplications = mapped.drop_duplicates(
        subset=['ContrastColumnID', 'GeneSetID'])

    if mapped_duplications.shape[0] < mapped.shape[0]:
        print(
            "Warning: %d of %d entries of mapped result entries are duplicated"
            %
            (mapped.shape[0] - mapped_duplications.shape[0], mapped.shape[0]))

    insert_data_frame(session, mapped_duplications, GSEtable.__table__)
    session.commit()
Exemplo n.º 22
0
 def get_dataset(self, dataset_name):
     """Wrap the server dataset ``dataset_name`` in a ``RemoteBioMartDataSet``.

     Raises:
         ROGERUsageError: If a ``KeyError`` occurs while looking up or
             wrapping the dataset.
     """
     try:
         wrapped = RemoteBioMartDataSet(self.__server.datasets[dataset_name])
     except KeyError:
         message = "Dataset not found on Ensembl BioMart: %s" % dataset_name
         raise ROGERUsageError(message)
     return wrapped
Exemplo n.º 23
0
def parse_gct(file_path):
    """Read a GCT (version 1.2) expression file into a data frame.

    The three header lines (version, dimensions, column names) are validated
    before the table is loaded.  The returned frame is indexed by the values
    of the first column (converted to strings), no longer contains the
    description column, and holds one column per sample.

    Args:
        file_path: Path of the GCT file.

    Returns:
        pandas.DataFrame with one row per feature and one column per sample.

    Raises:
        ROGERUsageError: If the header is missing, incomplete or
            ill-formatted, if sample names or row names are duplicated, if
            the table dimensions differ from the declared ones, or if a
            sample column holds non-numeric values.
    """
    n_header_lines = 3
    with open(file_path) as gct_file:
        # readline() returns '' only at end of file, so a file that is too
        # short yields empty entries instead of an uncaught StopIteration
        raw_header = [gct_file.readline() for _ in range(n_header_lines)]

    if not all(raw_header):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing GCT header" % file_path)

    version_line, raw_dim_line, raw_col_line = (line.rstrip()
                                                for line in raw_header)
    dim_line = raw_dim_line.split("\t")
    col_line = raw_col_line.split("\t")

    if version_line != "#1.2":
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing GCT header" % file_path)

    # Number of genes + number of samples
    n_dim_elems = 2

    if len(dim_line) != n_dim_elems:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing dimension header in GCT header"
            % file_path)

    try:
        dims = [int(x) for x in dim_line]
    except ValueError as err:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': ill-formatted dimension header '%s'"
            % (file_path, raw_dim_line)) from err

    # Name col + Description col + at least one sample col
    n_minimum_cols = 3

    if len(col_line) < n_minimum_cols or col_line[0].lower(
    ) != "name" or col_line[1].lower() != "description":
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': ill-formatted column header '%s ...'"
            % (file_path, raw_col_line[0:100]))

    sample_names = col_line[2:]

    if len(sample_names) != len(set(sample_names)):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': duplicated sample names" %
            file_path)

    # Skip the version and dimension lines; the third line is the header
    df = pd.read_table(file_path, sep="\t", skiprows=2, index_col=0)
    # The first remaining column is the description column
    df = df.drop(columns=df.columns[0])
    df.index = df.index.astype(str)

    if dims[0] != df.shape[0]:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': Number of expected genes don't match (%d vs %d)"
            % (file_path, dims[0], df.shape[0]))

    if dims[1] != df.shape[1]:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': Number of expected samples don't match (%d vs %d)"
            % (file_path, dims[1], df.shape[1]))

    # Test for numeric dtypes rather than for the "object" dtype name, so
    # that text columns are rejected whatever dtype pandas assigns to them
    if not all(pd.api.types.is_numeric_dtype(col_type)
               for col_type in df.dtypes):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': counts / signal columns have non-numeric values"
            % file_path)

    gene_duplicates = df.index.duplicated()
    if gene_duplicates.any():
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': duplicated row names '%s ...'" %
            (file_path, df[gene_duplicates].index[0:2].tolist()))

    return df
Exemplo n.º 24
0
def add_species(session, dataset_name, tax_id):
    """Import the gene annotation of a species and its orthologs to human.

    Gene annotation is bulk-queried from the annotation service dataset
    ``dataset_name`` and stored as ``GeneAnnotation`` rows.  For the human
    species every gene is stored as its own ortholog; for any other species
    the ortholog pairs are queried from the annotation service and mapped to
    the stored gene indices.

    Args:
        session: Database session.
        dataset_name: Name of the annotation service dataset; names of the
            form ``<prefix>_gene_ensembl`` are rewritten to derive the
            homolog attribute and filter.
        tax_id: Taxon id under which the species is stored.

    Raises:
        ROGERUsageError: If a non-human species is added before the human
            annotation exists, or the species is already present.
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()
    # Check if dataset is already present in the database
    species_table = list_species(session)

    # Orthologs refer to human genes, so human must be imported first
    if species_table[species_table.TaxID ==
                     human_tax_id].empty and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    # The lookup is by tax_id, while the message reports dataset_name
    if not species_table[species_table.TaxID == tax_id].empty:
        raise ROGERUsageError('Species already exists in database: %s' %
                              dataset_name)

    # "<prefix>_gene_ensembl" -> "<prefix>_homolog_ensembl_gene" (attribute)
    # and "with_<prefix>_homolog" (filter)
    homolog_attr = re.sub(r'(\w+)_gene_ensembl', r'\1_homolog_ensembl_gene',
                          dataset_name)
    homolog_filter = re.sub(r'(\w+)_gene_ensembl', r'with_\1_homolog',
                            dataset_name)

    # Insert Gene annotation
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    # Version string: dataset name plus the first parenthesised part of the
    # display name
    # NOTE(review): re.search returns None if display_name contains no
    # parentheses, which would raise AttributeError here — confirm the format
    version = "%s %s" % (dataset_name,
                         re.search(r'[^(]+\(([^)]+)\)',
                                   dataset.display_name).group(1))

    gene_anno = dataset.get_bulk_query(
        params={
            'attributes': [
                "ensembl_gene_id", "entrezgene", "gene_biotype",
                "external_gene_name"
            ]
        })

    # New genes get consecutive indices starting at the next free one
    next_id = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)

    genes = DataFrame({
        'RogerGeneIndex':
        range(next_id, next_id + gene_anno.shape[0]),
        'Version':
        version,
        'TaxID':
        tax_id,
        'EnsemblGeneID':
        gene_anno["ensembl_gene_id"],
        'EntrezGeneID':
        gene_anno["entrezgene"],
        'GeneType':
        gene_anno["gene_biotype"],
        'GeneSymbol':
        gene_anno["external_gene_name"],
        'IsObsolete':
        False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)

    # Insert orthologs
    # Human genes are their own orthologs; nothing else to query
    if tax_id == human_tax_id:
        orthologs = DataFrame({
            'RogerGeneIndex': genes["RogerGeneIndex"],
            'HumanRogerGeneIndex': genes["RogerGeneIndex"]
        })
        insert_data_frame(session, orthologs, Ortholog.__table__)
        session.commit()
        return

    huma_anno_query = as_data_frame(
        session.query(GeneAnnotation).filter(
            GeneAnnotation.TaxID == human_tax_id))
    # NOTE(review): human_dataset is not defined in this function —
    # presumably a module-level name; verify
    ortho = annotation_service.get_bulk_query(
        human_dataset,
        params={
            'attributes': ["ensembl_gene_id", homolog_attr],
            'filters': {
                homolog_filter: True
            }
        })
    # Attach the stored human genes via the human Ensembl id and the new
    # genes via the homolog Ensembl id; overlapping columns are suffixed
    # "Human" and "Other" respectively
    merged_ortho = ortho.join(huma_anno_query.set_index('EnsemblGeneID'), on='ensembl_gene_id') \
        .join(genes.set_index('EnsemblGeneID'), on=homolog_attr, lsuffix='Human', rsuffix='Other')

    orthologs = DataFrame({
        'RogerGeneIndex':
        merged_ortho["RogerGeneIndexOther"],
        'HumanRogerGeneIndex':
        merged_ortho["RogerGeneIndexHuman"]
    })
    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
Exemplo n.º 25
0
Arquivo: dge.py Projeto: bedapub/roger
def get_ds(session, name) -> DataSet:
    """Fetch the data set called ``name``.

    Raises:
        ROGERUsageError: If no such data set exists.
    """
    found = session.query(DataSet).filter(DataSet.Name == name).one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError("Data set with name '%s' does not exist" % name)
Exemplo n.º 26
0
Arquivo: dge.py Projeto: bedapub/roger
def run_dge(session, roger_wd_dir, contrast, design, dataset,
            algorithm: DGEAlgorithm):
    """Run a differential gene expression analysis and persist its results.

    The input and fit objects of the analysis are saved with ``saveRDS``
    below ``<roger_wd_dir>/<DGE_RESULT_SUB_FOLDER>/<contrast id>_<method id>``.
    The model, the feature subset and the DGE table are stored in the
    database.

    Args:
        session: Database session.
        roger_wd_dir: Working directory below which result files are written.
        contrast: Name of the contrast.
        design: Name of the design.
        dataset: Name of the data set.
        algorithm: DGE algorithm to execute.

    Raises:
        ROGERUsageError: If a model for this contrast and algorithm already
            exists.
    """
    # Refuse to compute the same model twice
    model = query_dge_models(session, contrast, design, dataset,
                             algorithm.name, DGEmodel).one_or_none()
    if model is not None:
        raise ROGERUsageError(
            "A model for %s:%s:%s has already been generated by the method '%s'"
            % (dataset, design, contrast, algorithm.name))

    print("Retrieving data from database")
    contrast_data = get_contrast(session, contrast, design, dataset)
    design_data = contrast_data.Design
    ds_data = design_data.DataSet

    feature_data = ds_data.feature_data

    print("Performing differential gene expression analysis using %s" %
          algorithm.name)
    contrast_matrix = contrast_data.contrast_matrix
    dge_result = algorithm.exec_dge(ds_data.ExprsWC, feature_data, design_data,
                                    contrast_matrix)

    print("Persisting model information")
    # .one() raises if the method is not registered exactly once
    method = session.query(DGEmethod).filter(
        DGEmethod.Name == algorithm.name).one()

    # Result directory: <wd>/<DGE_RESULT_SUB_FOLDER>/<contrast id>_<method id>
    dge_method_sub_dir = "%d_%d" % (contrast_data.ID, method.ID)
    dge_models_path = os.path.join(roger_wd_dir, DGE_RESULT_SUB_FOLDER)
    dge_model_path = os.path.join(dge_models_path, dge_method_sub_dir)
    if not os.path.exists(dge_model_path):
        os.makedirs(dge_model_path)

    # The file names are fixed to "limma_..." whatever algorithm is used
    input_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_input_obj.rds"))
    base.saveRDS(dge_result.input_obj, file=input_obj_file)

    # NOTE(review): unlike the input object this file name has no ".rds"
    # extension — confirm whether that is intended
    fit_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_fit_obj"))
    base.saveRDS(dge_result.fit_obj, file=fit_obj_file)

    dge_model = DGEmodel(ContrastID=contrast_data.ID,
                         DGEmethodID=method.ID,
                         InputObjFile=input_obj_file,
                         FitObjFile=fit_obj_file,
                         MethodDescription=dge_result.method_description)

    session.add(dge_model)
    session.flush()

    print("Persisting feature subsets")
    # One row per feature, flagged with whether the algorithm used it
    feature_subset = pd.DataFrame({
        "FeatureIndex":
        feature_data["FeatureIndex"],
        "DataSetID":
        ds_data.ID,
        "ContrastID":
        contrast_data.ID,
        "DGEmethodID":
        method.ID,
        "IsUsed":
        dge_result.used_feature_list,
        "Description":
        "Default filtering by '%s'" % algorithm.name
    })
    insert_data_frame(session, feature_subset, FeatureSubset.__table__)

    # Attach the contrast column row (clashing columns suffixed "_C") and
    # the feature row (clashing columns suffixed "_F") to every result row
    dge_tbl = dge_result.dge_table \
        .join(contrast_data.contrast_columns.set_index("Name"), on="Contrast", rsuffix="_C") \
        .join(feature_data.set_index("FeatureIndex"), on="FeatureIndex", rsuffix="_F")

    # Rename the result columns to the columns of DGEtable
    dgetable = pd.DataFrame({
        'ContrastColumnID': dge_tbl["ID"],
        'FeatureIndex': dge_tbl["FeatureIndex"],
        "ContrastID": contrast_data.ID,
        "DGEmethodID": method.ID,
        'DataSetID': dge_tbl["DataSetID"],
        'AveExprs': dge_tbl["AveExpr"],
        'Statistic': dge_tbl["t"],
        'LogFC': dge_tbl["logFC"],
        'PValue': dge_tbl["PValue"],
        'FDR': dge_tbl["FDR"]
    })
    insert_data_frame(session, dgetable, DGEtable.__table__)
    session.commit()