def test():
    persister = Persister(PERSISTER_PATH)
    self.assertTrue(persister.isExist())
    calculator = persister.get()
    self.assertTrue(isinstance(
        calculator.df_ria, pd.DataFrame))
    self.assertTrue(os.path.isfile(OUT_PATH))
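
Across these examples, Persister acts as a simple file-backed cache exposing isExist(), get(), set(), remove(), and a path attribute. A minimal sketch of an equivalent class, assuming plain pickle serialization (the real common_python implementation may differ):

import os
import pickle


class SimplePersister:
    """Illustrative stand-in for common_python.util.persister.Persister."""

    def __init__(self, path):
        self.path = path

    def isExist(self):
        # True if a serialized object is present at self.path
        return os.path.isfile(self.path)

    def set(self, obj):
        # Serialize the object to the persister file
        with open(self.path, "wb") as fd:
            pickle.dump(obj, fd)

    def get(self):
        # Recover the previously serialized object
        with open(self.path, "rb") as fd:
            return pickle.load(fd)

    def remove(self):
        # Delete the persister file if it exists
        if self.isExist():
            os.remove(self.path)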
Example No. 2
 def __init__(self, persister_path=None):
     """
 Parameters
 ----------
 persister_path: str
     path to persister file
 """
     if persister_path is None:
         persister_path = os.path.join(cn.DATA_DIR, DATA_FILE)
     self.persister = Persister(persister_path)
     self.namespace_dct = {}  # Items that go in the caller's namespace
Example No. 3
class TestSharedData(unittest.TestCase):
    def deleteFiles(self):
        if os.path.isfile(PERSISTER_PATH):
            os.remove(PERSISTER_PATH)

    def setUp(self):
        self.deleteFiles()
        self.persister = Persister(PERSISTER_PATH)

    def tearDown(self):
        self.deleteFiles()

    def testConstructor(self):
        if IGNORE_TEST:
            return

        def test():
            data = shared_data.SharedData(persister=self.persister)
            self.assertTrue(isinstance(data.provider, DataProvider))
            self.assertTrue(isinstance(data.df_X, pd.DataFrame))
            self.assertTrue(isinstance(data.ser_y, pd.Series))
            self.assertTrue(isinstance(data.states, np.ndarray))
            self.assertEqual(len(data.states), len(data.collection_dct.keys()))

        # Test without persister
        test()
        self.assertTrue(self.persister.isExist())
        # Test with persister
        test()
Example No. 4
 def testRun(self):
     if IGNORE_TEST:
         return
     main.run(PERSISTER_PATH,
              True,
              max_iter=1,
              is_report=False,
              mcfo_kwargs=MCFO_KWARGS)
     persister = Persister(PERSISTER_PATH)
     self.assertTrue(persister.isExist())
     optimizer = persister.get()
     self.assertTrue(isinstance(optimizer.fit_result_dct, dict))
     #
     main.run(PERSISTER_PATH,
              False,
              max_iter=1,
              is_report=False,
              mcfo_kwargs=MCFO_KWARGS)
     optimizer2 = persister.get()
     for cls in optimizer.fit_result_dct.keys():
         self.assertTrue(
             len(optimizer.fit_result_dct[cls]) == len(
                 optimizer2.fit_result_dct[cls]))
Example No. 5
 def do(self, data_dir=cn.DATA_DIR):
     """
 Assigns values to the instance data.
 """
     persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
     if persister.isExist():
         provider = persister.get()
         self._setValues(provider=provider)
     else:
         # Gene categorizations
         self.df_ec_terms =  \
             self._makeDFFromCSV(FILENAME_EC_TERMS, is_index_geneid=True)
         self.df_ko_terms =  \
             self._makeDFFromCSV(FILENAME_KO_TERMS, is_index_geneid=True)
         self.df_kegg_pathways =  \
             self._makeDFFromCSV(FILENAME_KEGG_PATHWAYS,
             is_index_geneid=False)
         self.df_kegg_gene_pathways =  \
             self._makeDFFromCSV(FILENAME_KEGG_GENE_PATHWAY,
             is_index_geneid=True)
         # GO Terms
         self.df_go_terms = self._makeGoTerms()
         # Gene expression for state
         self.df_gene_expression_state = self._makeDFFromCSV(
             FILENAME_GENE_EXPRESSION_STATE, is_index_geneid=True)
         # Gene description
         self.df_gene_description = self._makeGeneDescriptionDF()
         # Stages matrix
         self.df_stage_matrix = self._makeStageMatrixDF()
         # Normalized data values
         self.df_normalized = self._makeNormalizedDF()
         # Raw readcounts
         self.dfs_read_count = self._makeReadCountDFS()
         # Hypoxia data
         self.df_hypoxia = self._makeHypoxiaDF()
         # Create mean and std dataframes
         self.df_mean = self._makeMeanDF()
         self.df_std = self._makeStdDF()
         self.df_cv = 100 * self.df_std / self.df_mean
         persister.set(self)
Example No. 6

def getPersister(path=None):
  if path is None:
    path = _makePath(filename=PERSISTER_FILE)
  return Persister(path)
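
Callers then use the persister as a compute-once cache, as Examples 5 and 15 do internally. A minimal sketch, where buildModel is a hypothetical stand-in for whatever expensive computation is being cached:

persister = getPersister()
if persister.isExist():
    model = persister.get()    # reuse the cached result
else:
    model = buildModel()       # hypothetical expensive computation
    persister.set(model)       # cache it for the next run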
Example No. 7
 def tearDown(self):
     persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
     persister.remove()
Example No. 8
class ClassificationData():
    # Data preparation constants

    def __init__(self, persister_path=None):
        """
    Parameters
    ----------
    persister_path: str
        path to persister file
    """
        if persister_path is None:
            persister_path = os.path.join(cn.DATA_DIR, DATA_FILE)
        self.persister = Persister(persister_path)
        self.namespace_dct = {}  # Items that go in the caller's namespace

    def initialize(self):
        """
    Initializes the data. Defines and initializes all names added to globals().
    """
        #
        T0 = "T0"
        POOLED = "pooled"
        self._addName("T0", "T0")
        self._addName("POOLED", "pooled")
        self._addName("REF_TYPE_POOLED", REF_TYPE_POOLED)
        self._addName("REF_TYPE_BIOREACTOR", REF_TYPE_BIOREACTOR)
        self._addName("REF_TYPE_SELF", REF_TYPE_SELF)
        # Provider
        PROVIDER = DataProvider()
        self._addName("PROVIDER", PROVIDER)
        PROVIDER.do()
        TRINARY = TrinaryData()
        self._addName("TRINARY", TRINARY)
        # Gene Classes
        ALL_GENES = list(TRINARY.df_X.columns)
        self._addName("ALL_GENES", ALL_GENES)
        # Gene groupings. Added later so can include top12 from classifier
        MYCOBACTIN_GENES = [
            "Rv2377c",
            "Rv2378c",
            "Rv2379c",
            "Rv2380c",
            "Rv2381c",
            "Rv2382c",
            "Rv2383c",
            "Rv2384",
            "Rv2385",
            "Rv2386c",
        ]
        self._addName("MYCOBACTIN_GENES", MYCOBACTIN_GENES)
        BACTERIOFERRITIN_GENES = [
            "Rv2341",
            "Rv3841",
        ]
        self._addName("BACTERIOFERRITIN_GENES", BACTERIOFERRITIN_GENES)
        MYCOBACTIN_BACTERIOFERRIN_GENES = list(MYCOBACTIN_GENES)
        self._addName("MYCOBACTIN_BACTERIOFERRIN_GENES",
                      MYCOBACTIN_BACTERIOFERRIN_GENES)
        MYCOBACTIN_BACTERIOFERRIN_GENES.extend(BACTERIOFERRITIN_GENES)
        MYCOBACTIN_BACTERIOFERRITIN = "mycobactin_bacterioferritin"
        BACTERIOFERRITIN = "bacterioferritin"
        MYCOBACTIN = "mycobactin"
        ALL = "all"
        GENE_DCT = {
            MYCOBACTIN: MYCOBACTIN_GENES,
            BACTERIOFERRITIN: BACTERIOFERRITIN_GENES,
            MYCOBACTIN_BACTERIOFERRITIN: MYCOBACTIN_BACTERIOFERRIN_GENES,
            ALL: ALL_GENES,
        }
        # Define the stage names
        STAGE_NAMES = list(cn.STATE_NAMES)
        self._addName("STAGE_NAMES", STAGE_NAMES)
        STAGE_NAMES.remove("Normoxia")
        STAGE_NAMES = np.array(STAGE_NAMES)
        # Bioreactor data calculated with two different references
        DATA_DCT = {
            T0:
            TrinaryData(is_regulator=False, is_dropT1=True, is_averaged=True),
            POOLED:
            TrinaryData(is_regulator=False,
                        is_dropT1=True,
                        is_averaged=True,
                        calcRef=PROVIDER.calcRefPooled)
        }
        self._addName("DATA_DCT", DATA_DCT)
        SER_Y_DCT = {k: t.ser_y for k, t in DATA_DCT.items()}
        self._addName("SER_Y_DCT", SER_Y_DCT)
        # Feature vectors are specific to the gene subsets
        DF_X_DCT = {k: t.df_X.copy() for k, t in DATA_DCT.items()}
        DF_X_DCT = {k: df[MYCOBACTIN_GENES] for k, df in DF_X_DCT.items()}
        self._addName("DF_X_DCT", DF_X_DCT)
        # Sample data
        SAMPLE_DCT = {
            r: sample_data.getSampleData(ref_type=r, is_regulator=False)
            for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
        }
        self._addName("SAMPLE_DCT", SAMPLE_DCT)
        SAMPLE_AVG_DCT = {
            r: sample_data.getSampleData(ref_type=r,
                                         is_regulator=False,
                                         is_average=True)
            for r in [REF_TYPE_BIOREACTOR, REF_TYPE_SELF, REF_TYPE_POOLED]
        }
        self._addName("SAMPLE_AVG_DCT", SAMPLE_AVG_DCT)
        # Classifiers
        num_feature = len(MYCOBACTIN_BACTERIOFERRIN_GENES)
        CLASSIFIER_BASE = classifier_ensemble.ClassifierEnsemble(
            classifier_ensemble.ClassifierDescriptorSVM(),
            filter_high_rank=num_feature,
            size=NUM_CLASSIFIER_IN_ENSEMBLE)
        self._addName("CLASSIFIER_BASE", CLASSIFIER_BASE)
        CLASSIFIER_DCT = {}
        self._addName("CLASSIFIER_DCT", CLASSIFIER_DCT)
        for trinary_key, trinary in DATA_DCT.items():
            for gene_key, gene_list in GENE_DCT.items():
                classifier = copy.deepcopy(CLASSIFIER_BASE)
                # Not all genes may be present in TrinaryData since they may be correlated or unvarying.
                df_X = dataframe.subset(trinary.df_X, gene_list, axis=1)
                classifier.fit(df_X, trinary.ser_y, class_names=STAGE_NAMES)
                CLASSIFIER_DCT[(trinary_key, gene_key)] = classifier
        # Calculate the rest of the gene groups and add them
        TOP12_T0 = "top12_T0"
        TOP12_POOLED = "top12_pooled"
        TOP12_T0_GENES = list(CLASSIFIER_DCT[(T0, ALL)].columns)
        TOP12_POOLED_GENES = list(CLASSIFIER_DCT[(POOLED, ALL)].columns)
        GENE_DCT[TOP12_T0] = TOP12_T0_GENES
        GENE_DCT[TOP12_POOLED] = TOP12_POOLED_GENES
        GENE_GROUPS = list(GENE_DCT.keys())
        self._addName("GENE_GROUPS", GENE_GROUPS)
        for name in GENE_GROUPS:
            self._addName(name.upper(), name)  # Add the name of each group
        self._addName("GENE_DCT", GENE_DCT)
        # Construct derivative structures
        self._addName("DF_X", DF_X_DCT[T0])
        self._addName("SER_Y", SER_Y_DCT[T0])
        self._addName("SAMPLE_DATA_DCT", SAMPLE_DCT[REF_TYPE_BIOREACTOR])
        self._addName("CLASSIFIER", CLASSIFIER_DCT[('T0', 'mycobactin')])
        key = (T0, "mycobactin_bacterioferritin")
        self._addName("GENES", CLASSIFIER_DCT[key].features)
        # Accuracy calculations for classifiers
        DF_ACCURACY = self.calcAccuracy()
        self._addName("DF_ACCURACY", DF_ACCURACY)

    def _addName(self, name, value):
        """
    Adds the name and value to the namespace.
 
    Parameters
    ----------
    name: str
    value: object
    """
        stmt = "self.namespace_dct['%s'] = value" % name
        exec(stmt)
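        # Note: for ordinary identifier-style names, the exec above is
        # equivalent to the direct assignment
        #     self.namespace_dct[name] = value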

    def serialize(self):
        """
    Writes the current contents of self.namespace_dct to the persister.
    """
        self.persister.set(self.namespace_dct)

    def deserialize(self):
        """
    Recovers previously serialized data, initializing self.namespace_dct.
    """
        if not self.persister.isExist():
            raise ValueError(
                "Persister file %s does not exist. Use serialize first." %
                self.persister.path)
        self.namespace_dct = self.persister.get()
        return self.namespace_dct

    def setNamespace(self, globals_dct):
        """
    Sets the globals provided based on the initialized namespace.

    Parameters
    ----------
    globals_dct: dict
    """
        for name, value in self.namespace_dct.items():
            globals_dct[name] = value

    def get(self, globals_dct):
        """
    Deserializes an existing persister file and initializes the namespace.

    Parameters
    ----------
    globals_dct: dict
    """
        self.deserialize()
        self.setNamespace(globals_dct)

    def calcAccuracy(self,
                     num_features=NUM_FEATURES,
                     num_clf=100,
                     is_debug=False):
        """
    Calculates the accuracy of classifiers using 10 iterations of 
    cross validation with one holdout per state (stage).

    Parameters
    ----------
    num_features: list-int
    num_clf: int
        number of classifiers in the ensemble
    is_debug: bool
        If True, creates dummy (random) accuracy values instead of cross validating
    
    Returns
    -------
    DataFrame:
        COL_REF: how reference is calculated for gene expressions
        COL_GENE_GROUP: grouping of genes used in classifier
        COL_NUM_FEATURE: number of features in classifiers
        COL_MEAN_ACCURACY: mean accuracy of the classifiers
        COL_STD_ACCURACY: standard deviation of accuracy
    """
        classifier_dct = self.namespace_dct["CLASSIFIER_DCT"]
        data_dct = self.namespace_dct["DATA_DCT"]
        gene_dct = self.namespace_dct["GENE_DCT"]
        line_dct = {r: l for r, l in zip(data_dct.keys(), ["-", "--"])}
        accuracy_dct = {c: [] for c in DF_ACCURACY_COLUMNS}
        for (ref, group), clf in classifier_dct.items():
            num_features = list(range(1, 13))
            num_features.insert(0, 1)
            trinary = copy.deepcopy(data_dct[ref])
            trinary.df_X = dataframe.subset(trinary.df_X, gene_dct[group])
            for num_feature in num_features:
                if is_debug:
                    # Create a dummy value
                    mean_accuracy = np.random.rand()
                else:
                    mean_accuracy = clf.crossValidate(
                        trinary,
                        num_iter=10,
                        num_holdout=1,
                        filter_high_rank=num_feature,
                        size=num_clf)
                accuracy_dct[COL_REF].append(ref)
                accuracy_dct[COL_GENE_GROUP].append(group)
                accuracy_dct[COL_NUM_FEATURE].append(num_feature)
                accuracy_dct[COL_MEAN_ACCURACY].append(mean_accuracy)
                std_accuracy = np.sqrt(mean_accuracy * (1 - mean_accuracy) /
                                       num_clf)
                accuracy_dct[COL_STD_ACCURACY].append(std_accuracy)
        df_accuracy = pd.DataFrame(accuracy_dct)
        return df_accuracy
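
Putting the class together, the typical workflow (mirrored in the test module in Example 11) is to initialize and serialize once, then load the cached namespace into a script or notebook. A minimal sketch, assuming the default persister path is writable:

data = ClassificationData()      # defaults to a persister file under cn.DATA_DIR
if not data.persister.isExist():
    data.initialize()            # builds gene groups, classifiers, accuracy table
    data.serialize()             # caches the namespace in the persister
data.get(globals())              # deserializes and injects names such as CLASSIFIER_DCT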
Example No. 9

def getPersister(path=PERSISTER_PATH):
  return Persister(path)
Example No. 10
def run(state,
        out_dir_pat=OUT_PATH_DIR_PAT,
        num_cross_iter=NUM_CROSS_ITER,
        is_status=False,
        report_interval=REPORT_INTERVAL,
        is_report=True,
        columns=None,
        is_restart=IS_RESTART,
        **kwargs):
    """
  Runs feature selection.
  :param int state: State being analyzed
  :param list-str columns: columns of df_X to use
  :param bool is_status: report status extracted from the persister
  :param dict kwargs: optional arguments for TrinaryData (passed via _getData)
  """
    def calcLen(obj, func):
        if obj is None:
            return 0
        else:
            return func(obj)

    #
    CUR_LEN = "cur_len"
    MAX_LEN = "max_len"
    persister_path = PERSISTER_PATH_PAT % state
    df_X, ser_y = _getData(state, columns, **kwargs)
    if is_status:
        func = lambda d: len(d["score"])
        #
        persister = Persister(persister_path)
        analyzer = persister.get()
        pair_length = MAX_FEATURES_FOR_PAIRING*(MAX_FEATURES_FOR_PAIRING-1)/2  \
            + MAX_FEATURES_FOR_PAIRING
        dct = {
            "sfa": {
                CUR_LEN: calcLen(analyzer._sfa_dct, lambda d: len(d.keys())),
                MAX_LEN: len(df_X.columns)
            },
            "cpc": {
                CUR_LEN: calcLen(analyzer._cpc_dct, func),
                MAX_LEN: pair_length
            },
            "ipa": {
                CUR_LEN: calcLen(analyzer._ipa_dct, func),
                MAX_LEN: pair_length
            },
        }
        report_stg = "State %s: " % str(state)
        for metric in dct.keys():
            cur_length = 0
            if dct[metric][MAX_LEN] is not None:
                cur_length = dct[metric][CUR_LEN]
            frac = min(1.0, cur_length / dct[metric][MAX_LEN])
            report_stg = ("%s %s/%2.3f" % (report_stg, metric, frac))
        if is_report:
            print(report_stg)
    else:
        analyzer = feature_analyzer.FeatureAnalyzer(
            CLF,
            df_X,
            ser_y,
            max_features_for_pairing=MAX_FEATURES_FOR_PAIRING,
            persister_path=persister_path,
            num_cross_iter=num_cross_iter,
            report_interval=report_interval)
        out_dir = out_dir_pat % state
        _ = analyzer.serialize(out_dir, is_restart=is_restart)
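
For reference, pair_length above counts all unordered feature pairs plus the single features, i.e. n*(n-1)/2 + n for n = MAX_FEATURES_FOR_PAIRING. A status-only invocation that just prints these completion fractions for a state, assuming its persister file already exists, might look like:

# Report progress for state 1 without starting a new analysis
run(1, is_status=True)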
Example No. 11
from common_python.util import dataframe
from common_python.util.persister import Persister
from tools import make_classification_data

import os
import shutil
import unittest

IGNORE_TEST = False
IS_PLOT = False
IS_CHANGED = False  # Set to True if the ClassificationData namespace has changed
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_FILE_PATH = os.path.join(TEST_DIR, make_classification_data.DATA_FILE)
DATA_FILE_PATH_TEST = os.path.join(
    TEST_DIR, "make_classification_data_save.pcl")
PERSISTER = Persister(DATA_FILE_PATH_TEST)
FILES = [DATA_FILE_PATH]
data = make_classification_data.ClassificationData(
    persister_path=DATA_FILE_PATH_TEST)
# Creates a file with the desired initializations, if necessary.
# Note: if the state variables have changed, you must set
# IS_CHANGED to True.
if not PERSISTER.isExist() or IS_CHANGED:
  data.initialize()
  data.serialize()


class TestClassificationData(unittest.TestCase):

  def deleteFiles(self):
    for ffile in FILES:
Example No. 12
from common.data_provider import DataProvider
import common_python.classifier.feature_analyzer as fa
from common_python.classifier  \
    import feature_set_collection as fsc
from common_python.classifier.feature_set import FeatureSet
from common import trinary_data
from common_python.util.persister import Persister

import argparse
import numpy as np
import os
import pandas as pd

DIR = os.path.dirname(os.path.abspath(__file__))
PERSISTER_PATH = os.path.join(DIR, "persister_shared_data.pcl")
PERSISTER = Persister(PERSISTER_PATH)
DIRECTORY = "feature_analyzer_averaged"


class SharedData(object):
    def __init__(self, persister=PERSISTER):
        if persister.isExist():
            shared_data = persister.get()
            for key in shared_data.__dict__.keys():
                self.__setattr__(key, shared_data.__getattribute__(key))
        else:
            self.provider = DataProvider()
            self.trinary = trinary_data.TrinaryData(is_averaged=False,
                                                    is_dropT1=False,
                                                    is_regulator=True)
            self.df_X = self.trinary.df_X
Example No. 13
 def setUp(self):
     self.deleteFiles()
     self.persister = Persister(PERSISTER_PATH)
Example No. 14
import common.constants as cn
from common import sample_data
from common_python.util.persister import Persister

import numpy as np
import os
import pandas as pd
import unittest


IGNORE_TEST = False
IS_PLOT = False

TEST_DIR = os.path.dirname(os.path.abspath(__file__))
PERSISTER_PATH = os.path.join(TEST_DIR, "test_sample_data_persister.pcl")
PERSISTER = Persister(PERSISTER_PATH)
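# Reuse a previously persisted SAMPLE_DATA when available; a persisted None
# (from a failed initialization) is rebuilt on the next run.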
got_sample = False
if PERSISTER.isExist():
  SAMPLE_DATA = PERSISTER.get()
  if SAMPLE_DATA is None:
    got_sample = False
  else:
    got_sample = True
if not got_sample:
  try:
    SAMPLE_DATA = sample_data.getSampleData()
    SAMPLE_DATA.initialize()
  except:
    SAMPLE_DATA = None
    print("***Proceeding without SAMPLE_DATA")
  PERSISTER.set(SAMPLE_DATA)
Example No. 15
 def do(self, data_dir=cn.DATA_DIR):
     """
 Assigns values to the instance data.
 """
     # Determine whether we can initialize from existing data
     persister = Persister(cn.DATA_PROVIDER_PERSISTER_PATH)
     is_initialized = False
     if persister.isExist():
         if not self.is_reinitialize:
             provider = persister.get()
             # See if there's a change in the calculation of reference values
             if self.calcRef == provider.calcRef:
                 is_initialized = True
                 self._setValues(provider=provider)
                 if not "is_reinitialize" in dir(self):
                     self.is_reinitialize = False
     if not is_initialized:
         # Do the initializations
         # Gene categorizations
         self.df_ec_terms =  \
             self._makeDFFromCSV(FILENAME_EC_TERMS,
             is_index_geneid=True)
         self.df_ko_terms =  \
             self._makeDFFromCSV(FILENAME_KO_TERMS,
             is_index_geneid=True)
         self.df_kegg_pathways =  \
             self._makeDFFromCSV(FILENAME_KEGG_PATHWAYS,
             is_index_geneid=False)
         self.df_kegg_gene_pathways =  \
             self._makeDFFromCSV(FILENAME_KEGG_GENE_PATHWAY,
             is_index_geneid=True)
         # Transcription Regulation Network
         self.df_trn_unsigned = self._makeDFFromCSV(FILENAME_TRN_UNSIGNED)
         self.df_trn_unsigned.columns = TRN_COLUMNS
         self.df_trn_signed = self._makeDFFromCSV(FILENAME_TRN_SIGNED)
         self.df_trn_signed.columns = TRN_COLUMNS
         # GO Terms
         self.df_go_terms = self._makeGoTerms()
         # Gene expression for state
         self.df_gene_expression_state = self._makeDFFromCSV(
             FILENAME_GENE_EXPRESSION_STATE, is_index_geneid=True)
         # Gene description
         self.df_gene_description = self._makeGeneDescriptionDF()
         # Stages matrix
         self.df_stage_matrix = self._makeStageMatrixDF()
         # Normalized data values
         self.df_normalized = self._makeNormalizedDF()
         # Raw readcounts
         self.dfs_read_count = self._makeReadCountDFS()
         # Hypoxia data
         self.df_hypoxia = self._makeHypoxiaDF()
         # Create mean and std dataframes
         self.df_mean = self._makeMeanDF()
         self.df_std = self._makeStdDF()
         self.df_cv = 100 * self.df_std / self.df_mean
         # Transcription factors
         self.tfs = self.df_trn_unsigned[cn.TF].unique()
         self.tfs = list(
             set(self.tfs).intersection(
                 self.dfs_adjusted_read_count[0].index))
         persister.set(self)
Example No. 16
from common import sample_data
from common import trinary_data
from common.data_provider import DataProvider
from common_python.testing import helpers
from common_python.util.persister import Persister

import numpy as np
import os
import pandas as pd
import unittest

IGNORE_TEST = False
IS_PLOT = False
NUM_REPL = 3
DIR = os.path.dirname(os.path.abspath(__file__))
TEST_SAMPLE_PATH = os.path.join(DIR, "test_trinary_data_sample.csv")
PERSISTER_PATH = os.path.join(DIR, "test_trinary_data_persister.pcl")
PERSISTER = Persister(PERSISTER_PATH)
GENES = ["Rv1927", "Rv3083"]
if PERSISTER.isExist():
    PROVIDER, SAMPLE_DATA = PERSISTER.get()
else:
    SAMPLE_DATA = sample_data.getSampleData()
    PROVIDER = DataProvider(is_reinitialize=True)
    PROVIDER.do()
    PERSISTER.set((PROVIDER, SAMPLE_DATA))


################### FUNCTIONS ############
def isConsistentState(ser_y):
    # Check consistency of states
    times = ser_y.index
    if len(times) == 0: