예제 #1
0
def subsystem_statistics():
    """Print the member count of every subsystem category and the grand total."""
    categories = DataReader().read_subsystem_categories()
    counts = {name: len(members) for name, members in categories.items()}
    for name, count in counts.items():
        print(name, count)
    print('total:', sum(counts.values()))
예제 #2
0
def hmdb_disease_analysis_on_server():
    """Log in to the remote Metabolitics API and print the analysis of every
    HMDB disease profile."""
    client = MetaboliticsApiClient()
    client.login('email', 'password')

    for disease_name, measurements in DataReader().read_hmdb_diseases().items():
        result = client.analyze(disease_name, measurements)
        print(result)
    def create_for(cls, dataset_name="recon2"):
        """Return a new instance whose description is the model named by
        *dataset_name* ('example', 'example2', or a network model name)."""
        reader = DataReader()
        if dataset_name == 'example':
            model = reader.create_example_model()
        elif dataset_name == 'example2':
            model = reader.create_example2_model()
        else:
            model = reader.read_network_model(dataset_name)
        return cls(description=model)
예제 #4
0
def naming_issue():
    """Report how metabolite names match between the BC/HCC datasets and the
    recon naming service."""

    def _clean(names):
        # Normalize case and surrounding whitespace before comparing.
        return {n.lower().strip() for n in names}

    human_names = set(NamingService('recon')._names.keys())

    reader = DataReader()
    bc_names = _clean(reader.read_columns('BC'))
    hcc_names = _clean(reader.read_columns('HCC'))

    report_matching(hcc_names, bc_names, 'hcc', 'bc')

    print('-' * 10, 'human', '-' * 10)
    report_matching(hcc_names, human_names, 'hcc', '')
    report_matching(bc_names, human_names, 'bc', '')
예제 #5
0
def pathifier(disease_name):
    """Score pathway deregulation for *disease_name* with the R `pathifier`
    package and write the results to a CSV.

    Steps: read data, standardize, build R vectors of fold changes and
    metabolite ids, collect metabolites per (non-transport, non-exchange)
    subsystem, run pathifier, and save one score column per pathway plus the
    sample stage labels.
    """
    model = DataReader().read_network_model()
    X, y = DataReader().read_data(disease_name)
    pre = DynamicPreprocessing(['metabolic-standard'])

    X = pre.fit_transform(X, y)
    # BUG fix: removed leftover `import pdb; pdb.set_trace()` debugging break.

    df = pd.DataFrame(X)
    # BUG fix: DataFrame.as_matrix() was removed in pandas 1.0; `.values`
    # returns the same ndarray on all supported versions.
    metabolite_fold_changes = robj.r.matrix(robj.FloatVector(
        df.values.T.ravel().tolist()),
                                            nrow=df.shape[1])
    all_metabolite_ids = robj.StrVector(list(df))

    # Map each metabolic subsystem to its measured metabolites, skipping
    # transport/exchange pseudo-pathways.
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        if r.subsystem and not (r.subsystem.startswith('Transport')
                                or r.subsystem.startswith('Exchange')):
            subsystem_metabolite[r.subsystem] \
                .update(m.id for m in r.metabolites if m.id in df)

    # Keep only subsystems that actually contain measured metabolites.
    pathway_names, pathway_metabolites = zip(
        *filter(lambda x: x[1], subsystem_metabolite.items()))

    pathway_metabolites = robj.r['list'](
        *map(lambda x: robj.StrVector(list(x)), pathway_metabolites))

    pathway_names = robj.StrVector(list(pathway_names))
    is_healthy = robj.BoolVector(list(map(lambda x: x == 'h', y)))

    pathifier = importr("pathifier")

    result = pathifier.quantify_pathways_deregulation(metabolite_fold_changes,
                                                      all_metabolite_ids,
                                                      pathway_metabolites,
                                                      pathway_names,
                                                      is_healthy,
                                                      attempts=100,
                                                      min_exp=0,
                                                      min_std=0)

    # Unpack the R result's 'scores' slot into plain Python lists.
    regScores = dict()
    for pathway, scores in dict(result.items())['scores'].items():
        regScores[pathway] = list(scores[:])

    df = pd.DataFrame(regScores)
    df.insert(0, 'stage', y)
    df.to_csv('../dataset/disease/%s_regulization.csv' % disease_name,
              index=False)
예제 #6
0
def elimination_tabular():
    """Cross-validate metabolite- vs reaction-level classifiers while
    eliminating an increasing number of features.

    For k = 1, 11, 21, ... it loads the precomputed reaction dataset for that
    k; when the file for the next k is missing (or unreadable) it prints the
    accumulated score table and stops.
    """
    (X, y) = DataReader().read_data('BC')

    datasets = {'metabolite': DataReader().read_data('BC')}
    scores = list()

    for i in range(1, len(X[0].keys()) + 1, 10):

        vect = DictVectorizer(sparse=False)
        selector = SelectNotKBest(k=i)

        clfs = dict()

        clfs['metabolite'] = Pipeline([
            # pipe for compare model with eliminating some features
            ('metabolic',
             DynamicPreprocessing(['naming', 'metabolic-standard'])),
            ('vect', vect),
            ('selector', selector),
            ('pca', PCA()),
            ('clf', LogisticRegression(C=0.01, random_state=43))
        ])

        path = '../dataset/solutions/bc_disease_analysis#k=%d.json' % i
        try:
            # BUG fix: was a bare `except:` over an unclosed open(); now the
            # file is closed deterministically and only the expected failure
            # modes (missing file, bad JSON, empty payload) end the loop.
            with open(path) as f:
                solutions = [json.loads(line) for line in f]
            datasets['reaction'] = list(zip(*solutions[0]))
        except (OSError, ValueError, IndexError):
            # json.JSONDecodeError is a ValueError subclass.
            print(pd.DataFrame(scores))
            return

        clfs['reaction'] = FVADiseaseClassifier()

        kf = StratifiedKFold(n_splits=10, random_state=43)

        score = {
            name: np.mean(
                cross_val_score(clf,
                                datasets[name][0],
                                datasets[name][1],
                                cv=kf,
                                n_jobs=-1,
                                scoring='f1_micro'))
            for name, clf in clfs.items()
        }
        score['iteration'] = i
        scores.append(score)

    print(pd.DataFrame(scores))
예제 #7
0
def hmdb_disease_normalization():
    """Map HMDB disease fold changes to clamped scores and save them as JSON.

    A '<disease> <category>' entry is kept only when at least 10 measurements
    survive name normalization; each value is folded to [-100, 100] and
    rounded to 3 decimals.
    """
    dataset = DataReader().read_hmdb_diseases()
    naming = NamingService('hmdb')
    normalized = dict()

    for disease, categories in dataset.items():
        for category, measurements in categories.items():
            named = naming.to(dict(measurements))
            if len(named) < 10:
                continue
            scores = {}
            for metabolite, value in named.items():
                if value >= 1:
                    score = min(value - 1, 100)
                else:
                    score = max(1 - value**-1, -100)
                scores[metabolite] = round(score, 3)
            normalized['%s %s' % (disease, category)] = scores
    DataWriter('normalization_hmdb').write_json(normalized)
예제 #8
0
    def setUp(self):
        """Fit a TrendPredictor on a train/test split of sampled businesses."""
        self.predictor = TrendPredictor()

        split = Business.train_test_set(DataReader().sample_businesses())
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.predictor.fit(self.X_train, self.y_train)
예제 #9
0
    def setUp(self):
        """Fit a solution-level disease classifier on a small train split."""
        self.clf = SolutionLevelDiseaseClassifier()
        X, y = DataReader().read_small_data()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=0)

        self.clf.fit(self.X_train, self.y_train)
예제 #10
0
def generate_angular_friendly_model():
    '''
    Convert the recon2 JSON network model into an Angular-friendly layout:
    metabolites and reactions keyed by id, plus a pathway -> reaction-id index.
    Writes the result to ../outputs/ng-recon.json.
    '''
    model = DataReader().read_network_model()
    # BUG fix: file handles were opened without being closed; `with` closes
    # (and flushes the output) deterministically.
    with open('../dataset/network/recon2.json') as f:
        model_json = json.load(f)

    reactions, metabolites = model_json['reactions'], model_json['metabolites']
    model_json = defaultdict(dict)
    model_json['pathways'] = defaultdict(list)

    for m in metabolites:
        # Attach the ids of all reactions this metabolite participates in.
        m['reactions'] = [
            r.id for r in model.metabolites.get_by_id(m['id']).reactions
        ]
        model_json['metabolites'][m['id']] = m

    for r in reactions:
        # Drop fields the front end does not need.
        del r['gene_reaction_rule']
        del r['notes']

        model_json['reactions'][r['id']] = r
        model_json['pathways'][r.get('subsystem', 'NOpathway')].append(r['id'])

    with open('../outputs/ng-recon.json', 'w') as f:
        json.dump(model_json, f)
예제 #11
0
    def setUp(self):
        """Fit a dummy disease classifier on a train split of the solutions."""
        self.clf = DummyDiseaseClassifier()
        X, y = DataReader().read_solutions()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=0)

        self.clf.fit(self.X_train, self.y_train)
예제 #12
0
    def setUp(self):
        """Pick a single healthy ('h') BC sample and prepare the FVA transformer."""
        X, y = DataReader().read_data('BC')
        X = NamingService('recon').to(X)

        sample, label = next((xi, yi) for xi, yi in zip(X, y) if yi == 'h')
        self.X, self.y = [sample], [label]
        self.fva = FVARangedMeasurement()
예제 #13
0
 def setUp(self):
     """Vectorize and scale the full dataset, then build the FVAScaler under test."""
     X, y = DataReader().read_all()
     named = NamingService('recon').to(X)
     self.vect = DictVectorizer(sparse=False)
     scaled = MetabolicStandardScaler().fit_transform(
         self.vect.fit_transform(named, y), y)
     self.measured_metabolites = scaled[0]
     self.scaler = FVAScaler(self.vect)
예제 #14
0
    def setUp(self):
        """Fit a metabolite-level disease classifier on a train split."""
        self.clf = MetaboliteLevelDiseaseClassifier()
        X, y = DataReader().read_all()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=0)

        self.clf.fit(self.X_train, self.y_train)
예제 #15
0
def fva_range_analysis_save():
    """Run ranged FVA on the HCC measurements and dump one labeled solution
    per line to ../outputs/fva_solutions.txt."""
    X, y = DataReader().read_data('HCC')
    X = NamingService('recon').to(X)
    X = FVARangedMeasurement().fit_transform(X, y)
    with open('../outputs/fva_solutions.txt', 'w') as f:
        f.writelines('%s %s\n' % (label, solution)
                     for solution, label in zip(X, y))
예제 #16
0
def hmdb_disease_analysis():
    """Run the FVA preprocessing over every HMDB disease profile and save the
    results keyed by disease name."""
    # BUG fix: removed unused local `naming = NamingService('recon')`;
    # nothing in this function referenced it.

    # Disease names become labels (y); measurement dicts become samples (X).
    y, X = list(zip(*DataReader().read_hmdb_diseases().items()))

    dyn_pre = DynamicPreprocessing(['fva'])

    X_t = dyn_pre.fit_transform(X, y)
    DataWriter('hmdb_disease_analysis').write_json(dict(zip(y, X_t)))
예제 #17
0
def most_correlated_reactions(top_num_reaction):
    """Print the reactions whose FVA bounds correlate most with the labels,
    ranked by ANOVA F score."""
    X, y = DataReader().read_fva_solutions()
    vect = DictVectorizer(sparse=False)
    X = vect.fit_transform(X)
    vt = VarianceThreshold(0.1)
    X = vt.fit_transform(X)
    F, pval = f_classif(X, y)

    feature_names = np.array(vect.feature_names_)[vt.get_support()]
    ranked = sorted(zip(feature_names, F),
                    key=lambda pair: pair[1],
                    reverse=True)
    model = DataReader().read_network_model()
    for name, f_score in ranked[:int(top_num_reaction)]:
        print('name:', name[:-4])
        print('reaction:', model.reactions.get_by_id(name[:-4]).reaction)
        print('min-max:', name[-3:])
        print('F:', f_score)
        print('-' * 10)
예제 #18
0
def most_correlated_pathway(top_num_pathway, num_of_reactions):
    """Print the pathways most correlated with the labels of the
    transport-free FVA solutions.

    The pipeline vectorizes the solutions, drops low-variance features, keeps
    the `num_of_reactions` best reaction features (ANOVA F), scores pathways
    from the surviving reactions, and finally re-vectorizes to pathway
    features; the top `top_num_pathway` pathways by F score are printed along
    with a measured-metabolite coverage figure per pathway.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')

    # NOTE(review): `[obj] * 3` repeats the SAME DictVectorizer instance three
    # times, not three independent ones. The pipeline still works because each
    # InverseDictVectorizer consumes the feature names fitted in the step just
    # before it, i.e. before the shared object is refit — confirm this aliasing
    # is intentional; `[DictVectorizer(sparse=False) for _ in range(3)]` would
    # give independent vectorizers.
    vect = [DictVectorizer(sparse=False)] * 3
    vt = VarianceThreshold(0.1)
    skb = SelectKBest(k=int(num_of_reactions))
    X = Pipeline([('vect1', vect[0]), ('vt', vt),
                  ('inv_vec1', InverseDictVectorizer(vect[0], vt)),
                  ('vect2', vect[1]), ('skb', skb),
                  ('inv_vec2', InverseDictVectorizer(vect[1], skb)),
                  ('pathway_scoring', PathwayFvaScaler()),
                  ('vect3', vect[2])]).fit_transform(X, y)

    (F, pval) = f_classif(X, y)

    # Rank pathway features by F statistic, keeping p-values for the report.
    top_n = sorted(zip(vect[2].feature_names_, F, pval),
                   key=lambda x: x[1],
                   reverse=True)[:int(top_num_pathway)]

    model = DataReader().read_network_model()
    X, y = DataReader().read_data('BC')
    bc = NamingService('recon').to(X)

    # subsystem -> all metabolite ids appearing in its reactions.
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        subsystem_metabolite[r.subsystem].update(m.id for m in r.metabolites)

    # Count, per subsystem, how many of its metabolites are measured in each
    # BC sample (summed over samples).
    subsystem_counts = defaultdict(float)
    for sample in bc:
        for s, v in subsystem_metabolite.items():
            subsystem_counts[s] += len(v.intersection(sample.keys()))

    # NOTE(review): this averages each count over the number of *subsystems*,
    # not the number of samples — verify that is the intended normalization.
    subsystem_counts = {
        i: v / len(subsystem_counts)
        for i, v in subsystem_counts.items()
    }

    # Feature names carry a '_min'/'_max' style suffix: n[:-4] is the pathway
    # name, n[-3:] the direction tag (presumably — confirm with vectorizer).
    for n, v, p in top_n:
        print('name:', n[:-4])
        print('min-max:', n[-3:])
        print('metabolites:%s' % subsystem_counts[n[:-4]])
        print('F:', v)
        print('p:', p)
        print('-' * 10)
예제 #19
0
def healties_model():
    """Fit the FVA + flux-diff pipeline on healthy BC samples and pickle it."""
    X, y = DataReader().read_healthy('BC')

    fold_change = DynamicPreprocessing(['naming', 'basic-fold-change-scaler'])
    X = fold_change.fit_transform(list(X), y)

    model = DynamicPreprocessing(['fva', 'flux-diff'])
    model.fit(X, y)

    with open('../outputs/api_model.p', 'wb') as f:
        pickle.dump(model, f)
예제 #20
0
def hmdb_disease_analysis_pathway_level():
    """Aggregate saved HMDB disease solutions to pathway-level scores and
    write them keyed by disease name."""
    X, y = DataReader().read_solution('hmdb_disease_analysis')

    with open('../models/api_model.p', 'rb') as f:
        reaction_scaler = pickle.load(f)

    flux_diff = reaction_scaler._model.named_steps['flux-diff']
    pathway_pre = DynamicPreprocessing(
        ['pathway-scoring', 'transport-elimination'])

    X_t = pathway_pre.fit_transform(flux_diff.transform(X), y)
    DataWriter('hmdb_disease_analysis_pathway_level').write_json(
        dict(zip(y, X_t)))
def solution_config_generator():
    """Generate solver configurations for the glycan subsystem category and
    report how many were produced and how long it took."""
    model = BaseFVA.create_for()

    categories = DataReader().read_subsystem_categories()

    start = datetime.datetime.now()

    configurations = []
    for category, subsystems in categories.items():
        # Only the glycan category is processed; the loop stops after it.
        if category.startswith('glycan'):
            print(category, len(subsystems))
            print(subsystems)
            generate_category_config(model, subsystems, configurations)
            break
    # BUG fix: removed `print(total, feasible)` — neither name is defined in
    # this scope, so the original raised NameError here.
    delta = datetime.datetime.now() - start
    print('the number of valid configurations:', len(configurations))
    print(delta)
예제 #22
0
def subsystem_naming():
    """Report subsystem-name mismatches between the category file and the
    network model."""
    reader = DataReader()
    # Union of all category members (fixes the `categoires` typo locally).
    category_subsystems = reduce(set.union,
                                 reader.read_subsystem_categories().values())

    model_subsystems = reader.read_network_model().subsystems()

    print('Diff of cate from model',
          category_subsystems.difference(model_subsystems))
    print('Diff of model from cate',
          model_subsystems.difference(category_subsystems))
예제 #23
0
def solution_for_dataset():
    """Compute solver solutions for each sample and append them as JSON lines,
    resuming after however many lines the output file already holds."""
    (X, y) = DataReader().read_all()

    vect = DictVectorizer(sparse=False)
    X = vect.fit_transform(X, y)
    X = MetabolicChangeScaler().fit_transform(X, y)
    X = MetabolicSolutionScaler(vect).to_ecolin(X)

    solution_service = SolutionService()
    file_path = '../output/solution_for_dataset.json'
    # BUG fix: both file handles were opened without being closed; `with`
    # closes them (and flushes the appended lines) deterministically.
    with open(file_path) as f:
        # One JSON line per already-processed sample: resume from there.
        calculated_samples = sum(1 for line in f)

    with open(file_path, 'a') as f:
        for x in X[calculated_samples:]:
            solution = solution_service.get_solution(x)
            f.write('%s\n' % json.dumps(solution))
예제 #24
0
def healty_for_heatmap(num_of_reactions):
    """Write a plotly-style heatmap JSON of pathway scores for healthy samples.

    NOTE(review): `num_of_reactions` is accepted but never used in the body —
    kept for interface compatibility; confirm whether a SelectKBest step was
    meant to consume it.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')
    X = Pipeline([
        ('flux-diff-scaler', ReactionDiffScaler()),
        ('pathway_scoring', PathwayFvaScaler()),
    ]).fit_transform(X, y)

    # Keep only healthy ('h') samples.
    df = pd.DataFrame(ix for ix, iy in zip(X, y) if iy == 'h')

    hjson = {
        # Column names carry a 4-char suffix that is stripped for display.
        'x': [i[:-4] for i in df],
        'z': df.values.tolist(),
        'type': 'heatmap'
    }

    # BUG fix: the output handle was opened without being closed; `with`
    # guarantees the JSON is flushed to disk.
    with open('../outputs/healties_heatmap.json', 'w') as f:
        json.dump(hjson, f)
예제 #25
0
def svr_trend_prediction():
    """Train a TrendPredictor on the business data, persist it, and log its
    test-set metrics to the trend-prediction log file."""
    logger = logging.getLogger('trend-prediction')
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler('../logs/trend_prediction.log'))

    predictor = TrendPredictor()

    split = Business.train_test_set(DataReader().businesses())
    X_train, X_test, y_train, y_test = split

    predictor.fit(X_train, y_train)
    predictor.save()

    logger.info(predictor)
    logger.info('mean squared error: %s ' %
                predictor.mean_squared_error(X_test, y_test))
    logger.info('r2 score: %s ' % predictor.r2_score(X_test, y_test))
예제 #26
0
def eliminate_best_k():
    """For k = 1, 11, 21, ... eliminate the k best metabolite features, run
    the FVA preprocessing on what remains, and save each result set."""
    X, y = DataReader().read_data('BC')

    num_features = len(X[0].keys())
    for k in range(1, num_features + 1, 10):
        vect = DictVectorizer(sparse=False)
        selector = SelectNotKBest(k=k)

        pipe = Pipeline([
            # pipe for compare model with eliminating some features
            ('metabolic',
             DynamicPreprocessing(['naming', 'metabolic-standard'])),
            ('vect', vect),
            ('selector', selector),
            ('inv_vect', InverseDictVectorizer(vect, selector)),
            ('fva', DynamicPreprocessing(['fva'])),
        ])

        X_result = pipe.fit_transform(X, y)

        DataWriter('bc_disease_analysis#k=%s' % k) \
            .write_json_dataset(X_result, y)
예제 #27
0
def fva_range_with_basic_analysis_save():
    """Round the BC measurements, fit the naming/fold-change/FVA preprocessing,
    and stream the transformed solutions to a JSON file."""
    X, y = DataReader().read_data('BC')

    # Removed a block of commented-out pdb/pprint debugging code.

    # Round every measurement to 3 decimals. NOTE: this mutates the sample
    # dicts in place.
    for x in X:
        for k, v in x.items():
            x[k] = round(v, 3)

    preproc = DynamicPreprocessing(
        ['naming', 'basic-fold-change-scaler', 'fva']).fit(X, y)

    print('model trained...')

    DataWriter('fva_solution_with_basic_fold_change') \
        .write_json_stream(preproc.transform, X)
예제 #28
0
 def setUpData(self):
     """Return the (X, y) BC regularization dataset used by this test case."""
     return DataReader().read_data('BC_regulization')
 def __init__(self, dataset_name="recon2"):
     """Initialize with the network model named by *dataset_name*."""
     super().__init__()
     # BUG fix: read_network_model was called on the DataReader class, which
     # would pass dataset_name as `self` if it is an instance method; every
     # other call site in this file uses an instance (DataReader()...), and
     # calling through an instance is also safe for a static/class method.
     self.model = DataReader().read_network_model(dataset_name)
예제 #30
0
 def __init__(self):
     """Initialize with the default network model."""
     super().__init__()
     self.model = DataReader().read_network_model()