예제 #1
0
def hmdb_disease_normalization():
    """Normalize HMDB disease fold-change measurements and dump as JSON.

    For every (disease, category) pair with at least 10 name-mapped
    measurements, each value v is squashed into [-100, 100]:
    v >= 1 maps to min(v - 1, 100); otherwise to max(1 - 1/v, -100),
    rounded to 3 decimals.
    """
    naming = NamingService('hmdb')
    normalized = {}

    def squash(value):
        # Symmetric fold-change score: amounts above 1 become positive,
        # below 1 become negative, both clipped at +/-100.
        if value >= 1:
            scaled = min(value - 1, 100)
        else:
            scaled = max(1 - value**-1, -100)
        return round(scaled, 3)

    for disease, categories in DataReader().read_hmdb_diseases().items():
        for category, measurements in categories.items():
            mapped = naming.to(dict(measurements))
            # Skip sparsely measured pairs.
            if len(mapped) < 10:
                continue
            key = '%s %s' % (disease, category)
            normalized[key] = {name: squash(val)
                               for name, val in mapped.items()}

    DataWriter('normalization_hmdb').write_json(normalized)
예제 #2
0
    def setUp(self):
        """Prepare a single healthy ('h') BC sample for the FVA tests."""
        samples, labels = DataReader().read_data('BC')
        samples = NamingService('recon').to(samples)

        # First sample labelled healthy; StopIteration if none exists.
        sample, label = next((s, l)
                             for s, l in zip(samples, labels) if l == 'h')
        self.X, self.y = [sample], [label]
        self.fva = FVARangedMeasurement()
예제 #3
0
def fva_range_analysis_save():
    """Run FVA-ranged measurement on the HCC data set and persist the
    per-sample solutions to ../outputs/fva_solutions.txt.

    Each output line has the form '<label> <transformed sample>'.
    """
    (X, y) = DataReader().read_data('HCC')
    X = NamingService('recon').to(X)
    X = FVARangedMeasurement().fit_transform(X, y)
    with open('../outputs/fva_solutions.txt', 'w') as f:
        for x, label in zip(X, y):
            f.write('%s %s\n' % (label, x))
예제 #4
0
 def setUp(self):
     """Vectorize and scale the full data set; keep the first scaled
     sample and an FVAScaler sharing the fitted vectorizer."""
     data, labels = DataReader().read_all()
     data = NamingService('recon').to(data)
     self.vect = DictVectorizer(sparse=False)
     vectorized = self.vect.fit_transform(data, labels)
     scaled = MetabolicStandardScaler().fit_transform(vectorized, labels)
     self.measured_metabolites = scaled[0]
     self.scaler = FVAScaler(self.vect)
예제 #5
0
def hmdb_disease_analysis():
    """Run the 'fva' dynamic-preprocessing step on every HMDB disease
    measurement set and write the results keyed by disease name.

    Note: a previously created ``NamingService('recon')`` local was
    unused and has been removed.
    """
    y, X = list(zip(*DataReader().read_hmdb_diseases().items()))

    dyn_pre = DynamicPreprocessing(['fva'])

    X_t = dyn_pre.fit_transform(X, y)
    DataWriter('hmdb_disease_analysis').write_json(dict(zip(y, X_t)))
def naming_issue():
    """Report the overlap between data-set column names and the
    'ecolin' / 'human' naming-service vocabularies."""

    def normalized(columns):
        # Case-fold and trim so comparisons are purely by name.
        return set(c.lower().strip() for c in columns)

    ecolin_names = set(NamingService('ecolin')._names.keys())
    human_names = set(NamingService('human')._names.keys())

    reader = DataReader()
    bc_names = normalized(reader.read_columns('BC'))
    hcc_names = normalized(reader.read_columns('HCC'))

    report_matching(hcc_names, bc_names, 'hcc', 'bc')

    print('-' * 10, 'ecolin', '-' * 10)
    report_matching(hcc_names, ecolin_names, 'hcc', '')
    report_matching(bc_names, ecolin_names, 'bc', '')

    print('-' * 10, 'human', '-' * 10)
    report_matching(hcc_names, human_names, 'hcc', '')
    report_matching(bc_names, human_names, 'bc', '')
예제 #7
0
class NameMatching(TransformerMixin):
    """Stateless transformer that translates metabolite names into the
    'recon' naming scheme via NamingService."""

    def __init__(self):
        super().__init__()
        self.naming = NamingService('recon')

    def fit(self, X, y=None):
        # Nothing to learn; present for scikit-learn API compatibility.
        return self

    def transform(self, X, y=None):
        """Return X with every name mapped by the naming service."""
        return self.naming.to(X)
class MetabolicSolutionScaler(TransformerMixin):
    """Scaler for converting change level data to pathway level.

    Measurements are first translated to 'ecolin' names, then mapped to
    solutions by SolutionService.
    """

    def __init__(self, vectorizer: DictVectorizer):
        self.vectorizer = vectorizer
        self.solution_service = SolutionService()
        self.naming = NamingService('ecolin')

    def fit(self, X, y):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """Translate X to 'ecolin' naming and return the matching
        solutions.  ``y`` is accepted for API symmetry and ignored
        (previously a mutable default ``y=[]`` — fixed to ``None``).
        """
        ecolin_X = self.naming.to(X)
        solutions = self.solution_service.get_solutions(ecolin_X)
        return solutions

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X, y)
예제 #9
0
def most_correlated_pathway(top_num_pathway, num_of_reactions):
    """Print the ``top_num_pathway`` pathways whose FVA flux features are
    most associated with the class labels.

    Pipeline: vectorize FVA solutions -> drop low-variance reactions ->
    keep the ``num_of_reactions`` best reactions (ANOVA) -> aggregate to
    pathway scores -> re-vectorize; pathways are then ranked by their
    F statistic.  For each top pathway the measured-metabolite coverage
    of its subsystem on the BC data set is also printed.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')

    # Three *distinct* vectorizers: the previous '[DictVectorizer(...)] * 3'
    # aliased one shared instance across all pipeline stages and only
    # worked by accident of step ordering.
    vect = [DictVectorizer(sparse=False) for _ in range(3)]
    vt = VarianceThreshold(0.1)
    skb = SelectKBest(k=int(num_of_reactions))
    X = Pipeline([('vect1', vect[0]), ('vt', vt),
                  ('inv_vec1', InverseDictVectorizer(vect[0], vt)),
                  ('vect2', vect[1]), ('skb', skb),
                  ('inv_vec2', InverseDictVectorizer(vect[1], skb)),
                  ('pathway_scoring', PathwayFvaScaler()),
                  ('vect3', vect[2])]).fit_transform(X, y)

    (F, pval) = f_classif(X, y)

    # Highest F statistic first.
    top_n = sorted(zip(vect[2].feature_names_, F, pval),
                   key=lambda x: x[1],
                   reverse=True)[:int(top_num_pathway)]

    model = DataReader().read_network_model()
    X, y = DataReader().read_data('BC')
    bc = NamingService('recon').to(X)

    # subsystem -> ids of all metabolites taking part in its reactions
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        subsystem_metabolite[r.subsystem].update(m.id for m in r.metabolites)

    # subsystem -> measured-metabolite hits summed over all BC samples
    subsystem_counts = defaultdict(float)
    for sample in bc:
        for s, v in subsystem_metabolite.items():
            subsystem_counts[s] += len(v.intersection(sample.keys()))

    # NOTE(review): this normalizes by the number of subsystems, not the
    # number of samples — confirm that is the intended average.
    subsystem_counts = {
        i: v / len(subsystem_counts)
        for i, v in subsystem_counts.items()
    }

    # Feature names carry a '_min'/'_max' suffix; strip it for lookups.
    for n, v, p in top_n:
        print('name:', n[:-4])
        print('min-max:', n[-3:])
        print('metabolites:%s' % subsystem_counts[n[:-4]])
        print('F:', v)
        print('p:', p)
        print('-' * 10)
예제 #10
0
def constraint_logging():
    """Fit FVARangedMeasurement on one diseased ('bc') and one healthy
    ('h') BC sample (side effect: constraints get logged)."""
    samples, labels = DataReader().read_data('BC')
    samples = NamingService('recon').to(samples)

    def first_with_label(target):
        # IndexError if no sample carries the target label
        # (same behavior as the original list-indexing form).
        return [(s, l) for s, l in zip(samples, labels) if l == target][0]

    X_h, y_h = first_with_label('h')
    X_bc, y_bc = first_with_label('bc')
    FVARangedMeasurement().fit_transform([X_bc, X_h], [y_bc, y_h])
 def __init__(self, vectorizer: DictVectorizer):
     """Keep the supplied vectorizer and set up the solution-lookup
     service plus the 'ecolin' naming service."""
     self.vectorizer = vectorizer
     self.solution_service = SolutionService()
     self.naming = NamingService('ecolin')
예제 #12
0
 def __init__(self):
     """Initialize the mixin base class and the 'recon' naming service."""
     super().__init__()
     self.naming = NamingService('recon')
예제 #13
0
def flux_diff_analysis():
    """Compare healthy ('h') vs diseased ('bc') FVA flux ranges for a set
    of solution files and write a per-file report to ../outputs/.

    For every reaction, the L1 distance between the healthy and diseased
    (min, max) flux intervals is computed and normalized by the combined
    interval length.  Distances are aggregated per subsystem, and
    per-category "activeness" (subsystems with any non-zero flux per
    sample) is tabulated.

    Fix over the previous version: the bare ``except:`` guarding the
    subsystem lookup is narrowed to ``except KeyError`` so it no longer
    swallows unrelated exceptions.
    """
    files = ['fva_solutions.enriched_measurements_in_obj.wconst.txt',
             'fva_solutions.enriched_measurements_in_obj.woconst.txt',
             'fva_solutions.enriched_measurements_in_obj.wconst.useV.txt',
             'fva_solutionsfva_solutions.enriched_measurements_in_obj.wconst.lb1.txt']

    model = DataReader().read_network_model()
    categories = DataReader().read_subsystem_categories()

    (X, y) = DataReader().read_data('BC')
    X = NamingService('recon').to(X)

    # subsystem -> category name; subsystem -> |fold change| of measured
    # metabolites that participate in the subsystem's reactions.
    subsys_categories = {}
    subsys_measurement_mapping = {}
    max_category_len = 0
    for category, subsystems in categories.items():
        if len(category) > max_category_len:
            max_category_len = len(category)

        for subsys in subsystems:
            subsys_categories[subsys] = category
            subsys_measurement_mapping[subsys] = []

    category_active_counts = {}

    # Only the first sample feeds the measurement mapping (note 'break').
    for measurement_dict in X:
        for mid, fold_change in measurement_dict.items():
            metabolite = model.metabolites.get_by_id(mid)
            met_subsystems = {}
            for r in metabolite.reactions:
                subsys = r.subsystem
                if subsys in met_subsystems:
                    continue

                subsys_measurement_mapping[subsys].append(abs(fold_change))
                met_subsystems[subsys] = None
        break

    for file in files:
        fluxes, class_labels = DataReader().read_fva_solutions(file)

        # reaction id -> class label -> {'min': flux, 'max': flux}
        flux_dict = {}
        ix = 0

        max_reaction_length = 0
        while ix < len(fluxes):
            class_label = class_labels[ix]
            subsys_has_active_reaction = {}

            for reaction, flux in fluxes[ix].items():
                # Flux keys look like '<rxn>_min' / '<rxn>_max'.
                rxn, qualifier = reaction[:-4], reaction[-3:]
                flux_dict.setdefault(rxn, {})
                flux_dict[rxn].setdefault(class_label, {})
                flux_dict[rxn][class_label][qualifier] = flux

                subsys = model.reactions.get_by_id(rxn).subsystem
                subsys_has_active_reaction.setdefault(subsys, False)
                if abs(flux) > 0:
                    subsys_has_active_reaction[subsys] = True

                if len(rxn) > max_reaction_length:
                    max_reaction_length = len(rxn)

            for category in categories:
                active = 0
                for subsys in categories[category]:
                    try:
                        if subsys_has_active_reaction[subsys]:
                            active += 1
                    except KeyError:
                        # Subsystem absent from this sample's fluxes.
                        continue
                category_active_counts.setdefault(category, [])
                category_active_counts[category].append(active)

            ix += 1

        healthy = 'h'
        diseased = 'bc'
        distances = []
        subsystem_dist_dict = {}
        max_subsys_length = 0

        for reaction, flux_vals in flux_dict.items():
            healthy_flux = (flux_vals[healthy]['min'], flux_vals[healthy]['max'])
            diseased_flux = (flux_vals[diseased]['min'], flux_vals[diseased]['max'])

            # Total span covered by both intervals together.
            interval_length = max(healthy_flux[1], diseased_flux[1]) - min(healthy_flux[0], diseased_flux[0])

            # L1 distance between the (min, max) endpoint pairs.
            dist = abs(healthy_flux[0] - diseased_flux[0]) + abs(healthy_flux[1] - diseased_flux[1])

            if healthy_flux == diseased_flux:
                dist = 0
            else:
                dist = dist/interval_length

            distances.append((dist, reaction))

            subsys = model.reactions.get_by_id(reaction).subsystem
            subsystem_dist_dict.setdefault(subsys, [])
            subsystem_dist_dict[subsys].append(dist)

            if len(subsys) > max_subsys_length:
                max_subsys_length = len(subsys)

        distances.sort(reverse=True)
        rmean = round(sum([dist for dist, reaction in distances])/len(distances), 4)
        # NOTE(review): '//2 + 1' picks one past the usual middle element;
        # confirm the intended median convention before changing it.
        rmedian = distances[(len(distances)//2) + 1][0]

        subsystem_distances = [(sum(distances)/len(distances), subsys) for subsys, distances in subsystem_dist_dict.items()]
        subsystem_distances.sort(reverse=True)
        smean = round(sum([dist for dist, subsys in subsystem_distances])/len(subsystem_distances), 4)
        smedian = subsystem_distances[(len(subsystem_distances) // 2) + 1][0]

        with open('../outputs/diff_%s' % file, 'w') as f:
            f.write('Reaction Level: mean: %s, median: %s, min: %s, max: %s\n' % (str(rmean), str(rmedian),
                                                                      str(distances[len(distances)-1]), str(distances[0])))
            f.write('Subsystem Level: mean: %s, median: %s, min: %s, max: %s\n\n\n' % (str(smean), str(smedian),
                                                                                      str(subsystem_distances[len(subsystem_distances) - 1]),
                                                                                      str(subsystem_distances[0])))

            f.write("Category Activeness (Actual, Avg, Min, Max):\n" + '-' * (max_category_len + 20) + "\n")
            for category, active_counts in category_active_counts.items():
                f.write(('{:>' + str(max_category_len) + '}\t{}\t{:.2f}\t{}\t{}\n').format(category,
                                                                                   len(categories[category]),
                                                                           round(sum(active_counts)/len(active_counts), 2),
                                                                           min(active_counts),
                                                                           max(active_counts)))
            # Categories that never appeared get an all-zero row.
            for category in categories:
                if category in category_active_counts:
                    continue

                f.write(('{:>' + str(max_category_len) + '}\t{}\t{}\t{}\t{}\n').format(category,
                                                                                   len(categories[category]),
                                                                                   0, 0, 0))

            f.write("\n\nSubsystems:\n" + '-'*(max_subsys_length + 25)+"\n")
            for dist, subsys in subsystem_distances:
                f.write(('{:>' + str(max_subsys_length) + '}\t{:.2f}\t{}\t{:.2f}\n').format(subsys, round(dist, 2),len(subsys_measurement_mapping.get(subsys, [])),
                                                                                round(sum(subsys_measurement_mapping.get(subsys, []))/
                                                                                        max(len(subsys_measurement_mapping.get(subsys, [])), 1), 2)))

            f.write("\n\nReactions:\n" + '-'*(max_reaction_length + 5)+"\n")
            for dist, reaction in distances:
                f.write(('{:>' + str(max_reaction_length) + '}\t{:.2f}\n').format(reaction, round(dist, 2)))