def hmdb_disease_normalization(): dataset = DataReader().read_hmdb_diseases() naming = NamingService('hmdb') nor_data = dict() for dis, categories in dataset.items(): for cat, measurements in categories.items(): named_measurements = naming.to(dict(measurements)) if len(named_measurements) >= 10: nor_data['%s %s' % (dis, cat)] = { k: round( min(v - 1, 100) if v >= 1 else max(1 - v**-1, -100), 3) for k, v in named_measurements.items() } DataWriter('normalization_hmdb').write_json(nor_data)
def setUp(self):
    """Fixture: one healthy ('h') BC sample plus an FVARangedMeasurement."""
    features, labels = DataReader().read_data('BC')
    features = NamingService('recon').to(features)
    # Take the first sample whose label is 'h'.
    healthy = next(filter(lambda pair: pair[1] == 'h', zip(features, labels)))
    self.X, self.y = [healthy[0]], [healthy[1]]
    self.fva = FVARangedMeasurement()
def fva_range_analysis_save():
    """Run FVA-ranged measurement on the HCC dataset and dump the results.

    Each output line is '<label> <solution>' in ../outputs/fva_solutions.txt.
    """
    data, labels = DataReader().read_data('HCC')
    data = NamingService('recon').to(data)
    data = FVARangedMeasurement().fit_transform(data, labels)
    with open('../outputs/fva_solutions.txt', 'w') as out:
        for label, solution in zip(labels, data):
            out.write('%s %s\n' % (label, solution))
def setUp(self):
    """Fixture: vectorized + scaled measurements and an FVAScaler under test."""
    raw_X, labels = DataReader().read_all()
    named_X = NamingService('recon').to(raw_X)
    self.vect = DictVectorizer(sparse=False)
    vectorized = self.vect.fit_transform(named_X, labels)
    scaled = MetabolicStandardScaler().fit_transform(vectorized, labels)
    # Keep the first sample's row as the reference measurement vector.
    self.measured_metabolites = scaled[0]
    self.scaler = FVAScaler(self.vect)
def hmdb_disease_analysis():
    """Run FVA dynamic preprocessing over HMDB diseases and save as JSON.

    Reads the HMDB disease mapping, transforms each disease's measurements
    through DynamicPreprocessing(['fva']), and writes {disease: result} via
    DataWriter('hmdb_disease_analysis').
    """
    # NOTE: removed unused local `naming = NamingService('recon')` — it was
    # constructed but never referenced anywhere in this function.
    y, X = list(zip(*DataReader().read_hmdb_diseases().items()))
    dyn_pre = DynamicPreprocessing(['fva'])
    X_t = dyn_pre.fit_transform(X, y)
    DataWriter('hmdb_disease_analysis').write_json(dict(zip(y, X_t)))
def naming_issue():
    """Report name overlap between BC/HCC dataset columns and naming services."""
    reader = DataReader()
    ecolin_names = set(NamingService('ecolin')._names.keys())
    human_names = set(NamingService('human')._names.keys())
    # Normalize dataset column names for comparison.
    bc_names = {name.lower().strip() for name in reader.read_columns('BC')}
    hcc_names = {name.lower().strip() for name in reader.read_columns('HCC')}
    report_matching(hcc_names, bc_names, 'hcc', 'bc')
    print('-' * 10, 'ecolin', '-' * 10)
    report_matching(hcc_names, ecolin_names, 'hcc', '')
    report_matching(bc_names, ecolin_names, 'bc', '')
    print('-' * 10, 'human', '-' * 10)
    report_matching(hcc_names, human_names, 'hcc', '')
    report_matching(bc_names, human_names, 'bc', '')
class NameMatching(TransformerMixin):
    """Sklearn-style transformer that maps measurement names to recon ids."""

    def __init__(self):
        super().__init__()
        self.naming = NamingService('recon')

    def fit(self, X, y=None):
        """No-op: the naming service is stateless with respect to the data."""
        return self

    def transform(self, X, y=None):
        """Translate X through the recon naming service; y is ignored."""
        return self.naming.to(X)
class MetabolicSolutionScaler(TransformerMixin):
    """Scaler for converting change level data to pathway level"""

    def __init__(self, vectorizer: DictVectorizer):
        self.vectorizer = vectorizer
        self.solution_service = SolutionService()
        self.naming = NamingService('ecolin')

    def fit(self, X, y):
        """No-op: transformation state lives in the injected services."""
        return self

    def transform(self, X, y=None):
        """Map X to ecolin names and look up pathway-level solutions.

        y is accepted for pipeline compatibility but unused.
        BUG FIX: default was the mutable `y=[]`, shared across all calls;
        `None` is the safe sentinel and is behaviorally identical here.
        """
        ecolin_X = self.naming.to(X)
        solutions = self.solution_service.get_solutions(ecolin_X)
        return solutions

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X, y)
def most_correlated_pathway(top_num_pathway, num_of_reactions):
    """Print the pathways whose FVA features best separate the class labels.

    Pipeline: vectorize -> variance filter -> re-vectorize -> select the best
    `num_of_reactions` reactions -> pathway-level scoring -> final vectorize.
    The top `top_num_pathway` features by ANOVA F-score are printed together
    with a per-subsystem measured-metabolite coverage statistic from BC data.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')
    # BUG FIX: `[DictVectorizer(sparse=False)] * 3` aliased ONE vectorizer
    # three times, so every later fit overwrote the state the earlier inverse
    # steps depend on (it only worked by accident of sequential fitting).
    # Three independent instances behave identically here and stay correct
    # if the pipeline is ever reused.
    vect = [DictVectorizer(sparse=False) for _ in range(3)]
    vt = VarianceThreshold(0.1)
    skb = SelectKBest(k=int(num_of_reactions))
    X = Pipeline([('vect1', vect[0]),
                  ('vt', vt),
                  ('inv_vec1', InverseDictVectorizer(vect[0], vt)),
                  ('vect2', vect[1]),
                  ('skb', skb),
                  ('inv_vec2', InverseDictVectorizer(vect[1], skb)),
                  ('pathway_scoring', PathwayFvaScaler()),
                  ('vect3', vect[2])]).fit_transform(X, y)
    (F, pval) = f_classif(X, y)
    # Rank final features by F statistic, largest first.
    top_n = sorted(zip(vect[2].feature_names_, F, pval),
                   key=lambda x: x[1],
                   reverse=True)[:int(top_num_pathway)]
    model = DataReader().read_network_model()
    X, y = DataReader().read_data('BC')
    bc = NamingService('recon').to(X)
    # subsystem -> set of metabolite ids participating in its reactions.
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        subsystem_metabolite[r.subsystem].update(m.id for m in r.metabolites)
    # Average number of measured metabolites per subsystem across BC samples.
    subsystem_counts = defaultdict(float)
    for sample in bc:
        for s, v in subsystem_metabolite.items():
            subsystem_counts[s] += len(v.intersection(sample.keys()))
    subsystem_counts = {
        i: v / len(subsystem_counts)
        for i, v in subsystem_counts.items()
    }
    for n, v, p in top_n:
        # Feature names carry a qualifier suffix; n[:-4] strips it and
        # n[-3:] recovers it (presumably '<pathway>_min'/'_max' — confirm).
        print('name:', n[:-4])
        print('min-max:', n[-3:])
        print('metabolites:%s' % subsystem_counts[n[:-4]])
        print('F:', v)
        print('p:', p)
        print('-' * 10)
def constraint_logging():
    """Fit FVA constraints on one cancer and one healthy BC sample."""
    (X, y) = DataReader().read_data('BC')
    X = NamingService('recon').to(X)
    samples = list(zip(X, y))
    # First sample of each label; IndexError if a label is absent.
    (X_h, y_h) = [pair for pair in samples if pair[1] == 'h'][0]
    (X_bc, y_bc) = [pair for pair in samples if pair[1] == 'bc'][0]
    FVARangedMeasurement().fit_transform([X_bc, X_h], [y_bc, y_h])
def __init__(self, vectorizer: DictVectorizer):
    """Keep the supplied vectorizer and set up ecolin naming + solution lookup."""
    self.naming = NamingService('ecolin')
    self.solution_service = SolutionService()
    self.vectorizer = vectorizer
def __init__(self):
    # Initialize the mixin base class, then attach the recon naming service
    # used to translate measurement names into model identifiers.
    super().__init__()
    self.naming = NamingService('recon')
def flux_diff_analysis():
    """Compare healthy ('h') vs breast-cancer ('bc') FVA flux ranges per file.

    For each FVA solution file: builds per-reaction {label: {min/max: flux}}
    tables, computes a normalized distance between the healthy and diseased
    flux intervals for every reaction, aggregates distances per subsystem and
    activity counts per subsystem category, and writes a text report to
    ../outputs/diff_<file>.
    """
    files = ['fva_solutions.enriched_measurements_in_obj.wconst.txt',
             'fva_solutions.enriched_measurements_in_obj.woconst.txt',
             'fva_solutions.enriched_measurements_in_obj.wconst.useV.txt',
             'fva_solutionsfva_solutions.enriched_measurements_in_obj.wconst.lb1.txt']
    #['fva_solutions.cameo.wconst.txt', 'fva_solutions.cameo.woconst.txt',
    # 'fva_solutions.cameo.wconst.weighted.txt', 'fva_solutions6.txt']
    model = DataReader().read_network_model()
    categories = DataReader().read_subsystem_categories()
    (X, y) = DataReader().read_data('BC')
    X = NamingService('recon').to(X)
    # subsystem -> category, and subsystem -> list of |fold change| values
    # contributed by measured metabolites.
    subsys_categories = {}
    subsys_measurement_mapping = {}
    max_category_len = 0  # widest category name, used for report column width
    for category, subsystems in categories.items():
        if len(category) > max_category_len:
            max_category_len = len(category)
        for subsys in subsystems:
            subsys_categories[subsys] = category
            subsys_measurement_mapping[subsys] = []
    # NOTE(review): initialized once, OUTSIDE the per-file loop below — each
    # file's report therefore includes counts accumulated from the files
    # processed before it. Confirm whether that accumulation is intended.
    category_active_counts = {}
    for measurement_dict in X:
        for mid, fold_change in measurement_dict.items():
            metabolite = model.metabolites.get_by_id(mid)
            met_subsystems = {}
            for r in metabolite.reactions:
                subsys = r.subsystem
                if subsys in met_subsystems:
                    continue
                subsys_measurement_mapping[subsys].append(abs(fold_change))
                met_subsystems[subsys] = None
                # NOTE(review): this `break` credits only the FIRST reaction's
                # subsystem per metabolite, which also makes the de-dup dict
                # above moot — confirm this is the intended attribution.
                break
    for file in files:
        fluxes, class_labels = DataReader().read_fva_solutions(file)
        # reaction id -> class label -> {'min'/'max': flux}
        flux_dict = {}
        ix = 0
        max_reaction_length = 0  # widest reaction id, for report column width
        while ix < len(fluxes):
            class_label = class_labels[ix]
            subsys_has_active_reaction = {}
            for reaction, flux in fluxes[ix].items():
                # Keys carry a qualifier suffix; rxn strips it, qualifier is
                # the last 3 chars (presumably '<rxn>_min'/'_max' — confirm).
                rxn, qualifier = reaction[:-4], reaction[-3:]
                flux_dict.setdefault(rxn, {})
                flux_dict[rxn].setdefault(class_label, {})
                flux_dict[rxn][class_label][qualifier] = flux
                subsys = model.reactions.get_by_id(rxn).subsystem
                subsys_has_active_reaction.setdefault(subsys, False)
                # Any nonzero flux marks the whole subsystem as active.
                if abs(flux) > 0:
                    subsys_has_active_reaction[subsys] = True
                if len(rxn) > max_reaction_length:
                    max_reaction_length = len(rxn)
            for category in categories:
                active = 0
                for subsys in categories[category]:
                    # Bare except swallows KeyError for subsystems that did
                    # not appear in this sample's solutions.
                    try:
                        if subsys_has_active_reaction[subsys]:
                            active += 1
                    except:
                        continue
                category_active_counts.setdefault(category, [])
                category_active_counts[category].append(active)
            ix += 1
        healthy = 'h'
        diseased = 'bc'
        distances = []  # (normalized distance, reaction id)
        subsystem_dist_dict = {}  # subsystem -> list of reaction distances
        max_subsys_length = 0
        for reaction, flux_vals in flux_dict.items():
            healthy_flux = (flux_vals[healthy]['min'],
                            flux_vals[healthy]['max'])
            diseased_flux = (flux_vals[diseased]['min'],
                             flux_vals[diseased]['max'])
            # Span of the union of both intervals; normalizes the distance.
            interval_length = max(healthy_flux[1], diseased_flux[1]) - \
                min(healthy_flux[0], diseased_flux[0])
            dist = abs(healthy_flux[0] - diseased_flux[0]) + \
                abs(healthy_flux[1] - diseased_flux[1])
            # Identical intervals short-circuit to 0, which also avoids a
            # 0/0 division when both intervals are a single point.
            if healthy_flux == diseased_flux:
                dist = 0
            else:
                dist = dist / interval_length
            distances.append((dist, reaction))
            subsys = model.reactions.get_by_id(reaction).subsystem
            subsystem_dist_dict.setdefault(subsys, [])
            subsystem_dist_dict[subsys].append(dist)
            if len(subsys) > max_subsys_length:
                max_subsys_length = len(subsys)
        distances.sort(reverse=True)
        rmean = round(
            sum([dist for dist, reaction in distances]) / len(distances), 4)
        # NOTE(review): (n//2)+1 on a descending list is one past the usual
        # middle element — confirm this off-center "median" is intended.
        rmedian = distances[(len(distances) // 2) + 1][0]
        subsystem_distances = [(sum(distances) / len(distances), subsys)
                               for subsys, distances
                               in subsystem_dist_dict.items()]
        subsystem_distances.sort(reverse=True)
        smean = round(
            sum([dist for dist, subsys in subsystem_distances]) /
            len(subsystem_distances), 4)
        smedian = subsystem_distances[(len(subsystem_distances) // 2) + 1][0]
        with open('../outputs/diff_%s' % file, 'w') as f:
            f.write('Reaction Level: mean: %s, median: %s, min: %s, max: %s\n'
                    % (str(rmean), str(rmedian),
                       str(distances[len(distances) - 1]),
                       str(distances[0])))
            f.write('Subsystem Level: mean: %s, median: %s, min: %s, max: %s\n\n\n'
                    % (str(smean), str(smedian),
                       str(subsystem_distances[len(subsystem_distances) - 1]),
                       str(subsystem_distances[0])))
            f.write("Category Activeness (Actual, Avg, Min, Max):\n" +
                    '-' * (max_category_len + 20) + "\n")
            for category, active_counts in category_active_counts.items():
                f.write(('{:>' + str(max_category_len) +
                         '}\t{}\t{:.2f}\t{}\t{}\n').format(
                    category,
                    len(categories[category]),
                    round(sum(active_counts) / len(active_counts), 2),
                    min(active_counts),
                    max(active_counts)))
            # Categories never seen in any solution file get an all-zero row.
            for category in categories:
                if category in category_active_counts:
                    continue
                f.write(('{:>' + str(max_category_len) +
                         '}\t{}\t{}\t{}\t{}\n').format(
                    category, len(categories[category]), 0, 0, 0))
            f.write("\n\nSubsystems:\n" +
                    '-' * (max_subsys_length + 25) + "\n")
            for dist, subsys in subsystem_distances:
                # max(..., 1) guards the average against empty measurement lists.
                f.write(('{:>' + str(max_subsys_length) +
                         '}\t{:.2f}\t{}\t{:.2f}\n').format(
                    subsys,
                    round(dist, 2),
                    len(subsys_measurement_mapping.get(subsys, [])),
                    round(sum(subsys_measurement_mapping.get(subsys, [])) /
                          max(len(subsys_measurement_mapping.get(subsys, [])),
                              1), 2)))
            f.write("\n\nReactions:\n" +
                    '-' * (max_reaction_length + 5) + "\n")
            for dist, reaction in distances:
                f.write(('{:>' + str(max_reaction_length) +
                         '}\t{:.2f}\n').format(reaction, round(dist, 2)))