unfold_dict[label].extend([v] * n) # clean values (try to convert to float and remove leading ;) for k in unfold_dict: unfold_dict[k] = [_clean_value(v) for v in unfold_dict[k]] return pd.DataFrame(unfold_dict, columns=df.columns) ################################################################### if __name__ == "__main__": from metabolinks.datasets import demo_dataset # read sample data set demo2 print('\nLoad demo2: data with labels ------------\n') demo2 = demo_dataset('demo2') data = demo2.data y = demo2.target print('-- info --------------') print(data.transpose().cdl.info()) print('-- global info---------') print(data.transpose().cdl.info(all_data=True)) print('-----------------------') print(data) print('\n--- fillna_zero ----------') new_data = fillna_zero(data) print(new_data) print('--- fillna_value 10 ----------') new_data = fillna_value(data, value=10) print(new_data)
right_index=True) return annotations if __name__ == '__main__': from metabolinks import datasets dbs = load_local_dbs() # for k, v in dbs._asdict().items(): # print(f'----- {k}') # print(v) # print('*'*30) df = datasets.demo_dataset('masstrix_output').data print("Demo data:\n") df.info() print('************************') print(df.KEGG_cid) print('************************') # get one identifier per row cids = df.KEGG_cid.str.split('#').explode() print('\n\n----- Identifier list') print(cids) print('-----------------------------------') # build identifier translation table identifiers = get_identifiers(cids, dbs, trace=False)
else: comps.append('other') return comps if __name__ == '__main__': from io import StringIO from metabolinks import datasets from metabolinks.dataio import read_MassTRIX print('------ test element_composition() ------') for test in 'C11H24NO7P', 'C13H19ClN2O2', 'C12H21O11R': print(test, '->', element_composition(test)) print('\n------ test insert_element_counts() ------') df = datasets.demo_dataset('table_with_formulae').data print(df) print('+++++ after insertion ++++++') dfi = insert_element_counts(df) print(dfi) print('\n------ test element_composition_series ------') # file_name = "MassTRIX_output.tsv" # import os # _THIS_DIR, _ = os.path.split(os.path.abspath(__file__)) # testfile_name = os.path.join(_THIS_DIR, "data", file_name) df = read_MassTRIX( StringIO(datasets.create_demo('masstrix_output').as_str())) def cleanup_cols(df, isotopes=True, uniqueID=True, columns=None):
# empty subset, skip continue else: # common features with exactely len(t) ocorrences if len(t) == 1: all_feats = objects[t[0]] else: all_feats = common([objects[i] for i in t]) features = all_feats[all_feats.isin(count_groups[len(t)-1])] subset_names = tuple([names[i] for i in t]) res[subset_names] = features return res if __name__ == "__main__": print('Demo data with labels------------\n') dataset = datasets.demo_dataset('demo2').data.transpose() print(dataset) print('-- info --------------') print(dataset.cdl.info()) print('-- global info---------') print(dataset.cdl.info(all_data=True)) print('\n***** SIMILARITY MEASURES ****') similarities = mz_similarity(dataset, has_labels=True) print(similarities) print('\n\n***** FEATURE OVERLAP (AND VENN DIAGRAM CALCULATIONS ****') print('--- example data sets') s1 = pd.DataFrame({'Bucket label': ['A0', 'A1', 'A2', 'A3'], 'Name': ['B0', np.nan, 'B2', 'B3'], 'Formula': ['C0', 'C1', 'C2', 'C3']}, index=[0, 1, 2, 3]).set_index('Bucket label')
def load_demo2():
    """Load the demo data set registered under the name ``'demo2'``.

    Convenience wrapper with no parameters: it forwards the fixed name
    ``'demo2'`` to ``demo_dataset`` and returns that call's result unchanged.
    """
    dataset_name = 'demo2'
    return demo_dataset(dataset_name)