def fetch_dataset(db_collection, flow_name: str, parameters):
    # url and extra_csv_parms are assumed to be defined at module scope
    # 1. try a CSV fetch; if that doesn't work, fall back to pandasdmx
    data_response = requests.get(url, params=parameters,
                                 headers={"Accept": "text/csv"})
    assert data_response.status_code == 200
    with get_tempfile() as fp:
        fp.write(data_response.text.encode())
        fp.seek(0)
        kwargs = ({} if flow_name not in extra_csv_parms
                  else dict(**extra_csv_parms[flow_name]))
        try:
            df = pd.read_csv(fp, **kwargs)
            save_dataframe(db_collection, {}, df, url, "ECB")
            return
        except pd.errors.EmptyDataError:
            # missing data is ignored as far as --fail-fast is concerned
            print(f"No CSV data to save... now trying {flow_name} using pandasdmx")
            # fall through to the pandasdmx path below

    # 2. fall back to pandasdmx if the CSV fetch fails
    ecb = sdmx.Request("ECB", backend="memory")
    data_msg = ecb.data(flow_name, params=parameters)
    df = sdmx.to_pandas(data_msg)
    assert isinstance(df, pd.DataFrame)
    save_dataframe(db_collection, {}, df, url, "ECB")
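# A minimal sketch of how fetch_dataset might be invoked; the MongoDB URI,
# collection name, flow name, and query parameters below are assumptions
# for illustration, not the project's actual configuration.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["market_data"]

# EXR is the ECB exchange-rates dataflow; startPeriod narrows the window
fetch_dataset(db["ecb_data_cache"], "EXR", {"startPeriod": "2020-01"})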
def delete_record(id_):
    """
    Deletes one record based on its ID.
    If the given ID doesn't exist, nothing is deleted.
    """
    df = utils.read_dataframe()
    df_altered = df[df['ID'] != id_]
    utils.save_dataframe(data=df_altered)
    return None
def add_record(name, age, fav_sport):
    """Adds one record to the database (CSV file)."""
    df = utils.read_dataframe()
    df_record_to_add = pd.DataFrame(data={
        "ID": utils.generate_random_id(),
        "Name": name,
        "Age": age,
        "FavouriteSport": fav_sport
    }, index=[0])
    df_concatenated = pd.concat(objs=[df, df_record_to_add],
                                ignore_index=True, sort=False)
    utils.save_dataframe(data=df_concatenated)
    return None
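# The record functions above (and update_record below) lean on a small
# utils module. A minimal sketch of what those helpers might look like,
# assuming CSV-backed storage and a 12-character uppercase alphanumeric ID
# scheme; the CSV path and ID format are assumptions, not the actual code.
import random
import string

import pandas as pd

CSV_PATH = "records.csv"  # hypothetical storage location


def read_dataframe():
    """Load the whole record table from disk."""
    return pd.read_csv(CSV_PATH)


def save_dataframe(data):
    """Persist the whole record table back to disk."""
    data.to_csv(CSV_PATH, index=False)


def generate_random_id(length=12):
    """Return a random uppercase alphanumeric ID, e.g. 'U0E4CXQSWC4S'."""
    alphabet = string.ascii_uppercase + string.digits
    return "".join(random.choices(alphabet, k=length))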
def main():
    """Main application handler"""
    # in debug mode, use the cached dataframe
    if len(sys.argv) >= 2:
        app.dataframe = utils.load_dataframe("df.pickle")
        app.run(host='0.0.0.0', port=3000, debug=True)
    else:
        app.dataframe = retrieve_stocks()
        utils.save_dataframe(app.dataframe, "df.pickle")
        app.run(host='0.0.0.0', port=3000)
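# Hypothetical invocations of main(); 'app.py' is a placeholder module name.
#
#   python app.py debug   -> any extra argument loads the cached df.pickle
#                            and starts the server in debug mode
#   python app.py         -> fetches fresh stock data, caches it to
#                            df.pickle, then starts the server normally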
def update_record(id_, name=None, age=None, fav_sport=None):
    """
    Updates one record based on its ID and the specified parameters.
    If the given ID doesn't exist, nothing is updated.

    >>> update_record(id_="U0E4CXQSWC4S", name="SomeNewName", age=20,
    ...               fav_sport="SomeNewFavouriteSport")
    """
    df = utils.read_dataframe()
    df_record_to_update = df[df['ID'] == id_]
    if df_record_to_update.empty:
        return None
    if len(df_record_to_update) == 1:
        if name:
            df.loc[df['ID'] == id_, 'Name'] = name
        if age:
            df.loc[df['ID'] == id_, 'Age'] = age
        if fav_sport:
            df.loc[df['ID'] == id_, 'FavouriteSport'] = fav_sport
        utils.save_dataframe(data=df)
        return None
    raise Exception("Multiple records with the same ID exist")
def wikify_region():
    """
    Calls the wikifier service to wikify a region, then deletes/updates
    the wiki region file's results.
    :return:
    """
    project_folder = get_project_folder()
    project = get_project(project_folder)
    action = request.form["action"]
    region = request.form["region"]
    context = request.form["context"]
    flag = int(request.form["flag"])
    if action == "wikify_region":
        if not project.current_data_file:
            raise web_exceptions.WikifyWithoutDataFileException(
                "Upload data file before wikifying a region")
        calc_params = get_calc_params(project)
        cell_qnode_map, problem_cells = wikify(calc_params, region, context)
        file_path = save_dataframe(project_folder, cell_qnode_map,
                                   "wikify_region_output")
        project.add_wikifier_file(file_path)  # , copy_from_elsewhere=True, overwrite=True
        project.update_saved_state(current_wikifiers=[file_path])
        project.save()

        calc_params = get_calc_params(project)
        data = serialize_item_table(calc_params)

        if problem_cells:
            error_dict = {
                "errorCode": 400,
                "errorTitle": "Failed to wikify some cells",
                "errorDescription": "Failed to wikify: " + ",".join(problem_cells)
            }
            data['problemCells'] = error_dict
        else:
            data['problemCells'] = False
        data['project'] = project.__dict__
        return data, 200
    return {}, 404
def call_wikifier_service():
    """
    Calls the wikifier service to wikify a region, then deletes/updates
    the wiki region file's results.
    :return:
    """
    project = get_project()
    region = request.get_json()["region"]
    context = request.get_json()["context"]
    calc_params = get_calc_params(project)
    cell_qnode_map, problem_cells = wikify(calc_params, region, context)
    file_path = save_dataframe(project, cell_qnode_map, "wikify_region_output.csv")
    file_path = project.add_wikifier_file(file_path,
                                          copy_from_elsewhere=True,
                                          overwrite=True)
    project.save()

    calc_params = get_calc_params(project)
    response = dict(project=get_project_dict(project))
    response["layers"] = get_qnodes_layer(calc_params)
    if problem_cells:
        response['wikifierError'] = "Failed to wikify: " + ",".join(problem_cells)
    return response, 200
deduplication_count = []
deduplication_before_count = []
deduplication_after_count = []

for file in file_list:
    file_path = dir_path + file
    print(file_path)
    df = utils.get_dataframe(file_path)
    deduplication_before_count.append(len(df))

    # drop duplicate posts, keeping the first occurrence
    deduplication_df = df.drop_duplicates(["blog_title", "writer"], keep="first")
    save_path = dir_path + file
    utils.save_dataframe(deduplication_df, save_path)
    deduplication_after_count.append(len(deduplication_df))

    # the drama title is the sixth underscore-separated token of the filename
    drama_title = file.split("_")[5].split(".")[0]
    deduplication_count.append([drama_title, len(df), len(deduplication_df)])

print(deduplication_before_count)
print(deduplication_after_count)

drama_blog_count_table = pd.DataFrame(deduplication_count,
                                      columns=('title', 'before', 'after'))
drama_blog_count_table.to_csv('blog_post_count_before_2주_케이블.csv',
                              index=False)
        pass

    # create pairs dataframe
    pairs = pd.DataFrame({
        'sample': emx.columns,
        'label': labels
    })

    # save pairs to file
    print('Saving generated partition file...')
    pairs.to_csv(args.outfile, sep='\t', header=False, index=False)
else:
    print('error: you must specify either a partition file or partition generation method')
    sys.exit(-1)

# save a sub-matrix for each partition
partitions = list(set(pairs['label']))
partitions.sort()

for p in partitions:
    # compute output name
    outname = '%s.%s.txt' % (basename, p)
    print('Saving %s...' % (outname))

    # extract partition sub-matrix
    samples = pairs.loc[pairs['label'] == p, 'sample']
    submatrix = emx[samples]

    # save submatrix
    utils.save_dataframe(outname, submatrix)
classes = ["class-%02d" % i for i in range(args.n_classes)] y = [classes[y_i] for y_i in y] # initialize gene names, sample names x_samples = ["sample-%08d" % i for i in range(args.n_samples)] x_genes = ["gene-%06d" % i for i in range(args.n_genes)] # initialize dataframes x = pd.DataFrame(x, index=x_samples, columns=x_genes) y = pd.DataFrame(y, index=x_samples) # create synthetic gene sets gene_sets = [] for i in range(args.n_sets): n_genes = random.randint( 5, min(max(10, args.n_genes // 10), args.n_genes)) genes = random.sample(x_genes, n_genes) gene_sets.append(["gene-set-%03d" % i] + genes) # save dataset to file utils.save_dataframe(args.dataset, x) # save labels to file y.to_csv(args.labels, sep="\t", header=None) # save gene sets to file f = open(args.gene_sets, "w") f.write("\n".join(["\t".join(gene_set) for gene_set in gene_sets]))
    if tag in recent_tags and not a.all:
        print(f"Skipping recently updated dataset: {wb_id}")
        continue
    try:
        print("Fetching... {} {}".format(wb_id, i['name']))
        df = wb.get_dataframe({wb_id: "metric"}, freq=a.freq)
        if df is None:
            raise ValueError('No data')
        df, metadata = fix_dataframe(i, df, countries, tag)
        # print(metadata)
        if df is None or len(df) == 0:
            print(f"WARNING: no data associated with {wb_id}")
            continue
        save_dataframe(db.market_data_cache, i, df, tag, 'worldbank')
        n = save_inverted_index(db, metadata, df, i, tag, countries)
        print(f"Updated {n} records for {tag}")
        as_at = now()
        i.update({
            'last_successful_data': as_at,
            'last_updated': as_at,
        })
        update_indicator(db, i)
        time.sleep(a.delay)
        n_downloaded += 1
    except (RuntimeError, ValueError, TypeError) as e:
        print(f"ERROR: when processing {i}: {e}")
        i.update({
            'last_error_when': now(),
        })
    mechanics_found = False
    theme_found = False
    component_found = False
    length_found = False
    accessibility_found = False

    for token in tokens:
        if token in dict_bg_tags_process['mechanics']:
            df.loc[i, 'mechanics'] = 1
            mechanics_found = True
        if token in dict_bg_tags_process['theme']:
            df.loc[i, 'theme'] = 1
            theme_found = True
        if token in dict_bg_tags_process['component']:
            df.loc[i, 'component'] = 1
            component_found = True
        if token in dict_bg_tags_process['length']:
            df.loc[i, 'length'] = 1
            length_found = True
        if token in dict_bg_tags_process['accessible']:
            df.loc[i, 'accessibility'] = 1
            accessibility_found = True

    if i % 100 == 0:
        print('On row number: %d' % i)

utils.save_dataframe(df, FOLDER, OUTPUT_PANDAS)
# normalize data to the [0, 1] range
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_perturb = scaler.transform(x_perturb)

# perturb each class mean to the target class
mu_perturbed = perturb_mean_diff(x_train, y_train, args.target, classes)

# save mean perturbations to dataframe
df_perturbed = pd.DataFrame(data=mu_perturbed, index=genes, columns=classes)
utils.save_dataframe(
    '%s/%s.perturbations.means.txt' % (args.output_dir, classes[args.target]),
    df_perturbed)

# perturb all samples to target class
perturbations = perturb_advgan(x_perturb, y_perturb, args.target,
                               output_dir=args.output_dir)

# save sample perturbations to dataframe
df_perturbed = pd.DataFrame(data=perturbations, index=genes,
                            columns=df_perturb.index)
utils.save_dataframe(
    '%s/%s.perturbations.samples.txt' % (args.output_dir, classes[args.target]),
    df_perturbed)
try:
    data_response = ABS.data(resource_id=dataset, params={"startPeriod": "2010"})
    df = data_response.write().unstack().reset_index()
    assert len(df) > 0 and isinstance(df, pd.DataFrame)
    tag = f"{dataset}-dataframe"
    metadata = {
        "dataset": dataset,
        "name": title,
        "tag": tag,
        "field": dataset,
        "scope": "abs",
        "last_updated": now(),
        "min_date": None,
        "max_date": None,
        "n_attributes": len(df.columns),
    }
    save_dataframe(db.abs_data_cache, metadata, df, tag, "abs")
    db.abs_inverted_index.update_one(
        {"dataset": dataset, "scope": "abs"},
        {"$set": metadata},
        upsert=True)
    time.sleep(a.delay)
except HTTPError:
    print(f"WARNING: unable to fetch {dataset}")
    traceback.print_exc()
    exit(0)
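# A minimal sketch of the setup this fetch loop presumes: an ABS SDMX
# client plus a MongoDB handle. The connection string and database name
# are assumptions for illustration.
import pandasdmx as sdmx
import pymongo
from requests.exceptions import HTTPError  # the HTTPError caught above

ABS = sdmx.Request("ABS")  # pandasdmx ships an ABS data source
db = pymongo.MongoClient("mongodb://localhost:27017")["market_data"]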
# perform K-S test and outlier removal
if rank == 0:
    print("Performing K-S test and outlier removal...")

mask = transform_kstest(X, colnames,
                        keepna=args.ks_keepna,
                        threshold=args.ks_threshold,
                        logfile=args.ks_log)

# remove outliers from FPKM matrix
if rank == 0:
    print("Preserved %d / %d samples after outlier removal..." % (sum(mask), len(mask)))

X = X[:, mask]
colnames = colnames[mask]

# perform quantile normalization
if args.quantile:
    if rank == 0:
        print("Performing quantile normalization...")

    transform_quantile(X)

# save output matrix
if rank == 0:
    print("Saving output expression matrix...")

emx = pd.DataFrame(X, index=rownames, columns=colnames)
utils.save_dataframe(args.outfile, emx)
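# transform_quantile isn't shown here. A minimal in-place sketch of
# standard quantile normalization, assuming X is a dense genes x samples
# numpy array with no missing values; the real helper may handle ties and
# NaNs differently.
import numpy as np

def transform_quantile(X):
    # rank each value within its column (sample)
    ranks = np.argsort(np.argsort(X, axis=0), axis=0)
    # mean expression at each rank, averaged across samples
    rank_means = np.sort(X, axis=0).mean(axis=1)
    # replace each value with the mean for its rank, in place
    X[:] = rank_means[ranks]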
import pandas as pd

import utils

if __name__ == "__main__":
    CSV_NAME = 'post_treat_pokemon.csv'
    COLUMNS_NAMES = {
        'against_bug': 'bug', 'against_dark': 'dark', 'against_dragon': 'dragon',
        'against_electric': 'electric', 'against_fairy': 'fairy',
        'against_fight': 'fight', 'against_fire': 'fire', 'against_flying': 'flying',
        'against_ghost': 'ghost', 'against_grass': 'grass', 'against_ground': 'ground',
        'against_ice': 'ice', 'against_normal': 'normal', 'against_poison': 'poison',
        'against_psychic': 'psychic', 'against_rock': 'rock', 'against_steel': 'steel',
        'against_water': 'water'
    }

    df = pd.read_csv('pokemon.csv')
    type1 = df.type1.values.tolist()
    type2 = df.type2.values.tolist()
    img_list = []
    type2_list = []
    pokedex_number = df.pokedex_number.values.tolist()
    url = 'https://assets.pokemon.com/assets/cms2/img/pokedex/full/'
    term = '.png'

    # keep only the columns needed for the output dataset
    # (note: 'classfication' matches the column name as spelled in the source CSV)
    df = df[['against_bug', 'against_dark', 'against_dragon', 'against_electric',
             'against_fairy', 'against_fight', 'against_fire', 'against_flying',
             'against_ghost', 'against_grass', 'against_ground', 'against_ice',
             'against_normal', 'against_poison', 'against_psychic', 'against_rock',
             'against_steel', 'against_water', 'classfication', 'height_m', 'name',
             'attack', 'pokedex_number', 'type1', 'type2', 'weight_kg', 'generation',
             'is_legendary']]

    img_list = utils.get_img_pokemon(url, term, pokedex_number, img_list)
    type2_list = utils.replace_type(type1, type2, type2_list)
    df = utils.rename_columns(df, COLUMNS_NAMES)
    df = utils.generate_dataframe(df, img_list, type2_list)
    utils.save_dataframe(df, CSV_NAME)
if args.tsne:
    # compute t-SNE embedding
    x_tsne = sklearn.manifold.TSNE().fit_transform(x)

    # plot t-SNE embedding with class labels
    plt.axis('off')

    for c in classes:
        indices = (y[0] == c)
        plt.scatter(x_tsne[indices, 0], x_tsne[indices, 1], label=c, edgecolors='w')

    plt.subplots_adjust(right=0.70)
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.savefig(args.tsne)
    plt.close()

# split dataset into train/perturb sets
x_train, x_perturb, y_train, y_perturb = sklearn.model_selection.train_test_split(
    x, y, test_size=1 - args.train_size)

# save datasets to file
utils.save_dataframe(args.train_data, x_train)
utils.save_dataframe(args.perturb_data, x_perturb)

# save labels to file
y_train.to_csv(args.train_labels, sep='\t', header=None)
y_perturb.to_csv(args.perturb_labels, sep='\t', header=None)

# save gene sets to file
with open(args.gene_sets, 'w') as f:
    f.write('\n'.join(['\t'.join(gene_set) for gene_set in gene_sets]))
#!/usr/bin/env python

import argparse

import utils

if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", help="input expression matrix")
    parser.add_argument("outfile", help="output expression matrix")

    args = parser.parse_args()

    # load input dataframe from input format
    print("Loading %s..." % args.infile)
    df = utils.load_dataframe(args.infile)

    # save dataframe in output format
    print("Saving %s..." % args.outfile)
    utils.save_dataframe(args.outfile, df)
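# Several of these scripts share a utils.load_dataframe / utils.save_dataframe
# pair that takes the filename first. A minimal sketch, assuming tab-delimited
# text with row and column names; the actual helpers may also support other
# formats.
import pandas as pd


def load_dataframe(filename):
    """Read a tab-delimited matrix with row and column names."""
    return pd.read_csv(filename, sep="\t", index_col=0)


def save_dataframe(filename, df):
    """Write a tab-delimited matrix with row and column names."""
    df.to_csv(filename, sep="\t", na_rep="NA")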
def save(self, df, *args, **kwargs):
    # if self.exists():
    #     raise Exception('Resource already exists, %s.' % self.name)
    return save_dataframe(df, self.path, self.fmt, *args, **kwargs)
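# Hypothetical usage of the save() wrapper above; the Resource class name
# and its constructor arguments are assumptions from context, while
# save_dataframe(df, path, fmt, ...) is the call the method itself makes.
resource = Resource(name="prices", path="data/prices.csv", fmt="csv")
resource.save(df)  # extra args are forwarded to save_dataframe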
X = pd.DataFrame()
y = pd.DataFrame()

# load each input file into expression matrix
for infile in args.infiles:
    # load input file
    print("loading \"%s\"" % infile)
    X_i = pd.read_csv(infile, sep="\t", index_col=0)

    # remove extraneous columns
    X_i.drop(columns=["Entrez_Gene_Id"], inplace=True)

    # extract labels
    label = infile.split(".")[0].split("/")[-1]
    y_i = pd.DataFrame({"sample": X_i.columns, "label": label})

    # append input dataframe to output dataframe
    X = pd.merge(X, X_i, left_index=True, right_index=True, how="outer")

    # append input labels to output labels
    y = pd.concat([y, y_i], ignore_index=True)

y.set_index("sample", inplace=True)

# save output expression matrix
print("saving \"%s\"" % args.outfile)
utils.save_dataframe(args.outfile, X)
utils.save_dataframe("%s.labels.txt" % args.outfile.split(".")[0], y)