Example #1
def fetch_dataset(db_collection, flow_name: str, parameters):
    # 1. Try a CSV fetch; if that doesn't work, fall back to pandasdmx to get the dataframe
    data_response = requests.get(url,
                                 params=parameters,
                                 headers={"Accept": "text/csv"})
    assert data_response.status_code == 200
    with get_tempfile() as fp:
        fp.write(data_response.text.encode())
        fp.seek(0)
        kwargs = dict(extra_csv_parms.get(flow_name, {}))
        try:
            df = pd.read_csv(fp, **kwargs)
            save_dataframe(db_collection, {}, df, url, "ECB")
            return
        except pd.errors.EmptyDataError:  # no data is ignored as far as --fail-fast is concerned
            print(
                f"No CSV data to save.. now trying {flow_name} using pandasdmx"
            )
            # FALLTHRU...

    # 2. Fall back to pandasdmx if the CSV fetch fails
    ecb = sdmx.Request("ECB", backend="memory")
    data_msg = ecb.data(flow_name, params=parameters)
    df = sdmx.to_pandas(data_msg)
    assert isinstance(df, pd.DataFrame)
    save_dataframe(db_collection, {}, df, url, "ECB")
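This snippet also relies on a project-level get_tempfile() helper (and on url and extra_csv_parms defined elsewhere in the module, which are not shown). A minimal stand-in for get_tempfile, assuming it simply wraps Python's tempfile module, could look like this:

import tempfile

def get_tempfile():
    # TemporaryFile is opened in binary mode ('w+b'), which matches the
    # fp.write(...encode()) / fp.seek(0) usage above, and the file is
    # cleaned up automatically when the with-block exits.
    return tempfile.TemporaryFile()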
Example #2
def delete_record(id_):
    """
    Deletes one record based on its ID.
    If the ID given doesn't exist, nothing will be deleted.
    """
    df = utils.read_dataframe()
    df_altered = df[df['ID'] != id_]
    utils.save_dataframe(data=df_altered)
    return None
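Examples #2, #3 and #5 all go through utils.read_dataframe() and utils.save_dataframe(data=...). A minimal sketch of those two helpers, assuming the "database" is a single CSV file (the file name here is invented, not taken from the project), might be:

import pandas as pd

CSV_PATH = "database.csv"  # hypothetical location of the CSV-backed "database"

def read_dataframe():
    return pd.read_csv(CSV_PATH)

def save_dataframe(data):
    data.to_csv(CSV_PATH, index=False)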
Example #3
def add_record(name, age, fav_sport):
    """Adds one record to the database (CSV file)"""
    df = utils.read_dataframe()
    df_record_to_add = pd.DataFrame(data={
        "ID": utils.generate_random_id(),
        "Name": name,
        "Age": age,
        "FavouriteSport": fav_sport
    }, index=[0])
    df_concatenated = pd.concat(objs=[df, df_record_to_add], ignore_index=True, sort=False)
    utils.save_dataframe(data=df_concatenated)
    return None
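add_record also depends on utils.generate_random_id(). One plausible implementation, assuming IDs are 12-character uppercase alphanumeric strings like the "U0E4CXQSWC4S" shown in the update_record docstring below, is:

import random
import string

def generate_random_id(length=12):
    # build a random ID from uppercase letters and digits
    return "".join(random.choices(string.ascii_uppercase + string.digits, k=length))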
Example #4
File: app.py  Project: IvanYerkinov/DSHW2
def main():
    """
        Main application handler
    """
    # in debug mode, use the cached dataframe
    if len(sys.argv) >= 2:
        app.dataframe = utils.load_dataframe("df.pickle")
        app.run(host='0.0.0.0', port=3000, debug=True)
    else:
        app.dataframe = retrieve_stocks()
        utils.save_dataframe(app.dataframe, "df.pickle")
        app.run(host='0.0.0.0', port=3000)
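Given the "df.pickle" file name, utils.load_dataframe and utils.save_dataframe here presumably wrap pandas' pickle I/O; a minimal sketch under that assumption:

import pandas as pd

def save_dataframe(df, path):
    df.to_pickle(path)

def load_dataframe(path):
    return pd.read_pickle(path)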
Example #5
def update_record(id_, name=None, age=None, fav_sport=None):
    """
    Updates one record based on its ID and the specified parameters.
    If the ID given doesn't exist, nothing will be updated.
    >>> update_record(id_="U0E4CXQSWC4S", name="SomeNewName", age=20, fav_sport="SomeNewFavouriteSport")
    """
    df = utils.read_dataframe()
    df_record_to_update = df[df['ID'] == id_]
    if df_record_to_update.empty:
        return None
    if len(df_record_to_update) == 1:
        if name:
            df.loc[df['ID'] == id_, 'Name'] = name
        if age:
            df.loc[df['ID'] == id_, 'Age'] = age
        if fav_sport:
            df.loc[df['ID'] == id_, 'FavouriteSport'] = fav_sport
        utils.save_dataframe(data=df)
        return None
    raise Exception("Multiple records with the same ID exist")
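For illustration, update_record can be called with only the fields to change (the ID below is the hypothetical one from the docstring). Note that the truthiness checks mean falsy values such as age=0 or an empty name cannot be written this way.

update_record(id_="U0E4CXQSWC4S", age=21)                  # change Age only
update_record(id_="U0E4CXQSWC4S", fav_sport="Football")    # change FavouriteSport only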
Example #6
def wikify_region():
    """
    This function calls the wikifier service to wikify a region, and deletes/updates the wiki region file's results
    :return:
    """
    project_folder = get_project_folder()
    project = get_project(project_folder)
    action = request.form["action"]
    region = request.form["region"]
    context = request.form["context"]
    flag = int(request.form["flag"])
    if action == "wikify_region":
        if not project.current_data_file:
            raise web_exceptions.WikifyWithoutDataFileException(
                "Upload data file before wikifying a region")
        calc_params = get_calc_params(project)

        cell_qnode_map, problem_cells = wikify(calc_params, region, context)
        file_path = save_dataframe(project_folder, cell_qnode_map, "wikify_region_output")

        project.add_wikifier_file(file_path)#, copy_from_elsewhere=True, overwrite=True)
        project.update_saved_state(current_wikifiers=[file_path])
        project.save()
        
        calc_params = get_calc_params(project)
        data = serialize_item_table(calc_params)

        if problem_cells:
            error_dict = {
                "errorCode": 400,
                "errorTitle": "Failed to wikify some cellsr",
                "errorDescription": "Failed to wikify: " + ",".join(problem_cells)
            }
            data['problemCells'] = error_dict
        else:
            data['problemCells'] = False
        data['project'] = project.__dict__
        return data, 200
    return {}, 404
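The save_dataframe used in examples #6 and #7 is a project helper, not a pandas method. A rough sketch consistent with how example #6 calls it (assumption: it writes the cell-to-qnode mapping as a CSV in the project folder and returns the resulting path; the real helper may differ, and example #7 passes a project object rather than a folder):

import os

def save_dataframe(project_folder, df, name):
    # assumption: df holds the cell-to-qnode mapping produced by wikify()
    file_path = os.path.join(project_folder, name + ".csv")
    df.to_csv(file_path, index=False)
    return file_path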
Example #7
def call_wikifier_service():
    """
    This function calls the wikifier service to wikify a region, and deletes/updates the wiki region file's results
    :return:
    """
    project = get_project()
    region = request.get_json()["region"]
    context = request.get_json()["context"]
    calc_params = get_calc_params(project)


    cell_qnode_map, problem_cells = wikify(calc_params, region, context)
    file_path = save_dataframe(project, cell_qnode_map, "wikify_region_output.csv")
    file_path = project.add_wikifier_file(file_path,  copy_from_elsewhere=True, overwrite=True)
    project.save()

    calc_params = get_calc_params(project)
    response = dict(project=get_project_dict(project))
    response["layers"] = get_qnodes_layer(calc_params)

    if problem_cells:
        response['wikifierError'] = "Failed to wikify: " + ",".join(problem_cells)

    return response, 200
Example #8
deduplication_before_count = []
deduplication_after_count = []
deduplication_count = []

for file in file_list:

    file_path = dir_path + file
    print(file_path)

    df = utils.get_dataframe(file_path)

    deduplication_before_count.append(len(df))

    deduplication_df = df.drop_duplicates(["blog_title", "writer"],
                                          keep="first")

    save_path = dir_path + file
    utils.save_dataframe(deduplication_df, save_path)

    deduplication_after_count.append(len(deduplication_df))

    drama_title = file.split("_")
    drama_title = drama_title[5].split(".")
    drama_title = drama_title[0]
    deduplication_count.append([drama_title] + [len(df)] +
                               [len(deduplication_df)])

print(deduplication_before_count)
print(deduplication_after_count)

drama_blog_count_table = pd.DataFrame(deduplication_count,
                                      columns=('title', 'before', 'after'))
drama_blog_count_table.to_csv('blog_post_count_before_2주_케이블.csv',
Example #9
			pass

		# create pairs dataframe
		pairs = pd.DataFrame({ 'sample': emx.columns, 'label': labels })

		# save pairs to file
		print('Saving generated partition file...')

		pairs.to_csv(args.outfile, sep='\t', header=False, index=False)

	else:
		print('error: you must specify either a partition file or partition generation method')
		sys.exit(-1)

	# save a sub-matrix for each partition
	partitions = list(set(pairs['label']))
	partitions.sort()

	for p in partitions:
		# compute output name
		outname = '%s.%s.txt' % (basename, p)

		print('Saving %s...' % (outname))

		# extract partition sub-matrix
		samples = pairs.loc[pairs['label'] == p, 'sample']
		submatrix = emx[samples]

		# save submatrix
		utils.save_dataframe(outname, submatrix)
Example #10
    classes = ["class-%02d" % i for i in range(args.n_classes)]
    y = [classes[y_i] for y_i in y]

    # initialize gene names, sample names
    x_samples = ["sample-%08d" % i for i in range(args.n_samples)]
    x_genes = ["gene-%06d" % i for i in range(args.n_genes)]

    # initialize dataframes
    x = pd.DataFrame(x, index=x_samples, columns=x_genes)
    y = pd.DataFrame(y, index=x_samples)

    # create synthetic gene sets
    gene_sets = []

    for i in range(args.n_sets):
        n_genes = random.randint(
            5, min(max(10, args.n_genes // 10), args.n_genes))
        genes = random.sample(x_genes, n_genes)

        gene_sets.append(["gene-set-%03d" % i] + genes)

    # save dataset to file
    utils.save_dataframe(args.dataset, x)

    # save labels to file
    y.to_csv(args.labels, sep="\t", header=None)

    # save gene sets to file
    with open(args.gene_sets, "w") as f:
        f.write("\n".join(["\t".join(gene_set) for gene_set in gene_sets]))
Example #11
        if tag in recent_tags and not a.all:
            print(f"Skipping recently updated dataset: {wb_id}")
            continue
        try:
            print("Fetching... {} {}".format(wb_id, i['name']))
            df = wb.get_dataframe({wb_id: "metric"}, freq=a.freq)
            if df is None:
                raise ValueError('No data')
            df, metadata = fix_dataframe(i, df, countries, tag)
            #print(metadata)
            if df is None or len(df) == 0:
                print(f"WARNING: no data associated with {wb_id}")
                continue

            save_dataframe(db.market_data_cache, i, df, tag, 'worldbank')

            n = save_inverted_index(db, metadata, df, i, tag, countries)
            print(f"Updated {n} records for {tag}")
            as_at = now()
            i.update({
                'last_successful_data': as_at,
                'last_updated': as_at,
            })
            update_indicator(db, i)
            time.sleep(a.delay)
            n_downloaded += 1
        except (RuntimeError, ValueError, TypeError) as e:
            print(f"ERROR: when processing {i}: {e}")
            i.update({
                'last_error_when': now(),
Example #12
    mechanics_found = False
    theme_found = False
    component_found = False
    length_found = False
    accessibility_found = False

    for token in tokens:

        if token in dict_bg_tags_process['mechanics']:
            df.loc[i, 'mechanics'] = 1
            mechanics_found = True

        if token in dict_bg_tags_process['theme']:
            df.loc[i, 'theme'] = 1
            theme_found = True

        if token in dict_bg_tags_process['component']:
            df.loc[i, 'component'] = 1
            component_found = True

        if token in dict_bg_tags_process['length']:
            df.loc[i, 'length'] = 1
            length_found = True

        if token in dict_bg_tags_process['accessible']:
            df.loc[i, 'accessibility'] = 1
            accessibility_found = True

    if i % 100 == 0:
        print('On Row number: %d ' % i)

utils.save_dataframe(df, FOLDER, OUTPUT_PANDAS)
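The loop assumes dict_bg_tags_process maps each tag category to a collection of tokens; the keys below match the ones used above, while the example values are purely illustrative:

dict_bg_tags_process = {
    'mechanics': {'dice', 'deckbuilding', 'worker-placement'},
    'theme': {'fantasy', 'horror', 'sci-fi'},
    'component': {'miniatures', 'cards', 'tiles'},
    'length': {'filler', 'epic'},
    'accessible': {'family', 'beginner'},
}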
Example #13
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(x_train)

    x_train = scaler.transform(x_train)
    x_perturb = scaler.transform(x_perturb)

    # perturb each class mean to the target class
    mu_perturbed = perturb_mean_diff(x_train, y_train, args.target, classes)

    # save mean perturbations to dataframe
    df_perturbed = pd.DataFrame(data=mu_perturbed,
                                index=genes,
                                columns=classes)

    utils.save_dataframe(
        '%s/%s.perturbations.means.txt' %
        (args.output_dir, classes[args.target]), df_perturbed)

    # perturb all samples to target class
    perturbations = perturb_advgan(x_perturb,
                                   y_perturb,
                                   args.target,
                                   output_dir=args.output_dir)

    # save sample perturbations to dataframe
    df_perturbed = pd.DataFrame(data=perturbations,
                                index=genes,
                                columns=df_perturb.index)

    utils.save_dataframe(
        '%s/%s.perturbations.samples.txt' %
Example #14
        try:
            data_response = ABS.data(resource_id=dataset,
                                     params={"startPeriod": "2010"})
            df = data_response.write().unstack().reset_index()
            assert len(df) > 0 and isinstance(df, pd.DataFrame)
            tag = f"{dataset}-dataframe"
            metadata = {
                "dataset": dataset,
                "name": title,
                "tag": tag,
                "field": dataset,
                "scope": "abs",
                "last_updated": now(),
                "min_date": None,
                "max_date": None,
                "n_attributes": len(df.columns),
            }
            save_dataframe(db.abs_data_cache, metadata, df, tag, "abs")
            db.abs_inverted_index.update_one(
                {
                    "dataset": dataset,
                    "scope": "abs"
                }, {"$set": metadata},
                upsert=True)
            time.sleep(a.delay)
        except HTTPError:
            print(f"WARNING: unable to fetch {dataset}")
            traceback.print_exc()

    exit(0)
Example #15
        if rank == 0:
            print("Performing K-S test and outlier removal...")

        mask = transform_kstest(X,
                                colnames,
                                keepna=args.ks_keepna,
                                threshold=args.ks_threshold,
                                logfile=args.ks_log)

        # remove outliers from FPKM matrix
        if rank == 0:
            print("Preserved %d / %d samples after outlier removal..." %
                  (sum(mask), len(mask)))

            X = X[:, mask]
            colnames = colnames[mask]

    # perform quantile normalization
    if args.quantile:
        if rank == 0:
            print("Performing quantile normalization...")

        transform_quantile(X)

    # save output matrix
    if rank == 0:
        print("Saving output expression matrix...")

        emx = pd.DataFrame(X, index=rownames, columns=colnames)
        utils.save_dataframe(args.outfile, emx)
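transform_quantile(X) is the project's own routine; for reference, a compact NumPy implementation of quantile normalization over a genes-by-samples matrix (each column mapped onto the mean sorted profile) could look like the sketch below, though the actual GEMprep implementation may differ:

import numpy as np

def transform_quantile(X):
    # rank each value within its column, then replace it (in place) with the
    # mean of the values holding that rank across all columns
    ranks = np.argsort(np.argsort(X, axis=0), axis=0)
    mean_sorted = np.sort(X, axis=0).mean(axis=1)
    X[:] = mean_sorted[ranks]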
Example #16
import pandas as pd
import utils


if __name__ == "__main__":
	CSV_NAME = 'post_treat_pokemon.csv'
	COLUMNS_NAMES = {'against_bug':'bug','against_dark':'dark','against_dragon':'dragon','against_electric':'electric','against_fairy':'fairy','against_fight':'fight','against_fire':'fire','against_flying':'flying','against_ghost':'ghost', 'against_grass':'grass', 'against_ground':'ground','against_ice':'ice','against_normal':'normal', 'against_poison':'poison', 'against_psychic':'psychic','against_rock':'rock','against_steel':'steel', 'against_water':'water'}
	df = pd.read_csv('pokemon.csv')
	type1 = df.type1.values.tolist()
	type2 = df.type2.values.tolist()
	img_list = []
	type2_list = []
	pokedex_number = df.pokedex_number.values.tolist()
	url = 'https://assets.pokemon.com/assets/cms2/img/pokedex/full/'
	term = '.png'
	df = df[['against_bug', 'against_dark', 'against_dragon',
	       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
	       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
	       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
	       'against_rock', 'against_steel', 'against_water','classfication', 
	       'height_m','name','attack','pokedex_number','type1','type2','weight_kg','generation', 'is_legendary']]

	img_list = utils.get_img_pokemon(url,term,pokedex_number,img_list)
	type2_list = utils.replace_type(type1,type2,type2_list)
	df = utils.rename_columns(df,COLUMNS_NAMES)
	df = utils.generate_dataframe(df,img_list,type2_list)
	utils.save_dataframe(df,CSV_NAME)
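The utils calls in this script are presumably thin wrappers; for instance, rename_columns and save_dataframe likely reduce to the following (a sketch based only on how they are called above):

def rename_columns(df, mapping):
    return df.rename(columns=mapping)

def save_dataframe(df, csv_name):
    df.to_csv(csv_name, index=False)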

Example #17
File: make-inputs.py  Project: rpauly/TSPG
    if args.tsne:
        # compute t-SNE embedding
        x_tsne = sklearn.manifold.TSNE().fit_transform(x)

        # plot t-SNE embedding with class labels
        plt.axis('off')

        for c in classes:
            indices = (y[0] == c)
            plt.scatter(x_tsne[indices, 0], x_tsne[indices, 1], label=c, edgecolors='w')

        plt.subplots_adjust(right=0.70)
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
        plt.savefig(args.tsne)
        plt.close()

    # split dataset into train/perturb sets
    x_train, x_perturb, y_train, y_perturb = sklearn.model_selection.train_test_split(x, y, test_size=1 - args.train_size)

    # save datasets to file
    utils.save_dataframe(args.train_data, x_train)
    utils.save_dataframe(args.perturb_data, x_perturb)

    # save labels to file
    y_train.to_csv(args.train_labels, sep='\t', header=None)
    y_perturb.to_csv(args.perturb_labels, sep='\t', header=None)

    # save gene sets to file
    with open(args.gene_sets, 'w') as f:
        f.write('\n'.join(['\t'.join(gene_set) for gene_set in gene_sets]))
Example #18
#!/usr/bin/env python

import argparse
import numpy as np
import pandas as pd
import sys

import utils

if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", help="input expression matrix")
    parser.add_argument("outfile", help="output expression matrix")

    args = parser.parse_args()

    # load input dataframe from input format
    print("Loading %s..." % args.infile)

    df = utils.load_dataframe(args.infile)

    # save dataframe in output format
    print("Saving %s..." % args.outfile)

    utils.save_dataframe(args.outfile, df)
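The GEMprep/TSPG-style examples (#9, #10, #13, #15, #17, #18 and #20) call utils.save_dataframe(path, df) and utils.load_dataframe(path). A minimal pair consistent with those calls, assuming plain tab-separated text (the real helpers may support other formats as well), would be:

import pandas as pd

def save_dataframe(path, df):
    df.to_csv(path, sep="\t")

def load_dataframe(path):
    return pd.read_csv(path, sep="\t", index_col=0)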
Example #19
    def save(self, df, *args, **kwargs):
        # if self.exists():
        #     raise Exception('Resource already exists, %s.' % self.name)

        return save_dataframe(df, self.path, self.fmt, *args, **kwargs)
Example #20
File: merge.py  Project: nnellig/GEMprep
    X = pd.DataFrame()
    y = pd.DataFrame()

    # load each input file into expression matrix
    for infile in args.infiles:
        # load input file
        print("loading \"%s\"" % infile)

        X_i = pd.read_csv(infile, sep="\t", index_col=0)

        # remove extraneous columns
        X_i.drop(columns=["Entrez_Gene_Id"], inplace=True)

        # extract labels
        label = infile.split(".")[0].split("/")[-1]
        y_i = pd.DataFrame({"sample": X_i.columns, "label": label})

        # append input dataframe to output dataframe
        X = pd.merge(X, X_i, left_index=True, right_index=True, how="outer")

        # append input labels to output labels
        y = pd.concat([y, y_i])

    y.set_index("sample", inplace=True)

    # save output expression matrix
    print("saving \"%s\"" % args.outfile)

    utils.save_dataframe(args.outfile, X)
    utils.save_dataframe("%s.labels.txt" % args.outfile.split(".")[0], y)