def __init__(self, workbench, data_path="/home/moritz/DataBases/genomes/RefSeq/", clean=False):
    Database.__init__(self, workbench=workbench, data_path=data_path)
    if not os.path.exists(self.metadata_file) or clean:
        ftp = FTP(ncbi)
        print "Getting metadata from ncbi"
        FNULL = open(os.devnull, 'w')
        ftp.login()
        ftp.cwd('genomes/refseq/bacteria/')
        info = StringIO.StringIO()
        ftp.retrbinary("RETR " + "assembly_summary.txt", info.write)
        info.seek(0)
        self.metadata = DataFrame.from_csv(info, sep="\t", header=1)
        ftp.close()
        self.metadata['assembly_level'] = self.metadata['assembly_level'].apply(lambda x: x.replace(" ", "_"))
        self.metadata = self.metadata.transpose().to_dict()
        DataFrame.from_dict(self.metadata).to_csv(self.metadata_file)
    else:
        print "Loading metadata"
        self.metadata = DataFrame.from_csv(self.metadata_file).to_dict()
    print "Loading genomes"
    for k, v in tqdm(self.metadata.items()):
        genome_path = pjoin(self.data_path, v['assembly_level'].replace(" ", "_"), k)
        genome_file = pjoin(genome_path, k + ".fna")
        self.genomes += [Genome(k, genome_path, ref=genome_file, manual_metadata=v,
                                taxDb=self.taxDb, workbench=self.workbench)]
def testGBCLoss(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testDataLoss = ["deviance", "exponential"]
    kfold = 5
    itog_val = {}
    for i in testDataLoss:
        scores = cross_validation.cross_val_score(
            GradientBoostingClassifier(
                loss=i,
                n_estimators=8,
                learning_rate=1,
                max_depth=3,
                min_samples_split=4,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0,
                subsample=1,
                max_features="auto",
                random_state=3200,
            ),
            train,
            target,
            cv=kfold,
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
def mash_matrix(gs, file, clean=False, proc=4):
    if os.path.exists(file) and not clean:
        pre_mat = DataFrame.from_csv(file)
        done = [g for g in gs if g.name in pre_mat.index]
        to_do = [g for g in gs if not g.name in pre_mat.index]
        if len(to_do) == 0:
            out_mat = pre_mat
        else:
            mat_small = DataFrame.from_dict({g: g.mash_compare_many(done, proc) for g in tqdm(to_do)})
            mat_small.index = Index([m.name for m in mat_small.index])
            mat_small.columns = Index([m.name for m in mat_small.columns])
            mat_small = mat_small.transpose()
            mat_big = DataFrame.from_dict({g: g.mash_compare_many(to_do + done, proc) for g in tqdm(to_do)})
            mat_big.index = Index([m.name for m in mat_big.index])
            mat_big.columns = Index([m.name for m in mat_big.columns])
            out_mat = concat([mat_big,
                              concat([mat_small, pre_mat[mat_small.columns]], axis=0).loc[mat_big.index]],
                             axis=1)
            out_mat = out_mat[out_mat.index]
            out_mat.to_csv(file)
    else:
        out_mat = DataFrame.from_dict({g: g.mash_compare_many(gs, proc) for g in tqdm(gs)})
        out_mat.index = Index([m.name for m in out_mat.index])
        out_mat.columns = Index([m.name for m in out_mat.columns])
        out_mat.to_csv(file)
    return out_mat.apply(lambda x: [ast.literal_eval(xx) if isinstance(xx, basestring) else xx for xx in x])
def get_document_mapping():
    data = json.load(open('/Users/pcravich/repo/personal-agents/search/nlctaglist.json'))
    labels = list(map(lambda x: x['labels'], data))
    df = DataFrame.from_dict(labels[0], orient='index').transpose()
    for i in range(1, len(labels)):
        df = df.append(DataFrame.from_dict(labels[i], orient='index').transpose(), ignore_index=True)
    df['url'] = list(map(lambda x: x['url'], data))
    return df
def testKNNNeingh(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = [i for i in range(1, 21, 2)]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(KNeighborsClassifier(n_neighbors=i), train, target, cv=kfold)
        itog_val[str(i)] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
def testKNNMetric(db, count):
    train = db[:, 0 : count * 5]
    target = db[:, count * 5]
    testData = ["euclidean", "manhattan", "chebyshev", "minkowski"]
    kfold = 5
    itog_val = {}
    for i in testData:
        scores = cross_validation.cross_val_score(
            KNeighborsClassifier(metric=i, n_neighbors=3), train, target, cv=kfold
        )
        itog_val[i] = scores.mean()
    DataFrame.from_dict(data=itog_val, orient="index").plot(kind="barh", legend=False)
    plt.show()
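# Several snippets here use sklearn's long-removed `cross_validation` module.
# A minimal sketch of the same scoring loop against the modern
# `sklearn.model_selection` API (an update for illustration, not the original code):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def test_knn_metric_modern(train, target, kfold=5):
    itog_val = {}
    for metric in ["euclidean", "manhattan", "chebyshev", "minkowski"]:
        # same estimator and fold count as above, only the import path changes
        scores = cross_val_score(KNeighborsClassifier(metric=metric, n_neighbors=3),
                                 train, target, cv=kfold)
        itog_val[metric] = scores.mean()
    return itog_val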
def test_to_dict_index_dtypes(self, into, expected):
    # GH 18580
    # When using to_dict(orient='index') on a dataframe with int
    # and float columns only the int columns were cast to float
    df = DataFrame({'int_col': [1, 2, 3], 'float_col': [1.0, 2.0, 3.0]})
    result = df.to_dict(orient='index', into=into)
    cols = ['int_col', 'float_col']
    result = DataFrame.from_dict(result, orient='index')[cols]
    expected = DataFrame.from_dict(expected, orient='index')[cols]
    tm.assert_frame_equal(result, expected)
def get_boxplot(root):
    """
    get boxplot data
    :param root: Root Server (in alphabet)
    :return:
    """
    container4 = {}
    container6 = {}
    for file in sorted(os.listdir('datasets/{}/'.format(root))):
        timestamp = int(file.split('-')[0])
        filename = 'datasets/{0}/{1}'.format(root, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            container4[timestamp] = res4
            res6 = opened_file['len6']
            container6[timestamp] = res6
        else:
            container4[timestamp] = pd.Series()
            container6[timestamp] = pd.Series()
    df4 = DataFrame.from_dict(container4)
    df6 = DataFrame.from_dict(container6)
    dict4 = defaultdict()
    dict6 = defaultdict()
    #######
    # IPv4
    #######
    for ts in df4:
        dict4[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df4[ts].dropna()]
        }
    result4 = [dict4[i] for i in dict4]
    #######
    # IPv6
    #######
    for ts in df6:
        dict6[ts] = {
            'name': datetime.fromtimestamp(ts).strftime('%Y-%m-%d'),
            'type': 'box',
            'y': [int(i) for i in df6[ts].dropna()]
        }
    result6 = [dict6[i] for i in dict6]
    return jsonify({'ipv4': result4, 'ipv6': result6})
def sf_data(query):
    """
    Get opportunity data using supplied query.
    Get account data.
    Return both as dataframes.
    """
    USER = SALESFORCE['USERNAME']
    PASS = SALESFORCE['PASSWORD']
    TOKEN = SALESFORCE['TOKEN']
    HOST = SALESFORCE['HOST']
    sf = Salesforce(username=USER, password=PASS, security_token=TOKEN)
    bulk = SalesforceBulk(sessionId=sf.session_id, host=HOST)
    print "Creating Opportunity job..."
    job = bulk.create_query_job("Opportunity", contentType='CSV')
    print "Issuing query..."
    batch = bulk.query(job, query)
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)
    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)
    all_rows = list(rows)
    opps = DataFrame.from_dict(all_rows)
    job = bulk.create_query_job("Account", contentType='CSV')
    print "Creating Account job..."
    batch = bulk.query(job, "SELECT Id, Website, Text_For_Donor_Wall__c FROM Account")
    print "Issuing query..."
    while not bulk.is_batch_done(job, batch):
        print "waiting for query to complete..."
        sleep(3)
    bulk.close_job(job)
    rows = bulk.get_batch_result_iter(job, batch, parse_csv=True)
    accts = DataFrame.from_dict(list(rows))
    accts.rename(columns={'Id': 'AccountId'}, inplace=True)
    return opps, accts
def getlinks(region, vendeur):
    # initialize variables
    pageSuivante = True
    if vendeur == "Particulier":
        url = "http://www.leboncoin.fr/voitures/offres/" + region + "/?o=1&q=renault%20captur&it=1&f=p"
    if vendeur == "Pros":
        url = "http://www.leboncoin.fr/voitures/offres/" + region + "/?o=1&q=renault%20captur&it=1&f=c"
    tableau = pd.DataFrame()
    liens = {}
    vente = {}
    reg = {}
    i = 0
    # collect the links and return the result as a DataFrame
    while pageSuivante:
        soup = getSoupFromUrl(url)
        # use a regex to match the link to collect
        if region == "ile_de_france":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=12\_s)$'))
        if region == "aquitaine":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=2\_s)$'))
        if region == "provence_alpes_cote_d_azur":
            balises = soup.find_all(href=re.compile(r'(http:\/\/)(www\.leboncoin\.fr\/)(voitures\/)([\d]{9})\.(htm\?ca\=21\_s)$'))
        for lien in balises:
            liens[i] = lien.get('href')
            vente[i] = vendeur
            reg[i] = region
            i = i + 1
        # check whether there is a next page
        nav = soup.find_all("a", text="Page suivante")
        if nav:
            url = nav[0].get('href')
        else:
            pageSuivante = False
    tableau = DataFrame.from_dict(liens, 'index')
    tableau.columns = ['Lien']
    Vendeurs = DataFrame.from_dict(vente, 'index')
    Vendeurs.columns = ['Vendeur']
    Regions = DataFrame.from_dict(reg, 'index')
    Regions.columns = ['Region']
    tableau = pd.merge(tableau, Vendeurs, left_index=True, right_index=True)
    tableau = pd.merge(tableau, Regions, left_index=True, right_index=True)
    return tableau
def get_as_path_avg_length(root):
    # read this: http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    directory = '{}{}/'.format(csv_dir, root)
    result4 = {}
    result6 = {}
    for file in sorted(os.listdir(directory)):
        timestamp = int(file.split('-')[0])
        filename = '{}{}'.format(directory, file)
        opened_file = DataFrame.from_csv(filename, sep='\t')
        if not opened_file.empty:
            res4 = opened_file['len4']
            res6 = opened_file['len6']
            result4[timestamp] = res4
            result6[timestamp] = res6
        else:
            result4[timestamp] = pd.Series()
            result6[timestamp] = pd.Series()
    plot_result4 = DataFrame.from_dict(result4)
    plot_result6 = DataFrame.from_dict(result6)
    ################
    # Plot
    ################
    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
    plot4 = plot_result4.plot.box(figsize=(14, 5), ax=axes[0], ylim=(1.5, 9.5))
    plot6 = plot_result6.plot.box(figsize=(14, 5), ax=axes[1], ylim=(1.5, 9.5))
    n = 6
    # ticks = plot4.xaxis.get_ticklocs()
    # ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y') for l in plot4.xaxis.get_ticklabels()]
    # plot4.xaxis.set_ticks(ticks[::n])
    # plot4.xaxis.set_ticklabels(ticklabels[::n], rotation=25)
    axes[0].text(3, 8, 'IPv4', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[1].text(3, 8, 'IPv6', fontsize=20, bbox={'facecolor': 'white', 'pad': 5})
    axes[0].grid(True)
    axes[1].grid(True)
    ticks = axes[1].xaxis.get_ticklocs()
    ticklabels = [datetime.fromtimestamp(int(l.get_text())).strftime('%d/%m/%y')
                  for l in axes[1].xaxis.get_ticklabels()]
    axes[1].xaxis.set_ticks(ticks[::n])
    axes[1].xaxis.set_ticklabels(ticklabels[::n], rotation=25)
    plt.tight_layout()
    plt.savefig('figs/eps/path_avg_dist_{}.eps'.format(root), format='eps', dpi=1000)
    plt.savefig('figs/png/path_avg_dist_{}.png'.format(root))
    print('finish: path average {}-Root Server'.format(root))
def _as_dataframe(self, gene_obj, df_index=False):
    """ converts gene object to DataFrame (pandas) """
    if not df_avail:
        print("Error: pandas module must be installed for as_dataframe option.")
        return
    if 'hits' in gene_obj:
        df = DataFrame.from_dict(gene_obj['hits'])
    else:
        df = DataFrame.from_dict(gene_obj)
    if df_index:
        df = df.set_index('_id')
    return df
def gen_data(size, seed):
    data = {
        'a': generate_uniform_float_column(size, 0., 1., seed + 1),
        'b': generate_uniform_float_column(size, 0., 1., seed + 2),
        'c': generate_uniform_float_column(size, 0., 1., seed + 3)
    }
    return DataFrame.from_dict(data)
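# `generate_uniform_float_column` is not shown in this snippet; a minimal
# numpy-based stand-in (an assumption, not the original helper) so gen_data
# runs end to end:
import numpy as np

def generate_uniform_float_column(size, low, high, seed):
    # one reproducible column of uniform floats in [low, high)
    return np.random.default_rng(seed).uniform(low, high, size)

# e.g. gen_data(5, seed=0) then yields a 5x3 DataFrame with columns a, b, c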
def collector2table(collector):
    """
    collector2table return a station table as a DataFrame.
    columns are station, sensor, lon, lat, and the index is the station number.

    This is a substitute for `sos_request`.
    """
    # This accepts only 1-day request, but since we only want the
    # stations available we try again with end=start.
    c = copy.copy(collector)
    try:
        response = c.raw(responseFormat="text/csv")
    except ExceptionReport:
        response = c.filter(end=c.start_time).raw(responseFormat="text/csv")
    df = read_csv(BytesIO(response.encode('utf-8')), parse_dates=True)
    columns = {'sensor_id': 'sensor',
               'station_id': 'station',
               'latitude (degree)': 'lat',
               'longitude (degree)': 'lon'}
    df.rename(columns=columns, inplace=True)
    df['sensor'] = [s.split(':')[-1] for s in df['sensor']]
    df['station'] = [s.split(':')[-1] for s in df['station']]
    df = df[['station', 'sensor', 'lon', 'lat']]
    g = df.groupby('station')
    df = dict()
    for station in g.groups.keys():
        df.update({station: g.get_group(station).iloc[0]})
    return DataFrame.from_dict(df).T
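# The groupby loop above keeps the first row of each station group. A toy
# sketch of the same idiom in plain pandas (roughly equivalent when no values
# are missing, since GroupBy.first skips NaNs column-wise):
import pandas as pd

toy = pd.DataFrame({'station': ['8771341', '8771341', '8775870'],
                    'sensor': ['B1', 'E1', 'B1'],
                    'lon': [-94.7, -94.7, -97.2],
                    'lat': [29.4, 29.4, 26.1]})
first_per_station = toy.groupby('station').first()  # one row per station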
def apply_skill(dfs, function, remove_mean=True, filter_tides=False):
    skills = dict()
    for station, df in dfs.iteritems():
        if filter_tides:
            df = df.apply(low_pass)
        skill = dict()
        obs = df.pop('OBS_DATA')
        if obs.isnull().all():
            # No observations.
            skills.update({station: np.NaN})
            continue
        for model, y in df.iteritems():
            # No models.
            if y.isnull().all():
                skills.update({station: np.NaN})
                continue
            mask = both_valid(obs, y)
            x, y = obs[mask], y[mask]
            if remove_mean:
                x, y = x - x.mean(), y - y.mean()
            if x.size:
                ret = function(x, y)
            else:
                ret = np.NaN
            skill.update({model: ret})
        skills.update({station: skill})
    return DataFrame.from_dict(skills)
def test_csv_read_files():
    df = DataFrame.from_dict({0: ['01', 2], 1: ['x', 12]}, orient='index')
    df.columns = ['a', 'b']
    df2 = csv_read_files(tempfilename, index_col=0)
    assert_frame_equal(df, df2)
def compare_assemblies(assemblies, chunk_size=2000, identity_threshold=0.40):
    """
    compares a set of assemblies:
    assemblies is a dictionary with names of the assemblies as keys and
    fasta-files of the assemblies as values
    """
    similarities = {}
    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = ["makeblastdb", "-in", subject, "-dbtype", "nucl", "-out", subject]
        # open the null device for writing so call() can redirect stdout into it
        with open(os.devnull, "w") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)
    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query, subject, identity_threshold, blast_db=False)
            # print scaff_name, "vs", subject_name
            # float division: Python 2 would otherwise truncate the fraction to 0
            similarities[scaff_name][subject_name] = len(nics.keys()) / float(nb_chunks)
        os.remove(chopped_up_query)
    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin", subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)
    similars = DataFrame.from_dict(similarities)
    return similars
def dataFrame(self):
    from pandas import DataFrame
    items = self.execute()
    if len(items) == 0:
        return DataFrame()
    return DataFrame.from_dict(items)
def Get_Test_Data_YQL():
    # Will have to change convert_objects to specific numeric calls in the future
    result = load(urlopen("https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.historicaldata%20where%20symbol%20%3D%20%22YHOO%22%20and%20startDate%20%3D%20%222010-01-11%22%20and%20endDate%20%3D%20%222010-05-10%22&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback="))
    x = DataFrame.from_dict(result['query']['results']['quote'])
    x["Date"] = to_datetime(x["Date"])
    x = x.convert_objects(convert_numeric=True)
    return x
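# `convert_objects` has since been removed from pandas; assuming every column
# except "Date" should be numeric (an assumption about this payload, not a
# documented contract), a modern equivalent of that step could be:
from pandas import to_numeric

num_cols = [c for c in x.columns if c != "Date"]
x[num_cols] = x[num_cols].apply(to_numeric, errors="coerce")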
def formatChecker(filename):
    reader = csv.DictReader(open(filename))
    result = {}
    key = -1
    for row in reader:
        key += 1
        # key = row.pop('url')
        if key in result:
            pass
        if row['longitude'] == '' or row['latitude'] == '':
            continue
        row['longitude'] = float(row['longitude'])
        row['latitude'] = float(row['latitude'])
        result[key] = row
    for k, v in result.iteritems():
        if 'location' not in v:
            raise NameError('Missing ["location"] header')
        if 'day' not in v or 'month' not in v or 'year' not in v:
            raise NameError('Missing ["day"], ["month"], or ["year"] header')
        tempDate = validDateToJulianDate(v['month'] + '-' + v['day'] + '-' + v['year'])
        result[k].update({'concatDate': tempDate[0]})
        result[k].update({'julianDay': tempDate[1]})
        result[k].update({'julianDate': tempDate[2]})
    df = DataFrame.from_dict(result, orient='index', dtype=None)
    if 'latitude' not in df.columns and 'longitude' not in df.columns:
        raise NameError('Missing ["latitude"] or ["longitude"] header')
    # df = df.convert_objects(convert_numeric=True).dtypes
    # df[['latitude', 'longitude']] = df[['latitude', 'longitude']].astype(float)
    # except:
    #     sys.exit('Date field contains non-digits.')
    return df
def _project(dataframe, project_q):
    if not project_q:
        return dataframe
    assert_list("project", project_q)
    if project_q == [["count"]]:
        # Special case for count only, ~equal to SQL count(*)
        return DataFrame.from_dict({"count": [len(dataframe)]})
    aggregate_fns, alias_expressions = classify_expressions(project_q)
    if aggregate_fns and alias_expressions:
        raise_malformed("Cannot mix aliasing and aggregation functions", project_q)
    if isinstance(dataframe, DataFrameGroupBy):
        dataframe = _aggregate(dataframe, project_q, aggregate_fns)
    elif aggregate_fns:
        return _aggregate_without_group_by(dataframe, project_q, aggregate_fns)
    elif alias_expressions:
        dataframe = _alias(dataframe, alias_expressions)
    else:
        # Nothing to do here
        pass
    columns = [e if type(e) is not list else e[1] for e in project_q]
    try:
        return dataframe[columns]
    except KeyError:
        missing_columns = set(columns) - set(dataframe.columns.values)
        raise_malformed("Selected columns not in table", list(missing_columns))
def make_cluster_bmft(self):
    cluster_table = DataFrame.from_dict(
        {i: {k: len(v) for k, v in c.to_dict()['genes'].iteritems()} for i, c in enumerate(self)},
        orient='index')
    cluster_table = cluster_table.apply(nan_to_num)
    cluster_table['annotations'] = [c.annotation for c in self]
    cluster_table['qual_annot'] = [c.annot_fraction for c in self]
    cluster_table['genes'] = [";".join(c.genes) for c in self]
    return cluster_table
def cross_validation_test():
    data = get_analyze_data()
    target = data["hand"]
    # drop the label column as well, otherwise the target leaks into the features
    train = data.drop(["id", "hand"], axis=1)
    kfold = 5
    cross_val_test = {}
    print "Cross validation test..."
    model_rfc = RandomForestClassifier(n_estimators=100)
    model_knc = KNeighborsClassifier(n_neighbors=15)
    model_lr = LogisticRegression(penalty='l1', tol=0.01)
    scores = cross_validation.cross_val_score(model_rfc, train, target, cv=kfold)
    cross_val_test['RFC'] = scores.mean()
    scores = cross_validation.cross_val_score(model_knc, train, target, cv=kfold)
    cross_val_test['KNC'] = scores.mean()
    scores = cross_validation.cross_val_score(model_lr, train, target, cv=kfold)
    cross_val_test['LR'] = scores.mean()
    f = plt.figure(figsize=(8, 6))
    p = DataFrame.from_dict(data=cross_val_test, orient='index').plot(kind='barh', legend=False, ax=f.gca())
    f.savefig('./%s/cross_validation_test.png' % dirs[1])
    for k, v in cross_val_test.iteritems():
        print "%s : %s" % (k, str(v))
def cross_validation_test():
    data = get_train_data()
    target = data.Cover_Type
    train = data.drop(['Cover_Type'], axis=1)
    kfold = 10
    cross_val_final = {}
    print 'Cross validation test...'
    model_rfc = RandomForestClassifier(n_estimators=1024, criterion='entropy', n_jobs=-1)
    model_knc = KNeighborsClassifier(n_neighbors=128)
    model_lr = LogisticRegression(penalty='l1', C=1e5)
    scores = cross_validation.cross_val_score(model_rfc, train, target, cv=kfold)
    cross_val_final['RFC'] = scores.mean()
    print 'RFC: ', scores.mean()
    scores = cross_validation.cross_val_score(model_knc, train, target, cv=kfold)
    cross_val_final['KNC'] = scores.mean()
    print 'KNC: ', scores.mean()
    scores = cross_validation.cross_val_score(model_lr, train, target, cv=kfold)
    cross_val_final['LR'] = scores.mean()
    print 'LR: ', scores.mean()
    f = plt.figure(figsize=(8, 6))
    p = DataFrame.from_dict(data=cross_val_final, orient='index').plot(kind='barh', legend=False, ax=f.gca())
    f.savefig('./test_plot/cross_validation_rfc_1024.png')
def read_umi_tools(filename: PathLike) -> AnnData:
    """Read a gzipped condensed count matrix from umi_tools.

    Parameters
    ----------
    filename
        File name to read from.
    """
    # import pandas for conversion of a dict of dicts into a matrix
    # import gzip to read a gzipped file :-)
    import gzip
    from pandas import DataFrame

    dod = {}  # this will contain basically everything
    fh = gzip.open(fspath(filename))
    header = fh.readline()  # read the first line
    for line in fh:
        t = line.decode('ascii').split('\t')  # gzip read bytes, hence the decoding
        try:
            dod[t[1]].update({t[0]: int(t[2])})
        except KeyError:
            dod[t[1]] = {t[0]: int(t[2])}
    df = DataFrame.from_dict(dod, orient='index')  # build the matrix
    df.fillna(value=0., inplace=True)  # many NaN, replace with zeros
    return AnnData(np.array(df), {'obs_names': df.index}, {'var_names': df.columns})
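# A toy illustration (assumed values, not from the source) of the dict-of-dicts
# step above: outer keys (cells) become rows, inner keys (genes) become columns,
# and missing counts turn into NaN before fillna(0.) zeroes them.
from pandas import DataFrame

dod = {'cell1': {'geneA': 3, 'geneB': 1}, 'cell2': {'geneA': 2}}
counts = DataFrame.from_dict(dod, orient='index').fillna(0.)
# counts.loc['cell2', 'geneB'] == 0.0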
def _collect_requests(query, request_limit):
    """Collects the string-casted results of a query.

    Args:
        query: (aflow.control.Query) A query with unprocessed requests.
        request_limit: (int) Maximum number of requests to submit.

    Returns:
        (DataFrame) Results collected from the query.
    """
    # requests the first page of results to determine number of pages
    query._request(1, query.k)
    page_limit = (query._N // query.k) + 1
    if request_limit and (page_limit > request_limit):
        page_limit = request_limit
    # requests the remaining pages
    for page in range(2, page_limit + 1):
        query._request(page, query.k)
    # collects request responses
    records = {}
    for page in range(1, page_limit + 1):
        records.update(query.responses[page])
    return DataFrame.from_dict(data=records, orient='index')
def creat_table_base(records):
    # saved calculated variable names and descriptions in json format
    # currently only includes 16 most used variables
    calculated_vars = {"_iitax": "Federal income tax liability",
                       "_fica": "FICA taxes (ee+er) for OASDI+HI",
                       "c00100": "Federal AGI",
                       "c02500": "OASDI benefits in AGI",
                       "c04600": "Post-phase-out personal exemption",
                       "_prexmp": "Pre-phase-out personal exemption",
                       "c21040": "Itemized deduction that is phased out",
                       "c04470": "Post-phase-out itemized deduction",
                       "c04800": "Federal regular taxable income",
                       "c05200": "Regular tax on taxable income",
                       "c07220": "Child tax credit (adjusted)",
                       "c11070": "Extra child tax credit (refunded)",
                       "c07180": "Child care credit",
                       "_eitc": "Federal EITC",
                       "c62100_everyone": "federal AMT taxable income",
                       "c09600": "federal AMT liability"}
    cal = DataFrame.from_dict(calculated_vars, orient='index')
    cal.columns = ['description']
    puf_ecodes_info = pd.read_csv(EVAR_PATH)
    # Use all variable list minus unused variable list
    # to get used variable list
    VALID_READ_VARS = records.VALID_READ_VARS
    CODES_IMP = set(['AGIR1', 'DSI', 'EFI', 'EIC', 'ELECT', 'FDED', 'FLPDYR',
                     'FLPDMO', 'f2441', 'f3800', 'f6251', 'f8582', 'f8606',
                     'f8829', 'f8910', 'f8936', 'n20', 'n24', 'n25', 'n30',
                     'PREP', 'SCHB', 'SCHCF', 'SCHE', 'TFORM', 'IE', 'TXST',
                     'XFPT', 'XFST', 'XOCAH', 'XOCAWH', 'XOODEP', 'XOPAR',
                     'XTOT', 'MARS', 'MIDR', 'RECID', 'gender', 'wage_head',
                     'wage_spouse', 'earnsplit', 'age', 'agedp1', 'agedp2',
                     'agedp3', 'AGERANGE', 's006', 's008', 's009', 'WSAMP',
                     'TXRT', 'filer', 'matched_weight', 'e00200p', 'e00200s',
                     'e00900p', 'e00900s', 'e02100p', 'e02100s'])
    UNUSED_READ_VARS = records.UNUSED_READ_VARS
    USED_VARS = list(VALID_READ_VARS - CODES_IMP - UNUSED_READ_VARS)
    # read variable description from e_variable_info.csv
    table = {}
    for var_name in USED_VARS:
        # use variable names as keys of dictionary
        f = (puf_ecodes_info.Input_Name == var_name)
        description = puf_ecodes_info.Definition_2014[f].values[0]
        table[var_name] = description
    table = pd.DataFrame.from_dict(table, orient='index')
    table.columns = ["description"]
    table = table.append(cal)
    return table
def pct_students_first_choice(to_compare):
    results = []
    for i in to_compare:
        results.append(float(sum([1 if len(s.assigned) > 0 and (s.preference[0] in s.assigned) else 0
                                  for s in si_students[i]])) / float(nstudents))
    results = map(lambda x: x * 100, results)
    df = DataFrame.from_dict({'mechanism': sublist(mechanisms, to_compare), 'pct_students': results})
    return (results, bar_graph(df, "% Students Matched With Top Choice\n"))
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
    # GH18914
    df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
                                          ('Nation', ['AB', 'CD'])]))
    df = get_dummies(df, columns=['Nation'], sparse=sparse)
    df2 = df.reindex(columns=['GDP'])
    tm.assert_frame_equal(df[['GDP']], df2)
def calculate(self):
    self.data = None
    cursor = connection.cursor()
    cursor.execute(self.get_query(),
                   dict(year=self.parameters.registry_year,
                        period=self.parameters.registry_period))
    self.data = DataFrame.from_dict(self.prepare_data(dictfetchall(cursor)), orient='columns')
    cursor.close()
def df(country, f=None, **kwargs):
    date = kwargs['date'] if 'date' in kwargs else None
    response = search(country, f=f, date=date)
    if date is None:
        obj = {}
        obj[30] = {}
        for ur in response.aggs.ur30.buckets:
            obj[30][round(ur.key, 2)] = ur.doc_count
        obj[60] = {}
        for ur in response.aggs.ur60.buckets:
            obj[60][round(ur.key, 2)] = ur.doc_count
        obj[90] = {}
        for ur in response.aggs.ur90.buckets:
            obj[90][round(ur.key, 2)] = ur.doc_count
        obj[180] = {}
        for ur in response.aggs.ur180.buckets:
            obj[180][round(ur.key, 2)] = ur.doc_count
        obj[360] = {}
        for ur in response.aggs.ur360.buckets:
            obj[360][round(ur.key, 2)] = ur.doc_count
    else:
        obj = {}
        obj[30] = {}
        for ur in response.aggs.ur30.buckets:
            obj[30][round(ur.key, 2)] = ur.doc_count
    df = DataFrame.from_dict(obj).T
    if df.empty:
        return df
    df.index.name = 'period'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)
    df = df.reset_index()
    return df
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    cases_and_deaths = {
        "date": "date",
        "areaCode": "areaCode",
        "newCasesByPublishDate": "newCasesByPublishDate",
        "cumCasesByPublishDate": "cumCasesByPublishDate",
        "newDeaths28DaysByPublishDate": "newDeaths28DaysByPublishDate",
        "cumDeaths28DaysByPublishDate": "cumDeaths28DaysByPublishDate",
        "cumPillarOneTestsByPublishDate": "cumPillarOneTestsByPublishDate",
    }
    api = Cov19API(filters=["areaType=overview"], structure=cases_and_deaths)
    data_json = api.get_json()
    data = DataFrame.from_dict(data_json["data"])
    data = table_rename(
        data,
        {
            "areaCode": "country_code",
            "newCasesByPublishDate": "new_confirmed",
            "cumCasesByPublishDate": "total_confirmed",
            "newDeaths28DaysByPublishDate": "new_deceased",
            # the structure above only requests the publish-date field, so the
            # rename key must match it or total_deceased is silently dropped
            "cumDeaths28DaysByPublishDate": "total_deceased",
            "cumPillarOneTestsByPublishDate": "total_tested",
            "date": "date",
        },
        drop=True,
    )
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    # Make sure all records have country code and no subregion code
    data["key"] = "GB"
    data["country_code"] = "GB"
    data["subregion2_code"] = None
    return data
def df(country, f=None, **kwargs):
    today = dt.datetime.today().strftime('%Y-%m-%d')
    date = kwargs['date'] if 'date' in kwargs else today
    response = search(country, f=f, date=date)
    obj = {}
    for model in response.aggs.model.buckets:
        obj[model.key] = {
            'taza 30': round(model.ur30.value, 4),
            'total': model.doc_count
        }
    df = DataFrame.from_dict(obj).T
    if df.empty:
        return df
    df.index.name = 'modelo'
    df = df[['taza 30', 'total']]
    df = df.reset_index()
    return df
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Regions have case data on a "by specimen date" basis.
    # This means they don't add up to the counts for nation which are on a
    # "by publish date" basis.
    cases_and_deaths = {
        "date": "date",
        "areaCode": "areaCode",
        "newCasesBySpecimenDate": "newCasesBySpecimenDate",
        "cumCasesBySpecimenDate": "cumCasesBySpecimenDate",
        "newDeaths28DaysByDeathDate": "newDeaths28DaysByDeathDate",
        "cumDeaths28DaysByDeathDate": "cumDeaths28DaysByDeathDate",
    }
    api = Cov19API(filters=["areaType=region"], structure=cases_and_deaths)
    regions_json = api.get_json()
    data = DataFrame.from_dict(regions_json["data"])
    data = table_rename(
        data,
        {
            "areaCode": "match_string",
            "newCasesBySpecimenDate": "new_confirmed",
            "cumCasesBySpecimenDate": "total_confirmed",
            "newDeaths28DaysByDeathDate": "new_deceased",
            "cumDeaths28DaysByDeathDate": "total_deceased",
            "date": "date",
        },
        drop=True,
    )
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%Y-%m-%d"))
    _fix_bad_total_deceased(data)
    # Make sure all records have country code and no subregion code
    data["country_code"] = "GB"
    data["subregion2_code"] = None
    return data
def _post_process(self) -> DataFrame:
    columns = self.columns
    results = defaultdict(list)
    for row in self.results:
        for column in row:
            column_value = column.value
            if column_value.success is True:
                column_data = column_value.data
                if isinstance(column_data, Number):
                    format_: ColumnFormat = columns[column.column_index].format_
                    results[column.name].append(round(column_data, format_.precision))
                else:
                    results[column.name].append(column_data)
            else:
                results[column.name].append(np.NaN)
    df = DataFrame.from_dict(results)
    df = self.__handle_filters(df)
    df = self.__handle_sorts(df)
    return df
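# A toy illustration (values assumed) of the accumulation pattern above: a
# defaultdict(list) of equal-length column lists feeds DataFrame.from_dict,
# which treats each key as a column.
from collections import defaultdict
from pandas import DataFrame

results = defaultdict(list)
for name, value in [('price', 1.25), ('qty', 3), ('price', 2.5), ('qty', 4)]:
    results[name].append(value)
frame = DataFrame.from_dict(results)  # columns: price, qty; two rows each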
def calcsupp(df, filePath):
    '''
    Compute each supplier's vlt cv, fill rate (actual_origin_rate) and relative fill rate.
    :param df:
    :param filePath:
    :return:
    '''
    # df = data; filePath = analysis_path
    grouped = df.groupby('supp_name')
    summary = {}
    for supp_name, group in grouped:
        # item_sku_id = grouped.groups.keys()[1]
        # group = grouped.get_group(item_sku_id)
        sample = group[group.pur_bill_id.isnull() == False]
        for i in sample.index:
            # i = sample.index[0]
            vlt = sample.vlt[i]
            # supp_brevity_cd: supplier short code
            supp_brevity_cd = sample.supp_brevity_cd[i]
            if np.isnan(vlt):
                # fill missing vlt with the mean; very few rows have a
                # purchase order but an empty vlt
                vlt = np.nanmean(df.vlt)
            vlt = int(vlt)
            if vlt > 28:
                # cap vlt at 28 days
                vlt = 28
            # this comes out empty for some rows; the order is probably invalid?
            actual_plan_rate = sample.actual_pur_qtty[i] / float(sample.plan_pur_qtty[i])
            # actual_origin_rate = actual / original
            actual_origin_rate = sample.actual_pur_qtty[i] / float(sample.originalnum[i])
            summary[i] = {'supp_name': supp_name, 'day_string': sample.day_string[i],
                          'band': sample.org_nation_sale_num_band[i],
                          'item_sku_id': sample.item_sku_id[i], 'vlt': vlt,
                          'supp_brevity_cd': supp_brevity_cd,
                          'actual_pur_qtty': sample.actual_pur_qtty[i],
                          'plan_pur_qtty': sample.plan_pur_qtty[i],
                          'originalnum': sample.originalnum[i],
                          'pur_bill_id': sample.pur_bill_id[i],
                          'actual_plan_rate': actual_plan_rate,
                          'actual_origin_rate': actual_origin_rate}
    z_value_frame = DataFrame.from_dict(summary).T
    z_value_frame.to_csv(filePath + '\\supp_value_frame.csv', index=False)
    return z_value_frame
def process(self, num_cores=10):
    to_dl = []
    for k, v in tqdm(self.metadata.items()):
        genome_path = pjoin(self.data_path, v['assembly_level'].replace(" ", "_"), k)
        genome_file = pjoin(genome_path, k + ".fna")
        if not os.path.exists(genome_path):
            os.makedirs(genome_path)
        if not os.path.exists(genome_file) or (not os.path.exists(genome_file.replace(".fna", ".faa"))):
            to_dl += [(v['ftp_path'], genome_path, genome_file)]
    dlstuff = Parallel(n_jobs=num_cores)(delayed(download)(i) for i in tqdm(to_dl))
    Database.process(self)
    self.taxos = DataFrame.from_dict({
        g.name: g.get_taxo(self.taxDb)
        for g in self.genomes
        if g.metadata['assembly_level'] == "Complete_Genome"
    }).transpose().to_csv(pjoin(self.metadata_path, "complete_genomes_pretty_taxo.csv"))
def addToDB(cnx, info):
    # Adds all songs, attributes, and artists to the appropriate tables in the database
    stdout.write("Adding to database...\n")
    stdout.flush()
    # Convert the list data structure from getLikedSongs to a dataframe for ease of use
    frame = DataFrame.from_dict(info)
    count = 0  # Track progress
    for index, song in frame.iterrows():
        insertArtist(song["artist"], cnx)  # Put artist in the artists table
        songTuple = (song["id"], song["track"])  # Info for song table
        insertSong(songTuple, song["artist"], cnx)  # Put song in songs table
        insertSHA(song["artist"], song["id"], cnx)  # Link the artist and song in database
        # Convert all 0's to major and 1's to minor to match database
        if song['mode'] == 0:
            song['mode'] = 'major'
        elif song['mode'] == 1:
            song['mode'] = 'minor'
        # Get all attributes for the song in one place
        attributeTuple = (round(song["acousticness"], 9), round(song["danceability"], 9),
                          song["duration_ms"], round(song["energy"], 9),
                          song["instrumentalness"], song["key"],
                          round(song["liveness"], 9), round(song["loudness"], 9),
                          song["mode"], round(song["speechiness"], 9),
                          round(song["tempo"], 4), round(song["valence"], 9))
        insertAttributes(attributeTuple, song["id"], cnx)  # Put song attributes in the tracks table
        # Keep track of progress
        count += 1
        if count % 50 == 0:
            stdout.write(str(count) + '\n')
            stdout.flush()
def test_int64_overflow_moar(self):
    # GH9096
    values = range(55109)
    data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values})
    grouped = data.groupby(["a", "b", "c", "d"])
    assert len(grouped) == len(values)

    arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
    i = np.random.choice(len(arr), len(arr) * 4)
    arr = np.vstack((arr, arr[i]))  # add some duplicate rows
    i = np.random.permutation(len(arr))
    arr = arr[i]  # shuffle rows
    df = DataFrame(arr, columns=list("abcde"))
    df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10
    gr = df.groupby(list("abcde"))

    # verify this is testing what it is supposed to test!
    assert is_int64_overflow_possible(gr.grouper.shape)

    # manually compute groupings
    jim, joe = defaultdict(list), defaultdict(list)
    for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]):
        jim[key].append(a)
        joe[key].append(b)
    assert len(gr) == len(jim)
    mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde"))

    def aggr(func):
        f = lambda a: np.fromiter(map(func, a), dtype="f8")
        arr = np.vstack((f(jim.values()), f(joe.values()))).T
        res = DataFrame(arr, columns=["jim", "joe"], index=mi)
        return res.sort_index()

    tm.assert_frame_equal(gr.mean(), aggr(np.mean))
    tm.assert_frame_equal(gr.median(), aggr(np.median))
def test_get_ingress_results(serializer, options, tx_port, rx_port, api):
    """UDP Flow test traffic configuration
    """
    udp_endpoint = PortTxRx(tx_port_name=tx_port.name, rx_port_names=[rx_port.name])
    udp_header = Udp(src_port=Pattern(Counter(start="12001", step="2", count=100),
                                      ingress_result_name='UDP SRC PORT'),
                     dst_port=Pattern("20"))
    udp_flow = Flow(name='UDP Flow',
                    tx_rx=TxRx(udp_endpoint),
                    packet=[
                        Header(Ethernet()),
                        Header(Vlan()),
                        Header(Ipv4()),
                        Header(udp_header)
                    ],
                    size=Size(128),
                    rate=Rate(unit='pps', value=1000),
                    duration=Duration(FixedPackets(packets=10000)))
    config = Config(ports=[tx_port, rx_port], flows=[udp_flow], options=options)
    state = State(ConfigState(config=config, state='set'))
    print(serializer.json(state))
    api.set_state(state)
    state = State(FlowTransmitState(state='start'))
    api.set_state(state)

    from pandas import DataFrame
    request = FlowRequest(ingress_result_names=['UDP SRC PORT'])
    while True:
        results = api.get_flow_results(request)
        df = DataFrame.from_dict(results)
        print(df)
        if df.frames_tx.sum() >= 10000 and df.frames_tx_rate.sum() == 0:
            break
def get_happinesses_by_method(pop_iterator, fast=False):
    num_sim, current_sim = 1500, 0
    utils_by_scf = Dict()
    dataframe_dict = Dict()
    test_num_candidates = [3, 4, 6, 9, 13, 18, 24]
    # modify each sim to run in parallel
    while current_sim < num_sim:
        print(current_sim)
        # simulate for various numbers of candidates
        for n_candidates in test_num_candidates:
            n_voters = n_candidates * 750
            for pop, param in pop_iterator(n_voters, n_candidates):
                n_pref_by_rk, pref_ij = ls.fast_gen_pref_summ(pop.preferences_rk)
                weights = ls.get_weights_from_counts(n_pref_by_rk)
                utils = social_util_by_cand(weights)
                winners_by_scf = simulate_all_elections(pop, fast=fast,
                                                        n_pref_by_rank=n_pref_by_rk,
                                                        pref_i_to_j=pref_ij)
                utils_by_scf[param][n_candidates][current_sim] = \
                    {k: utils[v] for k, v in winners_by_scf.items()}
        current_sim += 1
    save_directory = 'Population_type_sim=' + pop_iterator.__name__
    archive_old_sims(save_directory, 'Previous_sims_all_methods')
    # utils_by_scf[pop_param][n_candidates][sim_number][scf]
    # now make dict of DataFrames by parameters, n_candidates
    os.mkdir(save_directory)
    for param, v_upper in utils_by_scf.items():
        for n_cand, scf_by_sim_num in v_upper.items():
            dataframe_dict[param][n_cand] = DataFrame.from_dict(scf_by_sim_num, orient='index')
            dataframe_dict[param][n_cand].boxplot(rot=90)  # labels? by axis?
            plt.tight_layout()
            plt.savefig(save_directory + '/plot_p=' + str(param) + '_n_cand=' + str(n_cand) + '.png')
            plt.close()
def getData(forDays, dropExtraCols=True):
    # subtract forDays, in milliseconds, from the current epoch time
    now = int(time.time()) * 1000
    timeToSubtract = int(forDays * 24 * 60 * 60 * 1000)
    effective_time = now - timeToSubtract
    print("now:", now)
    print("timeToSubtract:", timeToSubtract)
    print("effective_time:", effective_time)
    mongoClient = mongo.MongoClient(os.environ['mongodbhost'], 27017)
    db = mongoClient.admin
    sensorDataCollection = db.sensors
    sensorData = sensorDataCollection.find({
        "source": "composite",
        "unixtime": {
            "$gte": effective_time
        }
    }).sort([("timestamp", mongo.DESCENDING)])
    # temporary to hold data
    sensorDataArray = []
    # populate temp
    for dataRow in sensorData:
        if len(dataRow) == 17:
            sensorDataArray.append(dataRow)
        else:
            print("bad data", dataRow)
    # create df
    df = DataFrame.from_dict(sensorDataArray)
    # drop unnecessary cols
    if dropExtraCols:
        df = df.drop(['_id', 'timestamp', 'unixtime', 'source'], axis=1)
    return df
def generateChart(algo, traderData):
    try:
        chart = DataFrame.from_dict(traderData['candles'])
        stock = StockDataFrame.retype(chart)
        value = CCI(chart['close'], chart['high'], chart['low'], 4, 0.0109)[-1]
        if (time.clock() - traderData['startTime'] > traderData['time']):
            traderData['buyingEnabled'] = False
        if algo == "cci":
            print(traderData['tradingSymbol'], value,
                  traderData['candles'][-1]['close'], traderData['candles'][-1]['date'])
            cci_value = value
            if cci_value < -100 and traderData['bought'] == False and traderData['updating'] == False:
                traderData['waiting'] = True
            if cci_value > -100 and traderData['bought'] == False and traderData['waiting'] and traderData['buyingEnabled']:
                buy(traderData)
                traderData['waiting'] = False
            elif cci_value > 100 and traderData['bought']:
                sell(traderData)
        if algo == "macd":
            macd_value = stock['macdh'][-1]
            print(macd_value)
            if macd_value > 0 and traderData['bought'] == False:
                buy(traderData)
            elif macd_value < 0 and traderData['bought']:
                sell(traderData)
        # get cci value for current time
        # if greater than 100 and not bought in yet, buy
        # if less than 100 (NOT -100) and bought in, sell
    except:
        print("Unexpected error, trying again")
        time.sleep(.3)
        generateChart(algo, traderData)
def df_weighted(country, start=None, end=None, f=None, interval='month'):
    response = search_weighted(country, start=start, end=end, f=f, interval=interval)
    dates = [x.key_as_string for x in response.aggs.stats.date_filter.kingo.dates.buckets]
    obj = {x: {0} for x in dates}
    for date in response.aggs.stats.date_filter.kingo.dates.buckets:
        obj[date.key_as_string] = {}
        for model_version in date.model_version.buckets:
            obj[date.key_as_string][model_version.key] = model_version.doc_count
    df = DataFrame.from_dict(obj, orient='index', dtype='int64')
    if df.empty:
        return df
    df.index.name = 'date'
    df = df.reindex(df.index.astype('datetime64')).sort_index()
    df = df.fillna(0).astype('int64')
    bucket_len = [x.days for x in diff(df.index.tolist())]
    if end is not None:
        bucket_len.append((date_dt(end) - df.index[-1].date()).days)
    else:
        bucket_len.append((local_date_dt(country) - df.index[-1].date()).days)
    df = df.div(bucket_len, axis='index')
    df['total'] = df.sum(axis=1)
    return df.astype('int64')
def get_report(session, args):
    # Arguments
    start, end = parse_date_range(args['report_range'])
    exists = report_exists(session, start, end, 'sla_report')
    if exists:
        # No work to do
        success = True
    else:
        # Celery worker
        task_results = report_task.delay(start, end)
        task_results.wait()
        success = task_results.get(timeout=1)
    if success:
        cached_results = get_cached_report(session, start, end, 'sla_report')
        frame = DataFrame.from_dict(cached_results.report)
        frame.name = '{rpt} {frm} {to}'.format(rpt=cached_results.name, frm=start, to=end)
        # Convert pyexcel table into dataframe
        index = Series(['{ext} {name}'.format(ext=client_ext, name=client_info['CLIENT_NAME'])
                        for client_ext, client_info in current_app.config['CLIENTS'].items()]
                       + ['Summary'])
        frame.insert(0, "Client", index)
        total = len(index)
    else:
        frame, total = empty_frame()
    return frame, total
def test_loc_modify_datetime(self):
    # see gh-28837
    df = DataFrame.from_dict({
        "date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]
    })
    df["date_dt"] = pd.to_datetime(df["date"], unit="ms", cache=True)
    df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"]
    df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"]
    expected = DataFrame(
        [
            [1485264372711, "2017-01-24 13:26:12.711", "2017-01-24 13:26:12.711"],
            [1485265925110, "2017-01-24 13:52:05.110", "2017-01-24 13:52:05.110"],
            [1540215845888, "2018-10-22 13:44:05.888", "2018-10-22 13:44:05.888"],
            [1540282121025, "2018-10-23 08:08:41.025", "2018-10-23 08:08:41.025"],
        ],
        columns=["date", "date_dt", "date_dt_cp"],
    )
    columns = ["date_dt", "date_dt_cp"]
    expected[columns] = expected[columns].apply(pd.to_datetime)
    tm.assert_frame_equal(df, expected)
def get_index_daily(self, code: str, start_date: str = '', end_date: str = ''):
    """
    Fetch daily bar data for an index.

    :param code: [description]
    :type code: str
    :param start_date: [description], defaults to ''
    :type start_date: str, optional
    :param end_date: [description], defaults to ''
    :type end_date: str, optional
    :raises QuantzException: [description]
    :return: [description]
    :rtype: [type]
    """
    if code is None or '' == code:
        raise QuantzException('Failed to get_index_daily(Index code is empty)')
    if end_date == '':
        end_date = utils.now_2_YYYYmmdd()
    if (start_date != '' and int(start_date) > int(end_date)):
        raise QuantzException('Failed to get_index_daily(start_date must not be later than end_date)')
    if not self._is_data_available(code, start_date, end_date):
        _logd('Not all data available')
        if not self._obtain_delta_data(code, start_date, end_date):
            _logw('Failed to obtain data for %s' % (code))
            raise QuantzException('Could not get full data for %s from %s to %s' %
                                  (code, start_date, end_date))
    index_objects = IndexDailyItem.objects(
        ts_code=code,
        trade_date__gte=start_date,
        trade_date__lte=end_date).order_by('-trade_date')
    index_df = DataFrame.from_dict(json.loads(index_objects.to_json()))
    if index_df.shape[1] > 1:
        index_df = index_df.drop('_id', axis=1)
    return index_df
def df_open_now(country, end=None, f=None, interval='month'):
    response = search_open_now(country, end=end, f=f)
    models = [x.key for x in response.aggs.models.buckets]
    obj = {x: {0} for x in models}
    for model in response.aggs.models.buckets:
        obj[model.key] = {model.doc_count}
    df = DataFrame.from_dict(obj, orient='index', dtype='int64', columns=['now'])
    df.loc['total'] = df.sum()
    if df.empty:
        return df
    df.index.name = 'date'
    df = df.sort_index()
    return df
def df(country, f=None, **kwargs):
    today = dt.datetime.today().strftime('%Y-%m-%d')
    date = kwargs['date'] if 'date' in kwargs else today
    response = search(country, f=f, date=date)
    obj = {}
    for model in response.aggs.model.buckets:
        obj[model.key] = {}
        for ur in model.ur30.buckets:
            obj[model.key][round(ur.key, 2)] = ur.doc_count
    df = DataFrame.from_dict(obj).T
    if df.empty:
        return df
    df.index.name = 'modelo'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)
    df.loc['total'] = df.sum()
    df = df.reset_index()
    return df
def train_new():
    global ensemble
    content = request.json
    label = content['labels']
    event = content['events']
    label_df = DataFrame([[1, label]], columns=['Index', 'Label'])
    # label_df = DataFrame.from_dict(label)
    event_df = DataFrame.from_dict(event)
    # print(event_df)
    # print(label_df)
    with open('events_new.csv', 'a') as f:
        event_df.to_csv(f, header=False)  # True if first event
    with open('labels_new.csv', 'a') as f:
        label_df.to_csv(f, header=False)  # True if first event
    # new_set = test_set.append(event, ignore_index=True)
    # preds = ensemble.predict(new_set)
    # preds = DataFrame(preds)
    # preds = preds.replace(1.0, 'Normal')
    # preds = preds.replace(-1.0, 'Malicious')
    # js = preds.to_json()
    # print(preds)
    return "OK"
def list_privileges(self, to_dataframe: bool = False) -> Union[dict, DataFrame]:
    """List ALL privileges for Security Role. Optionally return a `DataFrame` object.

    Args:
        to_dataframe: If True, return a `DataFrame` object containing privileges
    """
    self.fetch()
    priv_dict = {int(v[1]): k[1] for k, v in [x.items() for x in self.privileges]}
    if to_dataframe:
        df = DataFrame.from_dict(priv_dict, orient='index', columns=['Name'])
        df.index.name = 'ID'
        return df
    else:
        return priv_dict
def df(country, start=None, end=None, f=None, interval='month'):
    if country not in COUNTRY_LIST:
        raise Exception(f'{country} is not a valid country')
    response = search(country, start=start, end=end, f=f, interval=interval)
    obj = {}
    for interval in response.aggs.dates.buckets:
        obj[interval.key_as_string] = {}
        for status in interval.status.buckets:
            obj[interval.key_as_string][status.key] = status.doc_count
    df = DataFrame.from_dict(obj, orient='index')
    if df.empty:
        return df
    df.index = df.index.astype('datetime64')
    df.index.name = 'date'
    df = df.fillna(0).astype('int64')
    df['total'] = df.sum(axis=1)
    return df
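# A toy illustration (assumed values, not from the source) of the nested-dict
# layout consumed above: with orient='index' the outer keys become the index
# rows, inner keys become columns, and absent statuses show up as NaN.
from pandas import DataFrame

obj = {'2020-01-01': {'open': 3, 'closed': 1}, '2020-02-01': {'open': 5}}
toy = DataFrame.from_dict(obj, orient='index').fillna(0).astype('int64')
toy.index = toy.index.astype('datetime64[ns]')
toy['total'] = toy.sum(axis=1)  # row-wise total, as in the function above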
def predict(data, model=load_model()):
    """
    Returns prediction given the model and data to predict

    Parameters
    ----------
    model: Model instance returned by load_model API
    data: Data format as expected by the predict API of the core estimator.
        For example, in the case of scikit-learn models it could be a numpy
        array, a list of lists, or a pandas DataFrame.

    Returns
    -------
    predictions: Output from scoring server
        Format: {'prediction': output from model.predict method}
    """
    # This is the default implementation of the predict() function specific to this score.py template only.
    if model == "default_model" or len(data) == 0:
        return {'prediction': 'Hello world!'}
    from pandas import read_json, DataFrame
    from io import StringIO
    data = read_json(StringIO(data)) if isinstance(data, str) else DataFrame.from_dict(data)
    pred = model.predict(data).tolist()
    return {'prediction': pred}
def parse_many_spectra(spectra_dir: str, prefix: str = '',
                       wave_info=(501, 3996, 1715)) -> DataFrame:
    all_spectra = {}
    unparsed_spectra = 0
    spectra_names = listdir(spectra_dir)
    spectra_dir = Path(spectra_dir)
    for spectrum_name in tqdm(spectra_names):
        try:
            spectrum_path = spectra_dir / spectrum_name
            spectrum = opus_reader(spectrum_path)
            absorbance = spectrum.interpolate(*wave_info)[1]
            all_spectra[change_ssn(spectrum_name, prefix)] = absorbance
        except Exception:
            unparsed_spectra += 1
    print(f'Parsing finished. '
          f'{unparsed_spectra} spectra not parsed')
    columns = np.linspace(*wave_info).astype(str)
    spectra_df = DataFrame.from_dict(all_spectra, orient='index')
    spectra_df.columns = columns
    return spectra_df
def make_criteria_csv():
    """
    Make criteria tables: _build/csv/{all,axes,coords}_criteria.csv
    """
    csv_dir = "_build/csv"
    os.makedirs(csv_dir, exist_ok=True)

    # Criteria tables
    df = DataFrame.from_dict(coordinate_criteria)
    df = df.dropna(axis=1, how="all")
    df = df.applymap(lambda x: ", ".join(sorted(x)) if isinstance(x, tuple) else x)
    df = df.sort_index(axis=0).sort_index(axis=1)

    # All criteria
    df.to_csv(os.path.join(csv_dir, "all_criteria.csv"))

    # Axes and coordinates
    for keys, name in zip([_AXIS_NAMES, _COORD_NAMES], ["axes", "coords"]):
        subdf = df[sorted(keys)].dropna(axis=1, how="all")
        subdf = subdf.dropna(axis=1, how="all").transpose()
        subdf.to_csv(os.path.join(csv_dir, f"{name}_criteria.csv"))
def index_models_bigg():
    try:
        response = requests.get('http://bigg.ucsd.edu/api/v2/models', timeout=3)
    except requests.ConnectionError as e:
        logger.error("Cannot reach http://bigg.ucsd.edu. Are you sure that you are connected to the internet?")
        raise e
    if response.ok:
        try:
            json = response.json()
        except Exception as e:
            logger.error('No json could be decoded from server response coming from http://bigg.ucsd.edu.')
            raise e
        else:
            return DataFrame.from_dict(json['results'])
    else:
        raise Exception("Could not index available models. bigg.ucsd.edu returned status code {}"
                        .format(response.status_code))
def describe(self) -> DataFrame:
    """Describe a TimeSeriesDataset with the describe function from Pandas

    TODO Define what describe should do on TimeSeriesDataset (see issue 56)

    Returns:
        TODO Define return type
    """
    min = self.min()
    max = self.max()
    mean = self.mean()
    median = self.median()
    kurtosis = self.kurtosis()
    skewness = self.skewness()
    return DataFrame.from_dict({
        'minimum': min,
        'maximum': max,
        'mean': mean,
        'median': median,
        'kurtosis': kurtosis,
        'skewness': skewness
    })
def get_pathway_to_definition_map(species):
    """Map kegg paths to their definition."""
    kegg_list = REST.kegg_list("pathway", species)
    clean_kegg_path = re.compile(r"path:{}|\n".format(species))
    rowdicts = []
    for kegg_path_line in kegg_list:
        try:
            kegg_path_line = kegg_path_line.decode("utf-8")
        except AttributeError:
            pass
        kegg_info = re.sub(clean_kegg_path, "", kegg_path_line)
        pathway, definition = kegg_info.split("\t")
        definition = definition.split(" - ")[0]  # Remove species info
        rowdict = {"kegg_pathway": pathway, "kegg_pathway_definition": definition}
        rowdicts.append(rowdict)
    return DataFrame.from_dict(rowdicts)
def user_top_recommended_stories(context, recommender_model: TruncatedSVD,
                                 user_story_matrix: IndexedCooMatrix) -> DataFrame:
    """The top stories for each commenter (user)."""
    # Compute XV, which has a row for each user and a column for each component
    XV = recommender_model.transform(user_story_matrix.matrix)

    # Now we want to project XV back into story-space. As a dense matrix, the
    # product would be way too big - |# users * # stories|, so we sparsify both
    # the multiplicands to make it more manageable.
    XV[np.abs(XV) < 1] = 0
    sparse_XV = csr_matrix(XV)
    context.log.info(f"sparse_XV shape: {sparse_XV.shape}")
    context.log.info(f"sparse_XV non-zero: {sparse_XV.count_nonzero()}")

    recommender_model.components_[np.abs(recommender_model.components_) < 1e-2] = 0
    sparse_components = csc_matrix(recommender_model.components_)
    context.log.info(f"recommender_model.components_ shape: {recommender_model.components_.shape}")
    context.log.info(f"sparse_components non-zero: {sparse_components.count_nonzero()}")

    # A matrix with the same dimensions as user_story_matrix, but reduced in rank
    X_hat = sparse_XV @ sparse_components
    coo = coo_matrix(X_hat)

    story_ids = user_story_matrix.col_index[coo.col].values
    user_ids = user_story_matrix.row_index[coo.row].values
    context.log.info(f"recommendations: {len(story_ids)}")

    return DataFrame.from_dict({
        "user_id": user_ids,
        "story_id": story_ids,
        "relevance": coo.data
    })
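# Minimal sketch (assumed data, not from the pipeline) of the final step above:
# a COO matrix exposes parallel row/col/data arrays, which map directly onto
# DataFrame columns.
import numpy as np
from pandas import DataFrame
from scipy.sparse import coo_matrix

coo = coo_matrix(np.array([[0.0, 2.5], [1.0, 0.0]]))
triples = DataFrame.from_dict({'row': coo.row, 'col': coo.col, 'value': coo.data})
# one DataFrame row per stored non-zero entry of the sparse matrix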