def __add__(self, that):
    # ensure all rows are present for both tables, filling in 0 if necessary
    # (otherwise the empty rows will be treated as if they contain NaN when adding)
    me = self._df
    you = that._df
    for row in me.index:
        if row not in that._df.index:
            you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
    for row in you.index:
        if row not in self._df.index:
            me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
    # add counts
    new_df = me + you
    # recompute ratios
    new_df['P'] = new_df['Numer'] / new_df['PDenom']
    new_df['R'] = new_df['Numer'] / new_df['RDenom']
    denom = (new_df['P'] + new_df['R'])
    new_df['F'] = 2 * new_df['P'] * new_df['R'] / denom[denom>0]
    new_df['Acc'] = new_df['T'] / new_df['N']
    result = PRCounter()
    result._df = new_df
    return result
def get_contributions(self):
    data = []
    columns = ['mp-id', 'contribution', 'efermi', 'ehull', 'bandgap']
    docs = self.query_contributions(
        criteria=self.tam_perovskites_query,
        projection={
            '_id': 1, 'mp_cat_id': 1, 'content.efermi': 1,
            'content.ehull': 1, 'content.bandgap': 1
        })
    if not docs:
        raise Exception('No contributions found for TamPerovskites Explorer!')
    for doc in docs:
        mpfile = MPFile.from_contribution(doc)
        mp_id = mpfile.ids[0]
        contrib = mpfile.hdata[mp_id]
        cid_url = '/'.join([
            self.preamble.rsplit('/', 1)[0],
            'explorer', 'materials', doc['_id']
        ])
        row = [mp_id, cid_url, contrib['efermi'], contrib['ehull'], contrib['bandgap']]
        data.append((mp_id, row))
    return DataFrame.from_items(data, orient='index', columns=columns)
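# Note: the snippets in this section build frames with DataFrame.from_items, which
# was deprecated in pandas 0.23 and removed in pandas 1.0. A minimal hedged sketch
# of the same (key, row) -> row-per-key pattern using DataFrame.from_dict; the
# sample values below are invented purely for illustration.
from collections import OrderedDict
from pandas import DataFrame

columns = ['mp-id', 'contribution', 'efermi', 'ehull', 'bandgap']
data = [('mp-0000', ['mp-0000', 'https://example.org/cid', 1.23, 0.0, 2.1])]
df = DataFrame.from_dict(OrderedDict(data), orient='index')
df.columns = columns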
def TreeMatrix(D, desc, L, Env=None, DshapeLarge=True): """ Applying tree information (desc,L) on a given count matrix (D) and columns grouping (Env) to obtain a matrix of count over the tree """ #I assume that D has correct columns name if not DshapeLarge: Z = (D[["Sample", "Taxon"]]).values Z = MultiIndex.from_tuples(map(tuple, tuple(Z)), names=["Sample", "Taxon"]) D.index = Z Dlarge = D.Count.unstack(level=0) Dlarge.fillna(value=0, inplace=True) #I assume that Environment has correct index and columns names ExperimentalDesignColumns = MultiIndex.from_tuples( map(tuple, tuple(Env.ix[Dlarge.columns].values)), names=["Sample", "Group"]) else: # if D is already Large Environment information is already included Dlarge = D ExperimentalDesignColumns = Dlarge.columns #if taxon only present in tree but not in table, access mode .ix correctly report NA for that line, that later will be converted to zero. NodeTableLarge = [[x[0], Dlarge.ix[x[-1]].sum()] for x in desc] Dtree = DataFrame.from_items(NodeTableLarge).transpose() NodeAndLeafNamesIndex = MultiIndex.from_tuples(map( tuple, tuple(L.loc[:, ["Name", "Is_Leaf"]].ix[Dtree.index].values)), names=["Name", "Is_Leaf"]) Dtree.index = NodeAndLeafNamesIndex Dtree.columns = ExperimentalDesignColumns Dtree.columns = Dtree.columns.reorder_levels(["Group", "Sample"]) return Dtree
def _update_margins(self):
    for variable in self.margins_by_variable:
        survey_scenario = self.survey_scenario
        simulation = survey_scenario.simulation
        column_by_name = survey_scenario.tax_benefit_system.column_by_name
        assert variable in column_by_name
        column = survey_scenario.tax_benefit_system.column_by_name[variable]
        weight = self.weight
        filter_by = self.filter_by
        initial_weight = self.initial_weight
        value = simulation.calculate_add(variable)
        margin_items = [
            ('actual', weight[filter_by]),
            ('initial', initial_weight[filter_by]),
            ]
        if column.__class__ in [AgeCol, BoolCol, EnumCol]:
            margin_items.append(('category', value[filter_by]))
            # TODO: should not use DataFrame for that ...
            margins_data_frame = DataFrame.from_items(margin_items)
            margins_data_frame = margins_data_frame.groupby('category', sort = True).sum()
            margin_by_type = margins_data_frame.to_dict()
        else:
            margin_by_type = dict(
                actual = (weight[filter_by] * value[filter_by]).sum(),
                initial = (initial_weight[filter_by] * value[filter_by]).sum(),
                )
        self.margins_by_variable[variable].update(margin_by_type)
def StormSums(Stormslist, Data, offset=0):
    eventlist = []
    index = []
    for storm_index, storm in Stormslist.iterrows():
        #print storm
        start = storm['start'] - timedelta(minutes=offset)  ## if Storms are defined by stream response you have to grab the preceding precip data
        end = storm['end']
        data = True  ## Innocent until proven guilty
        try:
            event = Data.ix[start:end]  ### slice list of Data for event
        except KeyError:
            start = start + timedelta(minutes=15)  ## if the start time falls between 2 30-minute periods
            try:
                event = Data.ix[start:end]
            except KeyError:
                end = end + timedelta(minutes=15)
                try:
                    event = Data.ix[start:end]
                except KeyError:
                    print 'no precip data available for storm'
                    data = False
                    pass
        if data != False:
            eventcount = event.count()
            eventsum = event.sum()
            eventmax = event.max()
            eventlist.append((storm['start'], [storm['start'] - timedelta(minutes=offset), storm['end'], eventcount, eventsum, eventmax]))
    Events = DataFrame.from_items(eventlist, orient='index', columns=['start', 'end', 'count', 'sum', 'max'])
    return Events
def TreeMatrix(D,desc,L, Env=None,DshapeLarge=True): """ Applying tree information (desc,L) on a given count matrix (D) and columns grouping (Env) to obtain a matrix of count over the tree """ #I assume that D has correct columns name if not DshapeLarge: Z=(D[["Sample","Taxon"]]).values Z=MultiIndex.from_tuples(map(tuple,tuple(Z)), names=["Sample","Taxon"]) D.index=Z Dlarge=D.Count.unstack(level=0) Dlarge.fillna(value=0,inplace=True) #I assume that Environment has correct index and columns names ExperimentalDesignColumns=MultiIndex.from_tuples( map(tuple,tuple(Env.ix[Dlarge.columns].values)) , names=["Sample","Group"]) else: # if D is already Large Environment information is already included Dlarge=D ExperimentalDesignColumns=Dlarge.columns #if taxon only present in tree but not in table, access mode .ix correctly report NA for that line, that later will be converted to zero. NodeTableLarge=[[x[0],Dlarge.ix[x[-1]].sum()] for x in desc] Dtree=DataFrame.from_items(NodeTableLarge).transpose() NodeAndLeafNamesIndex=MultiIndex.from_tuples( map(tuple,tuple(L.loc[:,["Name","Is_Leaf"]].ix[Dtree.index].values)) , names=["Name","Is_Leaf"]) Dtree.index=NodeAndLeafNamesIndex Dtree.columns=ExperimentalDesignColumns Dtree.columns=Dtree.columns.reorder_levels(["Group", "Sample"]) return Dtree
def update_margins(self):
    for variable in self.margins_by_name:
        survey_scenario = self.survey_scenario
        simulation = survey_scenario.simulation
        column_by_name = survey_scenario.tax_benefit_system.column_by_name
        assert variable in column_by_name
        column = survey_scenario.tax_benefit_system.column_by_name[variable]
        weight = self.weight
        filter_by = self.filter_by
        initial_weight = self.initial_weight
        value = simulation.calculate(variable)
        margin_items = [
            ('actual', weight[filter_by]),
            ('initial', initial_weight[filter_by]),
            ]
        if column.__class__ in [AgeCol, BoolCol, EnumCol]:
            margin_items.append(('category', value[filter_by]))
            margins_data_frame = DataFrame.from_items(margin_items)
            margins_data_frame = margins_data_frame.groupby('category', sort=True).sum()
            margin_by_type = margins_data_frame.to_dict()
        else:
            margin_by_type = dict(
                actual=(weight[filter_by] * value[filter_by]).sum(),
                initial=(initial_weight[filter_by] * value[filter_by]).sum(),
                )
        self.margins_by_name[variable].update(margin_by_type)
        if self.total_population is not None:
            target = self.margins_by_name[variable].get('target', False)
def test_scientific_no_exponent(self):
    # see gh-12215
    df = DataFrame.from_items([("w", ["2e"]), ("x", ["3E"]),
                               ("y", ["42e"]), ("z", ["632E"])])
    data = df.to_csv(index=False)
    for prec in self.float_precision_choices:
        df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
        tm.assert_frame_equal(df_roundtrip, df)
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
    # GH18914
    df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
    df = get_dummies(df, columns=['Nation'], sparse=sparse)
    df2 = df.reindex(columns=['GDP'])
    tm.assert_frame_equal(df[['GDP']], df2)
def get_contributions(self, phase=None): data = [] phase_query_key = {'$exists': 1} if phase is None else phase columns = ['mp-id', 'contribution', 'formula'] if phase is None: columns.append('phase') columns += ['dH (formation)', 'dH (hydration)', 'GS?', 'CIF'] for doc in self.query_contributions(criteria={ 'project': { '$in': ['LBNL', 'MIT'] }, 'content.info.Phase': phase_query_key }, projection={ '_id': 1, 'mp_cat_id': 1, 'content': 1 }): mpfile = MPFile.from_contribution(doc) mp_id = mpfile.ids[0] info = mpfile.hdata[mp_id]['info'] row = [mp_id, get_short_object_id(doc['_id']), info['Formula']] if phase is None: row.append(info['Phase']) row += [info['dHf'], info['dHh'], info['GS'], 'TODO'] # TODO URLs for mp_id and cid data.append((mp_id, row)) return DataFrame.from_items(data, orient='index', columns=columns)
def __setitem__(self, k, v):
    if isinstance(v[0], int):
        N, gold_set, pred_set = v
        if gold_set or pred_set:
            assert N>0,(N,gold_set,pred_set)
    else:
        gold_set, pred_set = v
        N = ''
    entry = {
        'Numer': len(gold_set & pred_set),
        'PDenom': len(pred_set),
        'RDenom': len(gold_set),
        'N': N
    }
    entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan')
    entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan')
    entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan')
    if N=='':
        entry['T'] = None
        entry['Acc'] = None
    else:
        if len(gold_set)==len(pred_set)==N:
            entry['T'] = entry['Numer']
        else:
            tp = entry['Numer']
            fp = len(pred_set-gold_set)
            fn = len(gold_set-pred_set)
            entry['T'] = N-fp-fn
            assert entry['T']>=0,(entry,gold_set,pred_set)
        entry['Acc'] = float('nan') if N==0 else entry['T'] / N
    df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS])
    self._df = self._df.append(df)
def into(a, b, columns=None, schema=None, **kwargs):
    if not columns and schema:
        columns = dshape(schema)[0].names
    return DataFrame.from_items(((column, b[column][:]) for column in sorted(b.names)),
                                orient='columns', columns=columns)
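# into() above only assumes that `b` exposes a `.names` sequence and per-column
# slicing (`b[column][:]`), as a bcolz-style column store would. A minimal hedged
# sketch with a hand-rolled stand-in object; the ColumnStore class and its data
# are invented here purely for illustration, not part of the original code.
import numpy as np

class ColumnStore(object):
    def __init__(self, **cols):
        self.names = list(cols)
        self._cols = {name: np.asarray(values) for name, values in cols.items()}

    def __getitem__(self, name):
        return self._cols[name]

b = ColumnStore(x=[1, 2, 3], y=[4.0, 5.0, 6.0])
# df = into(DataFrame, b)   # -> DataFrame with columns 'x' and 'y', sorted by name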
def Report(FullCounts, t, otu, db): print "Unequal Effort" Halpha, Hgamma, Hbeta, HE, tots = UltraTreeTest(FullCounts, t, otu) H = db.GetEntropiesPandas(q="1", Pairwise=1, EqualEffort=0) subFull = FullCounts.iloc[:, [0, 1]] subFull.columns.set_labels([0, 1], level=0, inplace=True) Halpha_k, Hgamma_k, Hbeta_k, HE_k, tots_k = UltraTreeTest(subFull, t, otu) result = DataFrame.from_items( [["Hgamma", [Hgamma, H["Hgamma"]]], ["Halpha", [Halpha, H["HalphaByEnvironment"]]], ["Hbeta", [Hbeta, H["MI_treeAndEnvironment"]]], ["DistTurnover", [Hbeta / HE, H["DistTurnover"].iloc[1, 0]]], [ "DistTurnoverbySample", [Hbeta_k / HE_k, H["DistTurnoverBySample"].iloc[1, 0]] ]], columns=["Test", "RegularRoutine"], orient="index") result["Dif"] = result.Test - result.RegularRoutine print(result) print "Equal Effort" H = db.GetEntropiesPandas(q="1", Pairwise=1, EqualEffort=1) #Halpha,Hgamma,Hbeta,HE=UltraTreeTest(countsA,countsB,[0.5,0.5]) Halpha, Hgamma, Hbeta, HE, tots = UltraTreeTest(FullCounts, t, otu, Equal=True) Halpha_k, Hgamma_k, Hbeta_k, HE_k, tots_k = UltraTreeTest(subFull, t, otu, Equal=True) result = DataFrame.from_items( [["Hgamma", [Hgamma, H["Hgamma"]]], ["Halpha", [Halpha, H["HalphaByEnvironment"]]], ["Hbeta", [Hbeta, H["MI_treeAndEnvironment"]]], ["DistTurnover", [Hbeta / HE, H["DistTurnover"].iloc[1, 0]]], [ "DistTurnoverbySample", [Hbeta_k / HE_k, H["DistTurnoverBySample"].iloc[1, 0]] ]], columns=["Test", "RegularRoutine"], orient="index") result["Dif"] = result.Test - result.RegularRoutine print(result) return None
def test_scientific_no_exponent(self):
    # see gh-12215
    df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
                               ('y', ['42e']), ('z', ['632E'])])
    data = df.to_csv(index=False)
    for prec in self.float_precision_choices:
        df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
        tm.assert_frame_equal(df_roundtrip, df)
def setUp(self):
    self.season_period = 2
    self.values = range(1, self.season_period * 2 + 1)
    self.dataframe = DataFrame.from_items([('values', self.values)])
    self.model = HoltWinters(self.dataframe, season_period=self.season_period)
    self.model._init_starting_arrays()
    self.model.coefs = [0.5, 0.5, 0.5]
def simulate(adj, theta, num_samples):
    data = DataFrame.from_items(
        [(node, Series(np.zeros(num_samples, int))) for node in adj.columns]
    )
    for node in adj.columns:
        P = parents(node, adj)
        for n in range(num_samples):
            key = ','.join(
                [str(data.ix[n, parent]) for parent in P]
            )
            pdt = theta[node][key]
            data.ix[n, node] = draw(pdt)
    return data
def test_scientific_no_exponent(self):
    # see gh-12215
    df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
                               ('y', ['42e']), ('z', ['632E'])])
    data = df.to_csv(index=False)
    for prec in self.float_precision_choices:
        df_roundtrip = self.read_csv(
            StringIO(data), float_precision=prec)
        tm.assert_frame_equal(df_roundtrip, df)
def fetch_genes(taxon_id):
    c.execute("""
        SELECT id, symbol, name
        FROM gene
        WHERE taxon_id=%s
        ORDER BY id""", (taxon_id,))
    return DataFrame.from_items([(row[0], row) for row in c],
                                columns=["id", "symbol", "name"],
                                orient="index")
def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. _skip_if_no_xlrd() import xlrd if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): # Xlrd >= 0.9.3 can handle Excel milliseconds. expected = DataFrame.from_items([("Time", [ time(1, 2, 3), time(2, 45, 56, 100000), time(4, 29, 49, 200000), time(6, 13, 42, 300000), time(7, 57, 35, 400000), time(9, 41, 28, 500000), time(11, 25, 21, 600000), time(13, 9, 14, 700000), time(14, 53, 7, 800000), time(16, 37, 0, 900000), time(18, 20, 54) ])]) else: # Xlrd < 0.9.3 rounds Excel milliseconds. expected = DataFrame.from_items([("Time", [ time(1, 2, 3), time(2, 45, 56), time(4, 29, 49), time(6, 13, 42), time(7, 57, 35), time(9, 41, 29), time(11, 25, 22), time(13, 9, 15), time(14, 53, 8), time(16, 37, 1), time(18, 20, 54) ])]) epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') actual = read_excel(epoch_1900, 'Sheet1') tm.assert_frame_equal(actual, expected) actual = read_excel(epoch_1904, 'Sheet1') tm.assert_frame_equal(actual, expected)
def save_data():
    ts = time.time()
    base_dir = filedialog.askdirectory()
    filename_time = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H%M')
    filename_base = os.path.join(base_dir, filename_time)
    filename = '%s.csv' % filename_base
    print("Saving Data...")
    df1 = DataFrame.from_items([('Pitch', pdx), ('Roll', pdy), ('Yaw', pdz)])
    df1.stack(level=0, dropna=False)  # Takes rows and converts to columns
    df1.to_csv(filename)  # outputs to csv file
def setUp(self):
    self.season_period = 2
    self.values = [np.array([[i + 1], [i]])
                   for i in range(1, self.season_period * 2 + 1)]
    self.dataframe = DataFrame.from_items([('values', self.values)])
    self.hwi = HoltWintersI(self.dataframe, season_period=self.season_period)
    self.hwi._init_starting_arrays()
    self.coefs = [0.5] * 12
    self.A, self.B, self.G = flats_to_matrix(self.coefs)
def setUp(self):
    self.periods = 2
    self.values = [
        np.array([[i + 1], [i]]) for i in range(1, self.periods + 1)
    ]
    self.dataframe = DataFrame.from_items([('values', self.values)])
    self.model = HoltI(self.dataframe)
    self.coefs = [0.5] * 8
    self.A, self.B = flats_to_matrix(self.coefs)
    self.model._init_starting_arrays()
def simulate(adj, theta, num_samples):
    data = DataFrame.from_items([(node, Series(np.zeros(num_samples, int)))
                                 for node in adj.columns])
    for node in adj.columns:
        P = parents(node, adj)
        for n in range(num_samples):
            key = ','.join([str(data.ix[n, parent]) for parent in P])
            pdt = theta[node][key]
            data.ix[n, node] = draw(pdt)
    return data
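# simulate() above relies on two helpers defined elsewhere in the original module:
# parents(node, adj) and draw(pdt). A minimal hedged sketch of how they could look
# and how simulate() might be driven on a tiny two-node network; the helper bodies,
# node names and probabilities here are illustrative assumptions, not the original
# implementations.
import numpy as np
from pandas import DataFrame

def parents(node, adj):
    # columns whose entry is 1 in this node's row of the adjacency matrix
    return [col for col in adj.columns if adj.ix[node, col] == 1]

def draw(pdt):
    # sample one value from a {value: probability} table
    values = list(pdt.keys())
    probs = [pdt[v] for v in values]
    return np.random.choice(values, p=probs)

nodes = ['A', 'B']
adj = DataFrame(np.zeros((2, 2), int), index=nodes, columns=nodes)
adj.ix['B', 'A'] = 1  # A is a parent of B
theta = {'A': {'': {0: 0.7, 1: 0.3}},
         'B': {'0': {0: 0.9, 1: 0.1}, '1': {0: 0.2, 1: 0.8}}}
# samples = simulate(adj, theta, num_samples=100)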
def calibrate(self): """ Calibrate according to margins found in frame """ df = self.frame inputs = self.simulation.survey output_table = self.simulation.output_table margins = {} if df is not None: df = df.reset_index(drop=True) df = df.set_index(['var','mod'], inplace = True) for var, mod in df.index: # Dealing with non categorical vars ... if df.get_value((var,mod), u"modalités") == 'total': margins[var] = df.get_value((var,mod), 'cible') # ... and categorical vars else: if not margins.has_key(var): margins[var] = {} margins[var][mod] = df.get_value((var,mod), 'cible') param = self.get_param() if self.totalpop is not None: margins['totalpop'] = self.totalpop adjusted_margins = self.update_weights(margins, param=param) if 'totalpop' in margins.keys(): del margins['totalpop'] w = self.weights for var in margins.keys(): if var in inputs.column_by_name: value = inputs.get_value(var, self.entity) else: entity = self.entity enum = output_table._inputs.column_by_name.get('qui'+self.entity).enum people = [x[1] for x in enum] value = output_table.get_value(var, entity=entity, opt=people, sum_=True) if isinstance(margins[var], dict): items = [('marge', w ),('mod', value)] updated_margins = DataFrame.from_items(items).groupby('mod', sort= True).sum() for mod in margins[var].keys(): df.set_value((var,mod), u"cible ajustée", adjusted_margins[var][mod]) df.set_value((var,mod), u"marge", updated_margins['marge'][mod]) else: updated_margin = (w*value).sum() df.set_value((var,0), u"cible ajustée", adjusted_margins[var]) df.set_value((var,0), u"marge", updated_margin) if self.frame is not None: self.frame = df.reset_index()
def test_reader_special_dtypes(self): _skip_if_no_xlrd() expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, 4, 0]), ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), ("BoolCol", [True, False, True, True, False]), ("StrCol", [1, 2, 3, 4, 5]), # GH5394 - this is why convert_float isn't vectorized ("Str2Col", ["a", 3, "c", "d", "e"]), ("DateCol", [ datetime(2013, 10, 30), datetime(2013, 10, 31), datetime(1905, 1, 1), datetime(2013, 12, 14), datetime(2015, 3, 14) ]) ]) xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') xls_path = os.path.join(self.dirpath, 'test_types.xls') # should read in correctly and infer types for path in (xls_path, xlsx_path): actual = read_excel(path, 'Sheet1') tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[1, "Str2Col"] = 3.0 for path in (xls_path, xlsx_path): actual = read_excel(path, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) tm.assert_frame_equal(actual2, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, convert_float=False) tm.assert_frame_equal(actual, no_convert_float)
def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. _skip_if_no_xlrd() import xlrd if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): # Xlrd >= 0.9.3 can handle Excel milliseconds. expected = DataFrame.from_items([("Time", [time(1, 2, 3), time(2, 45, 56, 100000), time(4, 29, 49, 200000), time(6, 13, 42, 300000), time(7, 57, 35, 400000), time(9, 41, 28, 500000), time(11, 25, 21, 600000), time(13, 9, 14, 700000), time(14, 53, 7, 800000), time(16, 37, 0, 900000), time(18, 20, 54)])]) else: # Xlrd < 0.9.3 rounds Excel milliseconds. expected = DataFrame.from_items([("Time", [time(1, 2, 3), time(2, 45, 56), time(4, 29, 49), time(6, 13, 42), time(7, 57, 35), time(9, 41, 29), time(11, 25, 22), time(13, 9, 15), time(14, 53, 8), time(16, 37, 1), time(18, 20, 54)])]) epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') actual = read_excel(epoch_1900, 'Sheet1') tm.assert_frame_equal(actual, expected) actual = read_excel(epoch_1904, 'Sheet1') tm.assert_frame_equal(actual, expected)
def save_data():
    ts = time.time()
    base_dir = filedialog.askdirectory()
    filename_time = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y-%m-%d-%H%M')
    filename_base = os.path.join(base_dir, filename_time)
    filename = '%s.csv' % filename_base
    print("Saving Data...")
    df1 = DataFrame.from_items([('Pitch', pdx), ('Roll', pdy), ('Yaw', pdz)])
    df1.stack(level=0, dropna=False)  # Takes rows and converts to columns
    df1.to_csv(filename)  # outputs to csv file
def main(): conf = SparkConf() # conf.set("spark.executor.memory","3g") # conf.set("spark.speculation", "True") sc = SparkContext(conf=conf) v = sc.textFile('hdfs:///data/ad_f.csv') \ .map(lambda line: line.split(",")) \ .filter(lambda line: line[5] != 'NULL') \ .map(lambda line: (int(line[0]), line[1], [line[3], line[4], line[5]])) # .collect() vl = v.map(lambda i: DenseVector(i[2])) index_vl = v.map(lambda i: (i[0], DenseVector(i[2]))) index_vl.cache(pyspark.storagelevel.MEMORY_AND_DISK) max_iterations = 33 k = 10 clusters = [ ] # contain several dense vectors that is choosen to be centroid met = np.array([[1.39120240528e-06, -7.11964361751e-08, 1.68554275438e-07], [-7.1196436173e-08, 4.18367212413e-06, -2.45888145316e-07], [1.6855427544e-07, -2.45888145311e-07, 1.43614586304e-06]]) row_num = vl.count() b_met = sc.broadcast(met) # broadcast met,access value by b_met.value b_row = sc.broadcast(row_num) # broadcast row numbers populate_initial_centers() # compute_distances() d = compute_distances() a = get_clusters(d) counter = 0 while True: counter += 1 previous_clusters = clusters print("counter: ", counter) print("Previous ", previous_clusters) clusters = compute_new_centers(a) print(clusters) d = compute_distances() a = get_clusters(d) if counter >= max_iterations: break sse = d.map(lambda x: x[1]**2).sum() x = a.map(lambda x: (x[0][0], float(x[0][1]))) p = v.join(x) q = p.map(lambda x: x[1]) sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(q, ['ad_id', 'cluster']) df.repartition(1).write.csv(path="/data/op_clu", header=True) # c = sc.parallelize(clusters).map(lambda x: str(x)[1:-1]) clusters.append(np.array([sse, 0, 0])) t = zip(range(k + 1), clusters) cen = DataFrame.from_items(t, range(k), 'index') df_cen = sqlContext.createDataFrame(cen, ['campaign_id', 'customer', 'brand']) df_cen.repartition(1).write.csv(path="/data/op_cen", header=True)
def __add__(self, that):
    # ensure all rows are present for both tables, filling in 0 if necessary
    # (otherwise the empty rows will be treated as if they contain NaN when adding)
    me = self._df
    you = that._df
    for row in me.index:
        if row not in that._df.index:
            you = you.append(DataFrame.from_items([(e, {row: '' if me[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
    for row in you.index:
        if row not in self._df.index:
            me = me.append(DataFrame.from_items([(e, {row: '' if you[e][row]=='' else 0}) for e in PRCounter.COLUMNS]))
    # add counts
    new_df = me + you
    result = PRCounter()
    result._df = new_df
    if self.COMPUTE_RATIOS_ON_ADD:
        # recompute ratios on the summed counts before returning
        result.compute_ratios()
    return result
def calibrate_old(self): """ Calibrate according to margins found in frame """ df = self.frame margins = {} if df is not None: df.reset_index(drop=True, inplace=True) df.set_index(['var', 'mod'], inplace=True) for var, mod in df.index: # Dealing with non categorical vars ... if df.get_value((var, mod), u"modalités") == 'total': margins[var] = df.get_value((var, mod), 'cible') # ... and categorical vars else: if var not in margins: margins[var] = {} margins[var][mod] = df.get_value((var, mod), 'cible') parameters = self.get_parameters() if self.total_population is not None: margins['total_population'] = self.total_population adjusted_margins = self.update_weights(margins, parameters=parameters) if 'total_population' in margins.keys(): del margins['total_population'] w = self.weight for var in margins.keys(): if var in self.survey_scenario.tax_benefit_system.column_by_name: value = self.survey_scenario.simulation.calculate( var) # TODO sum over menage if isinstance(margins[var], dict): items = [('marge', w), ('mod', value)] updated_margins = DataFrame.from_items(items).groupby( 'mod', sort=True).sum() for mod in margins[var].keys(): df.set_value((var, mod), u"cible ajustée", adjusted_margins[var][mod]) df.set_value((var, mod), u"marge", updated_margins['marge'][mod]) else: updated_margin = (w * value).sum() df.set_value((var, 0), u"cible ajustée", adjusted_margins[var]) df.set_value((var, 0), u"marge", updated_margin) if self.frame is not None: self.frame = df.reset_index()
def compute_aggregates(self, filter_by=None): """ Compute aggregate amounts """ column_by_name = self.simulation.tax_benefit_system.column_by_name V = [] M = {'data': [], 'default': []} B = {'data': [], 'default': []} U = [] M_label = { 'data': self.labels['dep'], 'default': self.labels['dep_default'] } B_label = { 'data': self.labels['benef'], 'default': self.labels['benef_default'] } for var in self.varlist: # amounts and beneficiaries from current data and default data if exists montant_benef = self.get_aggregate(var, filter_by) V.append(column_by_name[var].label) entity = column_by_name[var].entity_key_plural U.append(entity) for dataname in montant_benef: M[dataname].append(montant_benef[dataname][0]) B[dataname].append(montant_benef[dataname][1]) # build items list items = [(self.labels['var'], V)] for dataname in M: if M[dataname]: items.append((M_label[dataname], M[dataname])) items.append((B_label[dataname], B[dataname])) items.append((self.labels['entity'], U)) aggr_frame = DataFrame.from_items(items) self.aggr_frame = None for code, label in self.labels.iteritems(): try: col = aggr_frame[label] if self.aggr_frame is None: self.aggr_frame = DataFrame(col) else: self.aggr_frame = self.aggr_frame.join(col, how="outer") except: pass
def test_agg_period_index():
    prng = period_range('2012-1-1', freq='M', periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [('s1', s1), ('s2', s2)]
    df = DataFrame.from_items(series)
    grouped = df.groupby(df.index.month)
    list(grouped)
def test_reader_special_dtypes(self): _skip_if_no_xlrd() expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, 4, 0]), ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), ("BoolCol", [True, False, True, True, False]), ("StrCol", [1, 2, 3, 4, 5]), # GH5394 - this is why convert_float isn't vectorized ("Str2Col", ["a", 3, "c", "d", "e"]), ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), datetime(1905, 1, 1), datetime(2013, 12, 14), datetime(2015, 3, 14)]) ]) xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') xls_path = os.path.join(self.dirpath, 'test_types.xls') # should read in correctly and infer types for path in (xls_path, xlsx_path): actual = read_excel(path, 'Sheet1') tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[1, "Str2Col"] = 3.0 for path in (xls_path, xlsx_path): actual = read_excel(path, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) tm.assert_frame_equal(actual2, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, convert_float=False) tm.assert_frame_equal(actual, no_convert_float)
def task_cohen(dataOne, dataTwo):
    labelOne = 'column1'
    labelTwo = 'column2'
    # Create input
    df = DataFrame.from_items([(labelOne, dataOne), (labelTwo, dataTwo)])
    meanx = mean(df[labelOne])
    meany = mean(df[labelTwo])
    sdx = std(df[labelOne])
    sdy = std(df[labelTwo])
    s = sqrt((sdx**2 + sdy**2) / 2)
    d = round(fabs(meanx - meany) / s, 4)
    return d
def to_dataframe(iterable, *attrfields):
    ''' If user specifies fields, only those fields, in that order, will be cast into
    a data frame.  Otherwise, fields are taken from the first element in iterable.
    Fields must be a list of strings.'''
    if attrfields:
        columns = attrfields
    else:
        # The autoassign below only works for namedtuples (uses the _fields attribute)
        # and takes the fields from the iterable[0] entry.
        try:
            columns = iterable[0]._fields  # FOR NOW ONLY WORKS FOR NAMEDTUPLE attribute
        except AttributeError:
            raise AttributeError('to_dataframe requires a list of attributes')
    fget = attrgetter(*columns)
    items = [(idx, fget(v)) for idx, v in enumerate(iterable)]  # key = index position, value = tuple of attribute values
    return DataFrame.from_items(items, columns, orient='index')  # orient='index': keys label rows, not columns
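# A short hedged usage sketch for to_dataframe() above: it expects either explicit
# attribute names or namedtuple elements. The Point namedtuple and the values below
# are invented for illustration only.
from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])
points = [Point(1, 2), Point(3, 4), Point(5, 6)]
# df = to_dataframe(points)            # columns taken from Point._fields
# df = to_dataframe(points, 'y', 'x')  # or an explicit field order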
def task_cohen(dataOne, dataTwo):
    labelOne = 'column1'
    labelTwo = 'column2'
    # Create input
    df = DataFrame.from_items([(labelOne, dataOne), (labelTwo, dataTwo)])
    meanx = mean(df[labelOne])
    meany = mean(df[labelTwo])
    sdx = std(df[labelOne])
    sdy = std(df[labelTwo])
    s = sqrt((sdx**2 + sdy**2) / 2)
    d = round(fabs(meanx - meany) / s, 4)
    return d
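# Hedged usage sketch for task_cohen() above: it computes Cohen's d with the simple
# pooled standard deviation sqrt((sd_x**2 + sd_y**2) / 2), and because both samples
# become columns of one DataFrame they must have equal length. The numbers below
# are made up for illustration.
group_a = [2.1, 2.5, 2.8, 3.0, 3.3]
group_b = [3.0, 3.4, 3.9, 4.1, 4.4]
# effect_size = task_cohen(group_a, group_b)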
def compute_aggregates(self, filter_by = None): """ Compute aggregate amounts """ column_by_name = self.simulation.tax_benefit_system.column_by_name V = [] M = {'data': [], 'default': []} B = {'data': [], 'default': []} U = [] M_label = {'data': self.labels['dep'], 'default': self.labels['dep_default']} B_label = {'data': self.labels['benef'], 'default': self.labels['benef_default']} for var in self.varlist: # amounts and beneficiaries from current data and default data if exists montant_benef = self.get_aggregate(var, filter_by) V.append(column_by_name[var].label) entity = column_by_name[var].entity_key_plural U.append(entity) for dataname in montant_benef: M[dataname].append(montant_benef[dataname][0]) B[dataname].append(montant_benef[dataname][1]) # build items list items = [(self.labels['var'], V)] for dataname in M: if M[dataname]: items.append((M_label[dataname], M[dataname])) items.append((B_label[dataname], B[dataname])) items.append((self.labels['entity'], U)) aggr_frame = DataFrame.from_items(items) self.aggr_frame = None for code, label in self.labels.iteritems(): try: col = aggr_frame[label] if self.aggr_frame is None: self.aggr_frame = DataFrame(col) else: self.aggr_frame = self.aggr_frame.join(col, how="outer") except: pass
def update_output(self, output_data, description = None): QApplication.setOverrideCursor(QCursor(Qt.WaitCursor)) if output_data is None: return self.set_data(output_data) if description is not None: self.set_distribution_choices(description) if not hasattr(self, 'distribution_by_var'): self.distribution_by_var = 'typmen15' by_var = self.distribution_by_var V = [] M = [] B = [] for var in self.varlist: montant, benef = self.get_aggregate(var) V.append(var) M.append(montant) B.append(benef) items = [(u'Mesure', V), (u"Dépense\n(millions d'€)", M), (u"Bénéficiaires\n(milliers de ménages)", B)] aggr_frame = DataFrame.from_items(items) self.aggregate_view.set_dataframe(aggr_frame) dist_frame = self.group_by(['revdisp', 'nivvie'], by_var) by_var_label = self.var2label[by_var] dist_frame.insert(0,by_var_label,u"") enum = self.var2enum[by_var] dist_frame[by_var_label] = dist_frame[by_var].apply(lambda x: enum._vars[x]) dist_frame.pop(by_var) self.distribution_view.set_dataframe(dist_frame) self.distribution_view.reset() self.calculated() QApplication.restoreOverrideCursor()
def __setitem__(self, k, v): points = {} if isinstance(v[0], int): N, gold_set, pred_set = v if gold_set or pred_set: assert N>0,(N,gold_set,pred_set) else: N = '' gold, pred = v pred_set = set(pred.keys()) if isinstance(pred, dict) else pred gold_set = set(gold.keys()) if isinstance(gold, dict) else gold if isinstance(gold, dict): points.update(gold) if isinstance(pred, dict): for elt in gold_set & pred_set: assert gold[elt]==pred[elt],(elt,gold[elt],pred[elt]) if isinstance(pred, dict): points.update(pred) entry = { 'Numer': sum(points.get(elt,1) for elt in gold_set & pred_set), 'PDenom': sum(points.get(elt,1) for elt in pred_set), 'RDenom': sum(points.get(elt,1) for elt in gold_set), 'N': N } entry['P'] = entry['Numer'] / entry['PDenom'] if entry['PDenom'] else float('nan') entry['R'] = entry['Numer'] / entry['RDenom'] if entry['RDenom'] else float('nan') entry['F'] = 2 * entry['P'] * entry['R'] / (entry['P'] + entry['R']) if (entry['P'] + entry['R']) else float('nan') if N=='': entry['T'] = None entry['Acc'] = None else: if len(gold_set)==len(pred_set)==N: entry['T'] = entry['Numer'] else: tp = entry['Numer'] fp = len(pred_set-gold_set) fn = len(gold_set-pred_set) entry['T'] = N-fp-fn assert entry['T']>=0,(entry,gold_set,pred_set) entry['Acc'] = float('nan') if N==0 else entry['T'] / N df = DataFrame.from_items([(e, {k: entry[e]}) for e in PRCounter.COLUMNS]) self._df = self._df.append(df)
def get_contributions(self, phase=None): data = [] phase_query_key = {'$exists': 1} if phase is None else phase columns = ['mp-id', 'contribution', 'formula'] if phase is None: columns.append('phase') columns += ['dH (formation)', 'dH (hydration)', 'GS?', 'CIF'] docs = self.query_contributions( criteria={ 'content.doi': '10.1021/jacs.6b11301', 'content.data.Phase': phase_query_key }, projection={ '_id': 1, 'mp_cat_id': 1, 'content.data': 1, 'content.{}'.format(mp_level01_titles[3]): 1 } ) if not docs: raise Exception('No contributions found for MnO2 Phase Selection Explorer!') for doc in docs: mpfile = MPFile.from_contribution(doc) mp_id = mpfile.ids[0] contrib = mpfile.hdata[mp_id]['data'] cid_url = '/'.join([ self.preamble.rsplit('/', 1)[0], 'explorer', 'materials', doc['_id'] ]) row = [mp_id, cid_url, contrib['Formula']] if phase is None: row.append(contrib['Phase']) row += [contrib['dHf'], contrib['dHh'], contrib['GS']] cif_url = '' structures = mpfile.sdata.get(mp_id) if structures: cif_url = '/'.join([ self.preamble.rsplit('/', 1)[0], 'explorer', 'materials', doc['_id'], 'cif', structures.keys()[0] ]) row.append(cif_url) data.append((mp_id, row)) return DataFrame.from_items(data, orient='index', columns=columns)
def update_output(self, output_data, description=None):
    QApplication.setOverrideCursor(QCursor(Qt.WaitCursor))
    if output_data is None:
        return
    self.set_data(output_data)
    if description is not None:
        self.set_distribution_choices(description)
    if not hasattr(self, 'distribution_by_var'):
        self.distribution_by_var = 'typmen15'
    by_var = self.distribution_by_var
    V = []
    M = []
    B = []
    for var in self.varlist:
        montant, benef = self.get_aggregate(var)
        V.append(var)
        M.append(montant)
        B.append(benef)
    items = [(u'Mesure', V),
             (u"Dépense\n(millions d'€)", M),
             (u"Bénéficiaires\n(milliers de ménages)", B)]
    aggr_frame = DataFrame.from_items(items)
    self.aggregate_view.set_dataframe(aggr_frame)
    dist_frame = self.group_by(['revdisp', 'nivvie'], by_var)
    by_var_label = self.var2label[by_var]
    dist_frame.insert(0, by_var_label, u"")
    enum = self.var2enum[by_var]
    dist_frame[by_var_label] = dist_frame[by_var].apply(
        lambda x: enum._vars[x])
    dist_frame.pop(by_var)
    self.distribution_view.set_dataframe(dist_frame)
    self.distribution_view.reset()
    self.calculated()
    QApplication.restoreOverrideCursor()
def get_contributions(self): data = [] columns = [ 'mp-id', 'contribution', 'kohn-sham_bandgap(indirect)', 'kohn-sham_bandgap(direct)', 'derivative_discontinuity', 'quasi-particle_bandgap(indirect)', 'quasi-particle_bandgap(direct)' ] docs = self.query_contributions( criteria=self.dtu_query, projection={ '_id': 1, 'mp_cat_id': 1, 'content.kohn-sham_bandgap.indirect': 1, 'content.kohn-sham_bandgap.direct': 1, 'content.derivative_discontinuity': 1, 'content.quasi-particle_bandgap.indirect': 1, 'content.quasi-particle_bandgap.direct': 1 }) if not docs: raise Exception('No contributions found for DTU Explorer!') for doc in docs: mpfile = MPFile.from_contribution(doc) mp_id = mpfile.ids[0] contrib = mpfile.hdata[mp_id] cid_url = '/'.join([ self.preamble.rsplit('/', 1)[0], 'explorer', 'materials', doc['_id'] ]) row = [ mp_id, cid_url, contrib['kohn-sham_bandgap']['indirect'], contrib['kohn-sham_bandgap']['direct'], contrib['derivative_discontinuity'], contrib['quasi-particle_bandgap']['indirect'], contrib['quasi-particle_bandgap']['direct'] ] data.append((mp_id, row)) return DataFrame.from_items(data, orient='index', columns=columns)
def display_restriction_sites(sequence):
    """This function takes in a sequence, reads it using the read_fasta function
    from last week's assignment, and then uses a regular expression to define a
    restriction sequence. Using the DataFrame and other statistical tools from
    pandas, it fills a 4-by-7 frequency table with the distribution of
    nucleotides across all the restriction sites found in the given DNA sequence.

    The parameter sequence: any FASTA file that contains a single string of DNA.
    """
    # find the hits
    sequence1 = read_fasta(sequence)
    recognition_sequence = r'[AG]GG[AGTC]CC[CT]'
    found = re.findall(recognition_sequence, sequence1)
    from pandas import Series, DataFrame
    import numpy
    # convert found from a list to a Series (a one-dimensional array) for the next step
    s = Series(found)
    # create an empty 4 x 7 array; it is filled by counting the number of A, C, G and T
    # nucleotides at each position over all occurrences of the DraII site
    frequency_matrix = numpy.zeros((4, len(s[0])), dtype=numpy.int)
    # dictionary mapping A, C, G and T to rows 0, 1, 2 and 3 of the array
    base2index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # iterate over every occurrence of the DraII restriction site in the DNA sequence
    for sequence in s:
        # enumerate() adds a counter to an iterable and returns it as an enumerate object
        for index, base in enumerate(sequence):
            # tally each nucleotide of the found site into the frequency table
            frequency_matrix[base2index[base]][index] += 1
    Data = DataFrame.from_items(
        [('A', frequency_matrix[0, ]), ('C', frequency_matrix[1, ]),
         ('G', frequency_matrix[2, ]), ('T', frequency_matrix[3, ])],
        orient='index',
        columns=['one', 'two', 'three', 'four', 'five', 'six', 'seven'])
    # convert the frequency matrix into a position weight matrix by dividing each
    # element of a column by the sum of that column
    Data_new = Data.loc[:, "one":"seven"].div(Data.sum(axis=0, skipna=True))
    return Data_new
data[9], data[10], data[11], data[12], data[13], ], ) ) ##append tuple to list print dt except: print "skipped day" pass else: print "passed" pass frame = DataFrame.from_items(datalist, orient="index", columns=columns) frame.columns = columns frame = frame.applymap(lambda x: np.nan if x == "-9999" else x) datafile = frame.to_csv("C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTU/NSTU-current_10_28.csv") #### Append all ##files = os.listdir('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/') ##alldata = open('C:/Users/Alex/Desktop/samoa/WATERSHED_ANALYSIS/BarometricData/NSTP6/'+'2013.txt','w') ##for f in files: ## if f.endswith('.csv')==True: ## print f ## with open(f,'wb') as csvfile: ## data=csv.reader(csvfile,dialect='excel') ## for row in data: ## alldatata.write(row)
import pandas
from pandas import DataFrame

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

stoplist = stopwords.words('english')

seasons, episode_ids = [], []
with open("data/import/episodes.csv", "r") as episodesfile:
    reader = csv.reader(episodesfile, delimiter=",")
    reader.next()
    for row in reader:
        seasons.append(int(row[3]))
        episode_ids.append(int(row[0]))

df = DataFrame.from_items([('Season', seasons), ('EpisodeId', episode_ids)])
last_episode_in_season = list(df.groupby("Season").max()["EpisodeId"])

print "process sentences"
episodes = defaultdict(list)
with open("data/import/sentences.csv", "r") as sentencesfile:
    reader = csv.reader(sentencesfile, delimiter=",")
    reader.next()
    for row in reader:
        episodes[row[1]].append([
            word
            for word in nltk.word_tokenize(row[4].lower())
            if word not in string.punctuation and word not in stoplist
        ])

texts = []
def add_var2(self, varname, target=None, source = 'free'): """ Add a variable in the dataframe Parameters ---------- varname : str name of the variable target : float target for the margin of the variable source : str, default 'free' database source """ w_init = self.weights_init*self.champm w = self.weights*self.champm inputs = self.simulation.survey output_table = self.simulation.output_table varcol = self.simulation.get_col(varname) entity = self.entity enum = inputs.column_by_name.get('qui'+self.entity).enum people = [x[1] for x in enum] if varname in inputs.column_by_name: value = inputs.get_value(varname, index = idx) elif output_table is not None and varname in output_table.column_by_name: value = output_table.get_value(varname, index = idx, opt = people, sum_ = True) label = varcol.label # TODO: rewrite this using pivot table items = [ ('marge' , w[self.champm] ), ('marge initiale' , w_init[self.champm] )] if varcol.__class__ in MODCOLS: items.append(('mod', value[self.champm])) df = DataFrame.from_items(items) res = df.groupby('mod', sort= True).sum() else: res = DataFrame(index = ['total'], data = {'marge' : (value*w).sum(), 'marge initiale' : (value*w_init).sum() } ) res.insert(0, u"modalités",u"") res.insert(2, "cible", 0) res.insert(2, u"cible ajustée", 0) res.insert(4, "source", source) mods = res.index if target is not None: if len(mods) != len(target.keys()): drop_indices = [ (varname, mod) for mod in target.keys()] if source == 'input': self.input_margins_df.drop(drop_indices, inplace=True) self.input_margins_df.index.names = ['var','mod'] if source == 'output': self.output_margins_df.drop(drop_indices, inplace=True) self.output_margins_df.index.names = ['var','mod'] return if isinstance(varcol, EnumCol): if varcol.enum: enum = varcol.enum res[u'modalités'] = [enum._vars[mod] for mod in mods] res['mod'] = mods else: res[u'modalités'] = [mod for mod in mods] res['mod'] = mods elif isinstance(varcol, BoolCol): res[u'modalités'] = bool(mods) res['mod'] = mods elif isinstance(varcol, IntCol): res[u'modalités'] = mods res['mod'] = mods elif isinstance(varcol, AgeCol): res[u'modalités'] = mods res['mod'] = mods else: res[u'modalités'] = "total" res['mod'] = 0 if label is not None: res['variable'] = label else: res['variable'] = varname res['var'] = varname if target is not None: for mod, margin in target.iteritems(): if mod == varname: # dirty to deal with non catgorical data res['cible'][0] = margin else: res['cible'][mod] = margin if self.frame is None: self.frame = res else: self.frame = concat([self.frame, res]) self.frame = self.frame.reset_index(drop=True)
theta = compute_theta(data)  # M-step
print("Run %d produced theta of:" % i)
print_theta(theta)
#log_likelihood(data, theta)

#===============================================
#TODO: infer variables and state sizes from data
nodes = ['T', 'E1', 'E2', 'E3', 'E4']
N = len(nodes)

# create a blank adjacency matrix, then
# set the directed edges. each row (node)
# should have a 1 in the column of each parent.
adj = DataFrame.from_items(
    [(node, Series(np.zeros(N, int))) for node in nodes]
)
adj.index = nodes
adj.ix['E1', 'T'] = 1
adj.ix['E2', 'T'] = 1
adj.ix['E3', 'T'] = 1
adj.ix['E4', 'T'] = 1
print(adj)

# specify the TRUE joint distribution, theta. specified as a
# dict of node -> cpt, where each cpt is a dict
# of comma-separated values of the ordered parents -> prob
theta = {}
theta['T'] = {'': {0: 0.75, 1: 0.25}}
theta['E1'] = {'0': {0: 0.45, 1: 0.55},
               '1': {0: 0.05, 1: 0.95}, }
theta['E2'] = {'0': {0: 0.40, 1: 0.60},
               '1': {0: 0.05, 1: 0.95}, }
theta['E3'] = {'0': {0: 0.50, 1: 0.50},
               '1': {0: 0.10, 1: 0.90}, }
def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) result.dtypes str(result) # assignment # GH 3687 arr = np.random.randn(3, 2) idx = lrange(2) df = DataFrame(arr, columns=['A', 'A']) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) idx = date_range('20130101', periods=4, freq='Q-NOV') df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['a', 'a', 'a', 'a']) df.columns = idx expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['foo', 'bar', 'foo', 'hello']) df['string'] = 'bah' expected = DataFrame( [[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']], columns=['foo', 'bar', 'foo', 'hello', 'string']) check(df, expected) with assertRaisesRegexp(ValueError, 'Length of value'): df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) # insert same dtype df['foo2'] = 3 expected = DataFrame( [[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], [2, 1, 3, 5, 'bah', 3]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # set (non-dup) df['foo2'] = 4 expected = DataFrame( [[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], [2, 1, 3, 5, 'bah', 4]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) df['foo2'] = 3 # delete (non dup) del df['bar'] expected = DataFrame( [[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]], columns=['foo', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # try to delete again (its not consolidated) del df['hello'] expected = DataFrame( [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # consolidate df = df.consolidate() expected = DataFrame( [[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # insert df.insert(2, 'new_col', 5.) expected = DataFrame( [[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'string', 'foo2']) check(df, expected) # insert a dup assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) df.insert(2, 'new_col', 4., allow_duplicates=True) expected = DataFrame( [[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3], [2, 3, 4., 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2']) check(df, expected) # delete (dup) del df['foo'] expected = DataFrame( [[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]], columns=['new_col', 'new_col', 'string', 'foo2']) assert_frame_equal(df, expected) # dup across dtypes df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], columns=['foo', 'bar', 'foo', 'hello']) check(df) df['foo2'] = 7. 
expected = DataFrame( [[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) result = df['foo'] expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], columns=['foo', 'foo']) check(result, expected) # multiple replacements df['foo'] = 'string' expected = DataFrame( [['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) del df['foo'] expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'hello', 'foo2']) check(df, expected) # values df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) self.assertTrue((result == expected).all().all()) # rename, GH 4403 df4 = DataFrame( { 'TClose': [22.02], 'RT': [0.0454], 'TExg': [0.0422] }, index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) df5 = DataFrame( { 'STK_ID': [600809] * 3, 'RPT_Date': [20120930, 20121231, 20130331], 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], 'TClose': [38.05, 41.66, 30.01] }, index=MultiIndex.from_tuples([(600809, 20120930), (600809, 20121231), (600809, 20130331)], names=['STK_ID', 'RPT_Date'])) k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) result = k.rename(columns={ 'TClose_x': 'TClose', 'TClose_y': 'QT_Close' }) str(result) result.dtypes expected = (DataFrame( [[0.0454, 22.02, 0.0422, 20130331, 600809, u('饡驦'), 30.01]], columns=[ 'RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name', 'QT_Close' ]).set_index(['STK_ID', 'RPT_Date'], drop=False)) assert_frame_equal(result, expected) # reindex is invalid! df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) self.assertRaises(ValueError, df.reindex, columns=['bar']) self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo']) # drop df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) result = df.drop(['a'], axis=1) expected = DataFrame([[1], [1], [1]], columns=['bar']) check(result, expected) result = df.drop('a', axis=1) check(result, expected) # describe df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['bar', 'a', 'a'], dtype='float64') result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'A']) for index in [df.index, pd.Index(list('edcba'))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) expected_df = DataFrame.from_items([('A', expected_ser), ('B', this_df['B']), ('A', expected_ser)]) this_df['A'] = index check(this_df, expected_df) # operations for op in ['__add__', '__mul__', '__sub__', '__truediv__']: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) expected.columns = ['A', 'A'] df.columns = ['A', 'A'] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) df['that'] = 1.0 check(df, expected) df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) expected = DataFrame(1, index=range(5), columns=['that', 'that']) df['that'] = 1 check(df, expected)
def compute(self): """ Compute inequality dataframe """ output = self.simulation.output_table final_df = None WEIGHT = model.WEIGHT FILTERING_VARS = model.FILTERING_VARS for varname, entities in self.vars.iteritems(): for entity in entities: #idx = output.index[entity] val = output.get_value(varname, entity) weights = output._inputs.get_value(WEIGHT, entity) filter_var_name = FILTERING_VARS[0] filter_var= output._inputs.get_value(filter_var_name, entity) items = [] # Compute mean moy = (weights*filter_var*val).sum()/(weights*filter_var).sum() items.append( ("Moyenne", [moy])) # Compute deciles labels = range(1,11) method = 2 decile, values = mark_weighted_percentiles(val, labels, weights*filter_var, method, return_quantiles=True) labels = [ 'D'+str(d) for d in range(1,11)] del decile for l, v in zip(labels[:-1],values[1:-1]): items.append( (l, [v])) # Compute Gini gini_coeff = gini(val, weights*filter_var) items.append( ( _("Gini index"), [gini_coeff])) df = DataFrame.from_items(items, orient = 'index', columns = [varname]) df = df.reset_index() if final_df is None: final_df = df else: final_df = final_df.merge(df, on='index') final_df[u"Initial à net"] = (final_df['nivvie_net']-final_df['nivvie_ini'])/final_df['nivvie_ini'] final_df[u"Net à disponible"] = (final_df['nivvie']-final_df['nivvie_net'])/final_df['nivvie_net'] final_df = final_df[['index','nivvie_ini', u"Initial à net", 'nivvie_net',u"Net à disponible",'nivvie']] self.inequality_dataframe = final_df # poverty poverty = dict() entity = "men" varname = "nivvie" for percentage in [ 40, 50, 60]: # idx = output.index[entity] varname = "pauvre" + str(percentage) val = output.get_value(varname, entity) weights = output._inputs.get_value(WEIGHT, entity) filter_var_name = FILTERING_VARS[0] filter_var= output._inputs.get_value(filter_var_name, entity) poverty[percentage] = (weights*filter_var*val).sum()/(weights*filter_var).sum() self.poverty = poverty
def _run_comparison(self, random_trials=100, sig_threshold=0.05, debug=False): self.log( 'Running ' + str(random_trials) + ' random sets for each term and comparing them.') dens = dict() significant_terms = 0 n = 0 # Use only 25 terms for testing purposes if debug: ont_loci = list(self.ont_loci.items())[:25] else: ont_loci = list(self.ont_loci.items()) # Iterate through all terms for term, loci in ont_loci: # Log how many teerms are done, to ensure people it hasn't crashed if n % 100 == 0: self.log('Compared {}/{} terms so far.', n, len(ont_loci)) real_density = self.cob.density(loci) dens[term] = [real_density] # Run the random samples loci_count = len(loci) loci_tot_list = self.cob.refgen.random_genes( loci_count*random_trials ) scores = [] aboves = 0 for x in range(random_trials): # Get the random genes loci_list = [loci_tot_list.pop() for x in range(loci_count)] # Find the density and save it score = self.cob.density(loci_list) if score >= real_density: aboves += 1 scores.append(score) # Add on the states from the scores dens[term].append(np.mean(scores)) dens[term].append(np.std(scores)) dens[term].append(aboves) # Figure out if that makes it significant if dens[term][-1] <= (random_trials*sig_threshold): dens[term].append(1) significant_terms += 1 else: dens[term].append(0) n += 1 self.log('Compared all {} terms.', n) # Convert the dict to a DataFrame ans = DataFrame.from_items( dens.items(), columns=[ self.cob.name+' Density', 'Random Density Mean', 'Random STD', 'Items >= '+self.cob.name, 'Significant' ], orient='index' ) self.log('Number of Significant Terms: ' + str(significant_terms)) self.log('Number Random Significants Expected: '+str(len(dens)*0.05)) return ans
def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) result.dtypes str(result) # assignment # GH 3687 arr = np.random.randn(3, 2) idx = lrange(2) df = DataFrame(arr, columns=['A', 'A']) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) idx = date_range('20130101', periods=4, freq='Q-NOV') df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['a', 'a', 'a', 'a']) df.columns = idx expected = DataFrame( [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['foo', 'bar', 'foo', 'hello']) df['string'] = 'bah' expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']], columns=['foo', 'bar', 'foo', 'hello', 'string']) check(df, expected) with assertRaisesRegexp(ValueError, 'Length of value'): df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) # insert same dtype df['foo2'] = 3 expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], [2, 1, 3, 5, 'bah', 3]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # set (non-dup) df['foo2'] = 4 expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], [2, 1, 3, 5, 'bah', 4]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) df['foo2'] = 3 # delete (non dup) del df['bar'] expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]], columns=['foo', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # try to delete again (its not consolidated) del df['hello'] expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # consolidate df = df.consolidate() expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # insert df.insert(2, 'new_col', 5.) expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'string', 'foo2']) check(df, expected) # insert a dup assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) df.insert(2, 'new_col', 4., allow_duplicates=True) expected = DataFrame([[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3], [2, 3, 4., 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2']) check(df, expected) # delete (dup) del df['foo'] expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]], columns=['new_col', 'new_col', 'string', 'foo2']) assert_frame_equal(df, expected) # dup across dtypes df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], columns=['foo', 'bar', 'foo', 'hello']) check(df) df['foo2'] = 7. 
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                         columns=['bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    self.assertTrue((result == expected).all().all())

    # rename, GH 4403
    df4 = DataFrame({'TClose': [22.02], 'RT': [0.0454], 'TExg': [0.0422]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    df5 = DataFrame({'STK_ID': [600809] * 3,
                     'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930), (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            u('饡驦'), 30.01]],
                          columns=['RT', 'TClose', 'TExg', 'RPT_Date',
                                   'STK_ID', 'STK_Name', 'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    self.assertRaises(ValueError, df.reindex, columns=['bar'])
    self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        expected_df = DataFrame.from_items([('A', expected_ser),
                                            ('B', this_df['B']),
                                            ('A', expected_ser)])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])
    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])
    df['that'] = 1
    check(df, expected)
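The test above leans on the fact that selecting a duplicated column label returns a DataFrame rather than a Series. A minimal standalone illustration, independent of the test fixtures:

import pandas as pd

# Two columns share the label 'foo'.
df = pd.DataFrame([[1, 10, 1.5], [2, 20, 2.5]], columns=['foo', 'bar', 'foo'])

# With duplicate labels, df['foo'] yields a two-column DataFrame, not a Series.
print(type(df['foo']))
print(df['foo'])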
from pandas import DataFrame


def is_popular(row):
    if row['fb_likes'] >= 2000:
        val = 1
    else:
        val = 0
    return val


data = DataFrame.from_items(
    [('category', ['Entertainment', 'Lifestyle', 'Technology']),
     ('fb_likes', [2349, 1299, 6589])])
data['is_popular'] = data.apply(is_popular, axis=1)

train = data.sample(frac=0.8, random_state=100)
test = data.drop(train.index)

x_train = train.ix[:, :-1]
x_train_target = train.ix[:, -1:]
y_test = test.ix[:, :-1]
y_test_target = test.ix[:, -1:]

print x_train
print x_train_target

# Just use x_train.values and x_train_target.values (same with test data)
# before fitting
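On current pandas (1.0 and later) both DataFrame.from_items and the .ix indexer are gone, so the script above only runs on older releases. A Python 3 sketch of the same example on current pandas, using from_dict, a vectorized comparison in place of the apply-based helper, and positional iloc slicing; the data values are the ones from the snippet above:

from pandas import DataFrame

data = DataFrame.from_dict({
    'category': ['Entertainment', 'Lifestyle', 'Technology'],
    'fb_likes': [2349, 1299, 6589],
})
# Vectorized equivalent of the row-wise is_popular helper.
data['is_popular'] = (data['fb_likes'] >= 2000).astype(int)

train = data.sample(frac=0.8, random_state=100)
test = data.drop(train.index)

# Positional slicing replaces the removed .ix indexer.
x_train = train.iloc[:, :-1]
x_train_target = train.iloc[:, -1:]
y_test = test.iloc[:, :-1]
y_test_target = test.iloc[:, -1:]

print(x_train)
print(x_train_target)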
def run_system(model, system, group, intraday=False, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The system to run.
    group : alphapy.Group
        The group of symbols to trade.
    intraday : bool, optional
        If true, this is an intraday system.
    quantity : float, optional
        The amount to trade for each symbol, e.g., number of shares.

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.
    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.
    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group
    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        tlist = trade_system(model, system, gspace, intraday, symbol, quantity)
        if tlist:
            # add trades to global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame
    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        tf = DataFrame.from_items(gtlist, orient='index', columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        labels = ['date']
        if intraday:
            labels.append('time')
        write_frame(tf, system_dir, tfname, extension, separator,
                    index=True, index_label=labels)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
import json
import sys

from pandas import DataFrame

filename = sys.argv[1]
data = json.load(open(filename))

frame_data = [(k, (len(v['following']), len(v['followers'])))
              for k, v in data.items()]
frame = DataFrame.from_items(frame_data, orient='index',
                             columns=['following', 'followers'])

print frame

print '## Following'
print frame['following'].describe()
print frame.sort('following', ascending=False)[:10]

print '## Followers'
print frame['followers'].describe()
print frame.sort('followers', ascending=False)[:10]
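DataFrame.sort was deprecated and later removed in favour of sort_values, and from_items is gone as noted earlier, so here is a Python 3 sketch of the same report on current pandas; the JSON layout (one entry per user with 'following' and 'followers' lists) is assumed to match the script above:

import json
import sys

from pandas import DataFrame

with open(sys.argv[1]) as fh:
    data = json.load(fh)

# One row per user, counting followed and follower accounts.
frame = DataFrame.from_dict(
    {k: (len(v['following']), len(v['followers'])) for k, v in data.items()},
    orient='index', columns=['following', 'followers'])

for col in ['following', 'followers']:
    print('##', col.capitalize())
    print(frame[col].describe())
    # sort_values replaces the removed DataFrame.sort
    print(frame.sort_values(col, ascending=False)[:10])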
def compute(self):
    """
    Compute inequality dataframe
    """
    final_df = None
    simulation = self.survey_scenario.new_simulation()
    column_by_name = simulation.tax_benefit_system.column_by_name

    # amounts and beneficiaries from current data and default data if they exist
    # Build weights for each entity
    from openfisca_france_data import FILTERING_VARS

    for varname, entities in self.vars.iteritems():
        for entity_key_plural in entities:
            column = column_by_name[varname]
            weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
            filter_by = FILTERING_VARS[0]
            filter_by_name = FILTERING_VARS[0]
            if column.entity_key_plural != 'menages':
                filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)

            val = simulation.calculate(varname)
            weights = simulation.calculate(weight_name)
            filter_var = simulation.calculate(filter_by_name)

            items = []
            # Compute mean
            moy = (weights * filter_var * val).sum() / (weights * filter_var).sum()
            items.append(("Moyenne", [moy]))

            # Compute deciles
            labels = range(1, 11)
            method = 2
            decile, values = mark_weighted_percentiles(
                val, labels, weights * filter_var, method, return_quantiles = True)
            labels = ['D' + str(d) for d in range(1, 11)]
            del decile
            for l, v in zip(labels[:-1], values[1:-1]):
                items.append((l, [v]))

            # Compute Gini
            gini_coeff = gini(val, weights * filter_var)
            items.append((_("Gini index"), [gini_coeff]))

            df = DataFrame.from_items(items, orient = 'index', columns = [varname])
            df = df.reset_index()
            if final_df is None:
                final_df = df
            else:
                final_df = final_df.merge(df, on = 'index')

    final_df[u"Initial à net"] = (
        (final_df['nivvie_net'] - final_df['nivvie_ini']) / final_df['nivvie_ini'])
    final_df[u"Net à disponible"] = (
        (final_df['nivvie'] - final_df['nivvie_net']) / final_df['nivvie_net'])
    final_df = final_df[
        ['index', 'nivvie_ini', u"Initial à net", 'nivvie_net',
         u"Net à disponible", 'nivvie']]
    self.inequality_data_frame = final_df

    # Poverty
    poverty = dict()
    for percentage in [40, 50, 60]:
        varname = "pauvre{}".format(percentage)
        column = column_by_name[varname]
        weight_name = self.survey_scenario.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by = FILTERING_VARS[0]
        filter_by_name = FILTERING_VARS[0]
        if column.entity_key_plural != 'menages':
            filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)

        val = simulation.calculate(varname)
        weights = simulation.calculate(weight_name)
        filter_var = simulation.calculate(filter_by_name)
        poverty[percentage] = (weights * filter_var * val).sum() / (weights * filter_var).sum()

    self.poverty = poverty
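The gini helper used in compute() is imported from elsewhere and not shown. Purely as an illustration, here is a common way to compute a weighted Gini coefficient consistent with the weighted mean and weighted deciles above; this is an assumed stand-in, not the actual openfisca implementation:

import numpy as np


def weighted_gini(values, weights):
    """Illustrative weighted Gini coefficient (assumed stand-in for the
    gini() helper above): one minus twice the area under the Lorenz curve,
    approximated with the trapezoidal rule."""
    order = np.argsort(values)
    v = np.asarray(values, dtype=float)[order]
    w = np.asarray(weights, dtype=float)[order]
    # Cumulative population share and cumulative income share, both
    # prepended with the Lorenz curve's origin (0, 0).
    p = np.concatenate(([0.0], np.cumsum(w) / w.sum()))
    lorenz = np.concatenate(([0.0], np.cumsum(v * w) / (v * w).sum()))
    return 1.0 - 2.0 * np.trapz(lorenz, p)


# Quick sanity check: one person holds everything -> Gini of 0.5 for n = 2.
print(weighted_gini([0.0, 1.0], [1.0, 1.0]))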