def commodityTonnage_by_level(d, outdir, level): name = '-'.join(level) outpath = path.join(outdir, 'tonnage_by_{}.csv'.format(name)) if replace or not path.exists(outpath): dr = bz.by(bz.merge(d[level], d.commodity, d.year, d.month, d.category), commodityTonnage=d.commodityTonnage.sum()) do = bz.by(bz.merge(d[level], d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum()) #d[[level, 'commodity']] save(do, outpath, replace) content, by_year_month = commodityTonnage_over_time(dr, outdir, level) commodityTonnage_by_year(dr, outdir, level) if content: commodityTonnage_by_month(by_year_month, outdir, level) return
def build_training_set( rundb: str, topicsfile: str, index_path: str, axioms: typing.Sequence[PairwiseAxiom], max_rank=100): queries_by_id, rd, ctx = initialize(rundb, topicsfile, index_path, max_rank) system_query = rd[['system', 'query']].distinct().sort('query') from blaze import by, merge ranking_lengths = by(merge(rd.system, rd.query), n=rd.rank.count()).n cpair_count = int((ranking_lengths * ranking_lengths - ranking_lengths).sum()) iter_count = cpair_count * len(axioms) * 2 pbar = tqdm.tqdm(total=iter_count) def loop(i): for item in i: pbar.update() yield item for sys, qid in system_query: sqrun = rd[(rd['system'] == sys) & (rd['query'] == qid)] ranking = [ctx.c.get_document(did[0]) for did in sqrun.docid] query = queries_by_id[qid] part = build_training_set_for_one_ranking(ctx, axioms, query, sys, ranking, loop) yield part
def _odo_object(self): from blaze import by, merge, head table = self.binded_table if self.whereclauses: wheres = bind_list(self, self.whereclauses) table = table[reduce(lambda x, y: x and y, wheres)] tb = self.tables[self.table.name] self.tables[self.table.name] = table columns = bind_list(self, self.columns) or [table[_] for _ in table.fields] self.tables[self.table.name] = tb if self.groupclauses: groups = bind_list(self, self.groupclauses) groups = [table[_.fields[0]] for _ in groups] names = [_.fields[0] for _ in groups] groups = merge(*groups) if len(groups) > 1 else groups[0] table = by( groups, **{ c.fields[0]: c for c in columns if c.fields[0] not in names }) if self.orderclauses: orders = bind_list(self, self.orderclauses) for order in orders.reverse(): table = table.sort(*order) if self.limit: table = head(table, self.limit) return table[[_.fields[0] for _ in columns]]
def commodityTonnage_by_year(d, outdir, level=None): df = odo.odo(d, pd.DataFrame) d = bz.Data(df, d.dshape) if level: if isinstance(level, list): expr = [l for l in level] else: expr = level name = '-'.join(level) outpath = path.join(outdir, 'tonnage_by_{}_year.csv'.format(name)) if replace or not path.exists(outpath): do = bz.by(bz.merge(d[expr], d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum()) save(do, outpath, replace) else: outpath = path.join(outdir, 'tonnage_by_year.csv') if replace or not path.exists(outpath): do = bz.by(bz.merge(d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum()) save(do, outpath, replace) return
def test_foreign_key_merge_expression(fkey): from blaze import merge t = symbol('fkey', discover(fkey)) expr = merge(t.sym_id.sym, t.sym_id.main.data) expected = """ select pkey.sym, main.data from fkey, pkey, main where fkey.sym_id = pkey.id and pkey.main = main.id """ result = compute(expr, fkey, return_type='native') assert normalize(str(result)) == normalize(expected)
def test_foreign_key_merge_expression(fkey): from blaze import merge t = symbol('fkey', discover(fkey)) expr = merge(t.sym_id.sym, t.sym_id.main.data) expected = """ select pkey.sym, main.data from fkey, pkey, main where fkey.sym_id = pkey.id and pkey.main = main.id """ result = compute(expr, fkey) assert normalize(str(result)) == normalize(expected)
def delta(data1, col1, data2, col2, *args): # TODO: validate that they are of the same length etc d = merge(data1, data2) for feature_class in args: f = feature_class() name = type(f).__name__ new_col = "delta_" + col1 + "_" + col2 + "_" + name s = "transform(d, " + new_col + " = data1." + col1 + "_" + name + " - data2." + col2 + "_" + name + ")" d2 = eval(s) # Pardonnez-moi! d = d2 print "Adding " + new_col return d
def multiply(data, col, *args): d = None for feature_class in args: f = feature_class() new_col = col + "_" + type(f).__name__ d2 = data[col].map(f.value_fn(), f.schema_type()) d2 = d2.relabel(**{col: new_col}) if d is None: print 'D is not here' d = d2 else: d = merge(d, d2) print 'D is here' print "Adding " + new_col return d
def test_merge(rdd): col = (t['amount'] * 2).label('new') expr = merge(t['name'], col) assert compute(expr, rdd).collect() == [ (row[0], row[1] * 2) for row in data]
def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit, _sorts, _count, _q): import blaze as bz import datashape # TODO: Not caching blaze connections parameters = self.params.get('parameters', {}) bzcon = bz.Data( self.params['url'] + ('::' + self.params['table'] if self.params.get('table') else ''), **parameters) table = bz.Symbol('table', bzcon.dshape) columns = table.fields query = table if _wheres: wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)') wheres = None for where in _wheres: match = wh_re.search(where) if match is None: continue col, oper, val = match.groups() col = table[col] if oper in ['==', '=']: whr = (col == val) elif oper == '>=': whr = (col >= val) elif oper == '<=': whr = (col <= val) elif oper == '>': whr = (col > val) elif oper == '<': whr = (col < val) elif oper == '!=': whr = (col != val) elif oper == '~': whr = (col.like('*' + val + '*')) elif oper == '!~': whr = (~col.like('*' + val + '*')) wheres = whr if wheres is None else wheres & whr query = query if wheres is None else query[wheres] alias_cols = [] if _groups and _aggs: byaggs = { 'min': bz.min, 'max': bz.max, 'sum': bz.sum, 'count': bz.count, 'mean': bz.mean, 'nunique': bz.nunique } agg_re = re.compile(r'([^:]+):([aA-zZ]+)\(([^:]+)\)') grps = bz.merge(*[query[group] for group in _groups]) aggs = {} for agg in _aggs: match = agg_re.search(agg) if match is None: continue name, oper, col = match.groups() alias_cols.append(name) aggs[name] = byaggs[oper](query[col]) query = bz.by(grps, **aggs) if _q: wheres = None for col in columns: if isinstance(table[col].dshape.measure.ty, datashape.coretypes.String): whr = table[col].like('*' + _q + '*') wheres = whr if wheres is None else wheres | whr if wheres is not None: query = query[wheres] count_query = query.count() if _sorts: order = {'asc': True, 'desc': False} sorts = [] for sort in _sorts: col, odr = sort.partition(':')[::2] if col not in columns + alias_cols: continue sorts.append(col) if sorts: query = query.sort(sorts, ascending=order.get(odr, True)) if _offset: _offset = int(_offset) if _limit: _limit = int(_limit) if _offset and _limit: _limit += _offset if _offset or _limit: query = query[_offset:_limit] if _selects: query = query[_selects] # TODO: Improve json, csv, html outputs using native odo result = { 'query': query, 'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame), } if _count: count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame) result['count'] = count.iloc[0, 0] return result
def blaze_tutorial(): accounts = bl.Symbol('accounts', 'var * {id: int, name: string, amount: int}') deadbeats = accounts[accounts.amount < 0].name list_ = [[1, 'Alice', 100], [2, 'Bob', -200], [3, 'Charlie', 300], [4, 'Denis', 400], [5, 'Edith', -500]] print(list(bl.compute(deadbeats, list_))) df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount']) print(bl.compute(deadbeats, df_)) bl_df_dir = dir(df_) df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount']) print(df_[df_.amount < 0].name) pd_df_dir = dir(df_) print(len(bl_df_dir), len(pd_df_dir)) print(len([d for d in bl_df_dir if d in pd_df_dir])) print([d for d in bl_df_dir if d not in pd_df_dir]) print([d for d in pd_df_dir if d not in bl_df_dir]) df_ = bl.Data([(1, 'Alice', 100), (2, 'Bob', -200), (3, 'Charlie', 300), (4, 'Denis', 400), (5, 'Edith', -500)], fields=['id', 'name', 'balance']) print(repr(df_)) print(repr(df_[df_.balance < 0])) print(repr(df_[df_.balance < 0].name)) print(list(df_[df_.balance < 0].name)) iris = bl.Data('examples/iris.csv') print(repr(iris)) iris = bl.Data('sqlite:///examples/iris.db::iris') print(repr(iris)) print(repr(bl.by(iris.species, min=iris.petal_width.min(), max=iris.petal_width.max()))) result = bl.by(iris.species, min=iris.petal_width.min(), max=iris.petal_width.max()) print(odo(result, bl.DataFrame)) print(odo(result, pd.DataFrame)) ### odo has weird issue with unicode filenames, apparently... name = 'output.csv' print(odo(result, bl.CSV(name))) print(repr(iris.sepal_length.mean())) print(repr(bl.mean(iris.sepal_length))) print(repr(bl.by(iris.species, shortest=iris.petal_length.min(), longest=iris.petal_length.max(), average=iris.petal_length.mean()))) print(repr(iris.head())) iris = bl.transform(iris, sepal_ratio=iris.sepal_length / iris.sepal_width, petal_ratio=iris.petal_length / iris.petal_width) print(repr(iris.head())) versicolor = iris[iris.species.like('%versicolor')] print(repr(versicolor)) print((len(versicolor), len(versicolor.fields))) print(repr(iris.relabel(petal_length='PETAL-LENGTH', petal_width='PETAL-WIDTH'))) pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'], 'amount': [100, 200, 300, 400], 'id': [1, 2, 3, 4]}) # put the `df` DataFrame odo a Blaze Data object bl_df = bl.DataFrame(pd_df) bl_dt = bl.Data(pd_df) print(repr(pd_df.amount * 2)) print(repr(bl_df.amount * 2)) print(repr(bl_dt.amount * 2)) print(repr(pd_df[['id', 'amount']])) print(repr(bl_df[['id', 'amount']])) print(repr(bl_dt[['id', 'amount']])) print(repr(pd_df[pd_df.amount > 300])) print(repr(bl_df[bl_df.amount > 300])) print(repr(bl_dt[bl_dt.amount > 300])) print(repr(pd_df.groupby('name').amount.mean())) print(repr(pd_df.groupby(['name', 'id']).amount.mean())) print(repr(bl_df.groupby('name').amount.mean())) print(repr(bl_df.groupby(['name', 'id']).amount.mean())) print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean()))) print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id), amount=bl_dt.amount.mean()))) #pd.merge(pd_df, pd_df2, on='name') #bl.join(bl_dt, bl_dt2, 'name') print(repr(pd_df.amount.map(lambda x: x + 1))) print(repr(bl_df.amount.map(lambda x: x + 1))) print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64'))) print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'}))) print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'}))) print(repr(bl_dt.relabel(name='alias', amount='dollars'))) print(repr(pd_df.drop_duplicates())) print(repr(bl_df.drop_duplicates())) print(repr(bl_dt.distinct())) print(repr(pd_df.name.drop_duplicates())) print(repr(bl_df.name.drop_duplicates())) print(repr(bl_dt.name.distinct())) print(repr(pd_df.amount.mean())) print(repr(bl_df.amount.mean())) print(repr(bl_dt.amount.mean())) print(repr(pd_df)) print(repr(bl_df)) print(repr(bl_dt)) print(repr(pd_df.amount.value_counts()), '\n') print(repr(bl_df.amount.value_counts()), '\n') print(repr(bl_dt.amount.count_values()), '\n') print(repr(pd_df.dtypes), '\n') print(repr(bl_df.dtypes), '\n') print(repr(bl_df.columns), '\n') print(repr(bl_dt.dshape), '\n') print(repr(pd_df.amount.dtypes), '\n') print(repr(bl_df.amount.dtypes), '\n') print(repr(bl_dt.amount.dshape), '\n') print(type(pd_df), type(bl_df), type(bl_dt), '\n') os.remove('output.csv') for fn_ in glob.glob('*.csv.gz'): os.remove(fn_) return
from pipeline import * from operations import * from features import FEATURE_CLASSES translations_and_scores = pump(scores='data/hy.basic.scores.csv') q_features = multiply(translations_and_scores, 'q', *FEATURE_CLASSES) r_features = multiply(translations_and_scores, 'r', *FEATURE_CLASSES) feature_deltas = delta(q_features, 'q', r_features, 'r', *FEATURE_CLASSES) # delta is not commutative, so consider also the reverse: # delta(r_features, q_features, *FEATURE_CLASSES) #print translations_and_scores #print q_features #print r_features #print feature_deltas from blaze import merge, odo everything = merge(translations_and_scores, feature_deltas) odo(everything, 'data/features.csv')
def crosstabs(data, columns=None, values=None, correction=False, pairs_top=10000, details=True): ''' Identifies the strength of relationship between every pair of categorical columns in a DataFrame Parameters ---------- data : Blaze data A data with at least 2 columns having categorical values. columns : list of column names in data If not specified, uses ``autolyse.types(data)['groups']`` to identify all columns with categorical data. values : str, column name Optional column that contains weights to aggregate by summing up. By default, each row is counted as an observation. correction : boolean If True, and the degrees of freedom is 1, apply Yates' correction for continuity. The effect of the correction is to adjust each observed value by 0.5 towards the corresponding expected value. Defaults to False since Cramer's V (a more useful metric than chi-squared) must be computed without this correction. pairs_top: integer, Pick only top 10000 pairs by default details: boolean If True, will return observed and expected dataframes for pairs. Defaults to False. ''' if columns is None: columns = types(data)['groups'] parameters = ('p', 'chi2', 'dof', 'V') for index, column in itertools.combinations(columns, 2): agg_col = values if values in data.fields else column agg_func = bz.count(data[agg_col]) if agg_col == column else bz.sum(data[agg_col]) data_grouped = bz.into(pd.DataFrame, bz.by(bz.merge(data[index], data[column]), values=agg_func) .sort('values') # Generated SQL inefficient .head(pairs_top)) # BUG: bz.count: non-null count, gives 0 count for NULL groups # .nrows needs to fixed blaze/issues/1484 # For now, we'll ignore NULL groups # Remove NULL groups data_grouped = data_grouped.dropna() if data_grouped.empty: result = {(index, column): {}} else: r = _crosstab(data_grouped[index], column=data_grouped[column], values=data_grouped['values'], correction=correction) if details: result = { 'index': index, 'column': column, 'observed': r['observed'].to_json(), 'expected': r['expected'].to_json(), 'stats': {param: r[param] for param in parameters} } else: result = { 'index': index, 'column': column, 'stats': {param: r[param] for param in parameters} } yield result