示例#1
0
def commodityTonnage_by_level(d, outdir, level):
    name = '-'.join(level)
    outpath = path.join(outdir, 'tonnage_by_{}.csv'.format(name))
    if replace or not path.exists(outpath):
        dr = bz.by(bz.merge(d[level], d.commodity, d.year, d.month, d.category), commodityTonnage=d.commodityTonnage.sum())
        do = bz.by(bz.merge(d[level], d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
        #d[[level, 'commodity']]
        save(do, outpath, replace)
        content, by_year_month = commodityTonnage_over_time(dr, outdir, level)
        commodityTonnage_by_year(dr, outdir, level)
        if content:
            commodityTonnage_by_month(by_year_month, outdir, level)
    return
示例#2
0
def build_training_set(
        rundb: str,
        topicsfile: str,
        index_path: str,
        axioms: typing.Sequence[PairwiseAxiom],
        max_rank=100):

    queries_by_id, rd, ctx = initialize(rundb, topicsfile, index_path, max_rank)

    system_query = rd[['system', 'query']].distinct().sort('query')

    from blaze import by, merge
    ranking_lengths = by(merge(rd.system, rd.query), n=rd.rank.count()).n
    cpair_count = int((ranking_lengths * ranking_lengths - ranking_lengths).sum())
    iter_count = cpair_count * len(axioms) * 2

    pbar = tqdm.tqdm(total=iter_count)

    def loop(i):
        for item in i:
            pbar.update()
            yield item


    for sys, qid in system_query:
        sqrun = rd[(rd['system'] == sys) & (rd['query'] == qid)]
        ranking = [ctx.c.get_document(did[0]) for did in sqrun.docid]
        query = queries_by_id[qid]

        part = build_training_set_for_one_ranking(ctx, axioms, query, sys, ranking, loop)

        yield part
示例#3
0
    def _odo_object(self):
        from blaze import by, merge, head

        table = self.binded_table
        if self.whereclauses:
            wheres = bind_list(self, self.whereclauses)
            table = table[reduce(lambda x, y: x and y, wheres)]
        tb = self.tables[self.table.name]
        self.tables[self.table.name] = table
        columns = bind_list(self,
                            self.columns) or [table[_] for _ in table.fields]
        self.tables[self.table.name] = tb
        if self.groupclauses:
            groups = bind_list(self, self.groupclauses)
            groups = [table[_.fields[0]] for _ in groups]
            names = [_.fields[0] for _ in groups]
            groups = merge(*groups) if len(groups) > 1 else groups[0]
            table = by(
                groups, **{
                    c.fields[0]: c
                    for c in columns if c.fields[0] not in names
                })
        if self.orderclauses:
            orders = bind_list(self, self.orderclauses)
            for order in orders.reverse():
                table = table.sort(*order)
        if self.limit:
            table = head(table, self.limit)
        return table[[_.fields[0] for _ in columns]]
示例#4
0
def commodityTonnage_by_year(d, outdir, level=None):
    df = odo.odo(d, pd.DataFrame)
    d = bz.Data(df, d.dshape)
    if level:
        if isinstance(level, list):
            expr = [l for l in level]
        else:
            expr = level
        name = '-'.join(level)
        outpath = path.join(outdir, 'tonnage_by_{}_year.csv'.format(name))
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d[expr], d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    else:
        outpath = path.join(outdir, 'tonnage_by_year.csv')
        if replace or not path.exists(outpath):
            do = bz.by(bz.merge(d.year, d.commodity, d.category), commodityTonnage=d.commodityTonnage.sum())
            save(do, outpath, replace)
    return
示例#5
0
def test_foreign_key_merge_expression(fkey):
    from blaze import merge

    t = symbol('fkey', discover(fkey))
    expr = merge(t.sym_id.sym, t.sym_id.main.data)
    expected = """
        select pkey.sym, main.data
        from
            fkey, pkey, main
        where
            fkey.sym_id = pkey.id and pkey.main = main.id
    """
    result = compute(expr, fkey, return_type='native')
    assert normalize(str(result)) == normalize(expected)
示例#6
0
def test_foreign_key_merge_expression(fkey):
    from blaze import merge

    t = symbol('fkey', discover(fkey))
    expr = merge(t.sym_id.sym, t.sym_id.main.data)
    expected = """
        select pkey.sym, main.data
        from
            fkey, pkey, main
        where
            fkey.sym_id = pkey.id and pkey.main = main.id
    """
    result = compute(expr, fkey)
    assert normalize(str(result)) == normalize(expected)
示例#7
0
def delta(data1, col1, data2, col2, *args):
    # TODO: validate that they are of the same length etc
    d = merge(data1, data2)
    
    for feature_class in args:
        f = feature_class()
        name = type(f).__name__
        new_col = "delta_" + col1 + "_" + col2 + "_" + name

        s = "transform(d, " + new_col + " = data1." + col1 + "_" + name + " - data2." + col2 + "_" + name + ")"
        d2 = eval(s) # Pardonnez-moi!
        
        d = d2
        print "Adding " + new_col
    return d
示例#8
0
def multiply(data, col, *args):
    d = None
    for feature_class in args:
        f = feature_class()
        new_col = col + "_" + type(f).__name__
        d2 = data[col].map(f.value_fn(), f.schema_type())
        d2 = d2.relabel(**{col: new_col})
        if d is None:
            print 'D is not here'
            d = d2
        else:
            d = merge(d, d2)
            print 'D is here'

        print "Adding " + new_col
    return d
示例#9
0
def test_merge(rdd):
    col = (t['amount'] * 2).label('new')
    expr = merge(t['name'], col)

    assert compute(expr, rdd).collect() == [
        (row[0], row[1] * 2) for row in data]
示例#10
0
    def _blaze(self, _selects, _wheres, _groups, _aggs, _offset, _limit,
               _sorts, _count, _q):
        import blaze as bz
        import datashape
        # TODO: Not caching blaze connections
        parameters = self.params.get('parameters', {})
        bzcon = bz.Data(
            self.params['url'] +
            ('::' + self.params['table'] if self.params.get('table') else ''),
            **parameters)
        table = bz.Symbol('table', bzcon.dshape)
        columns = table.fields
        query = table

        if _wheres:
            wh_re = re.compile(r'([^=><~!]+)([=><~!]{1,2})([\s\S]+)')
            wheres = None
            for where in _wheres:
                match = wh_re.search(where)
                if match is None:
                    continue
                col, oper, val = match.groups()
                col = table[col]
                if oper in ['==', '=']:
                    whr = (col == val)
                elif oper == '>=':
                    whr = (col >= val)
                elif oper == '<=':
                    whr = (col <= val)
                elif oper == '>':
                    whr = (col > val)
                elif oper == '<':
                    whr = (col < val)
                elif oper == '!=':
                    whr = (col != val)
                elif oper == '~':
                    whr = (col.like('*' + val + '*'))
                elif oper == '!~':
                    whr = (~col.like('*' + val + '*'))
                wheres = whr if wheres is None else wheres & whr
            query = query if wheres is None else query[wheres]

        alias_cols = []
        if _groups and _aggs:
            byaggs = {
                'min': bz.min,
                'max': bz.max,
                'sum': bz.sum,
                'count': bz.count,
                'mean': bz.mean,
                'nunique': bz.nunique
            }
            agg_re = re.compile(r'([^:]+):([aA-zZ]+)\(([^:]+)\)')
            grps = bz.merge(*[query[group] for group in _groups])
            aggs = {}
            for agg in _aggs:
                match = agg_re.search(agg)
                if match is None:
                    continue
                name, oper, col = match.groups()
                alias_cols.append(name)
                aggs[name] = byaggs[oper](query[col])
            query = bz.by(grps, **aggs)

        if _q:
            wheres = None
            for col in columns:
                if isinstance(table[col].dshape.measure.ty,
                              datashape.coretypes.String):
                    whr = table[col].like('*' + _q + '*')
                    wheres = whr if wheres is None else wheres | whr
            if wheres is not None:
                query = query[wheres]

        count_query = query.count()

        if _sorts:
            order = {'asc': True, 'desc': False}
            sorts = []
            for sort in _sorts:
                col, odr = sort.partition(':')[::2]
                if col not in columns + alias_cols:
                    continue
                sorts.append(col)
            if sorts:
                query = query.sort(sorts, ascending=order.get(odr, True))

        if _offset:
            _offset = int(_offset)
        if _limit:
            _limit = int(_limit)
        if _offset and _limit:
            _limit += _offset
        if _offset or _limit:
            query = query[_offset:_limit]

        if _selects:
            query = query[_selects]

        # TODO: Improve json, csv, html outputs using native odo
        result = {
            'query': query,
            'data': bz.odo(bz.compute(query, bzcon.data), pd.DataFrame),
        }
        if _count:
            count = bz.odo(bz.compute(count_query, bzcon.data), pd.DataFrame)
            result['count'] = count.iloc[0, 0]
        return result
示例#11
0
def blaze_tutorial():
    accounts = bl.Symbol('accounts',
                         'var * {id: int, name: string, amount: int}')
    deadbeats = accounts[accounts.amount < 0].name

    list_ = [[1, 'Alice', 100],
             [2, 'Bob', -200],
             [3, 'Charlie', 300],
             [4, 'Denis', 400],
             [5, 'Edith', -500]]

    print(list(bl.compute(deadbeats, list_)))

    df_ = bl.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(bl.compute(deadbeats, df_))
    bl_df_dir = dir(df_)

    df_ = pd.DataFrame(list_, columns=['id', 'name', 'amount'])
    print(df_[df_.amount < 0].name)
    pd_df_dir = dir(df_)

    print(len(bl_df_dir), len(pd_df_dir))
    print(len([d for d in bl_df_dir if d in pd_df_dir]))
    print([d for d in bl_df_dir if d not in pd_df_dir])
    print([d for d in pd_df_dir if d not in bl_df_dir])

    df_ = bl.Data([(1, 'Alice', 100),
                   (2, 'Bob', -200),
                   (3, 'Charlie', 300),
                   (4, 'Denis', 400),
                   (5, 'Edith', -500)],
                  fields=['id', 'name', 'balance'])

    print(repr(df_))
    print(repr(df_[df_.balance < 0]))

    print(repr(df_[df_.balance < 0].name))
    print(list(df_[df_.balance < 0].name))

    iris = bl.Data('examples/iris.csv')
    print(repr(iris))

    iris = bl.Data('sqlite:///examples/iris.db::iris')
    print(repr(iris))

    print(repr(bl.by(iris.species, min=iris.petal_width.min(),
                     max=iris.petal_width.max())))

    result = bl.by(iris.species, min=iris.petal_width.min(),
                   max=iris.petal_width.max())

    print(odo(result, bl.DataFrame))
    print(odo(result, pd.DataFrame))

    ### odo has weird issue with unicode filenames, apparently...
    name = 'output.csv'
    print(odo(result, bl.CSV(name)))

    print(repr(iris.sepal_length.mean()))
    print(repr(bl.mean(iris.sepal_length)))

    print(repr(bl.by(iris.species, shortest=iris.petal_length.min(),
                     longest=iris.petal_length.max(),
                     average=iris.petal_length.mean())))

    print(repr(iris.head()))

    iris = bl.transform(iris, sepal_ratio=iris.sepal_length / iris.sepal_width,
                        petal_ratio=iris.petal_length / iris.petal_width)
    print(repr(iris.head()))

    versicolor = iris[iris.species.like('%versicolor')]
    print(repr(versicolor))

    print((len(versicolor), len(versicolor.fields)))

    print(repr(iris.relabel(petal_length='PETAL-LENGTH',
                            petal_width='PETAL-WIDTH')))

    pd_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Joe', 'Bob'],
                          'amount': [100, 200, 300, 400],
                          'id': [1, 2, 3, 4]})

    # put the `df` DataFrame odo a Blaze Data object
    bl_df = bl.DataFrame(pd_df)
    bl_dt = bl.Data(pd_df)

    print(repr(pd_df.amount * 2))
    print(repr(bl_df.amount * 2))
    print(repr(bl_dt.amount * 2))

    print(repr(pd_df[['id', 'amount']]))
    print(repr(bl_df[['id', 'amount']]))
    print(repr(bl_dt[['id', 'amount']]))

    print(repr(pd_df[pd_df.amount > 300]))
    print(repr(bl_df[bl_df.amount > 300]))
    print(repr(bl_dt[bl_dt.amount > 300]))

    print(repr(pd_df.groupby('name').amount.mean()))
    print(repr(pd_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl_df.groupby('name').amount.mean()))
    print(repr(bl_df.groupby(['name', 'id']).amount.mean()))

    print(repr(bl.by(bl_dt.name, amount=bl_dt.amount.mean())))
    print(repr(bl.by(bl.merge(bl_dt.name, bl_dt.id),
                     amount=bl_dt.amount.mean())))

    #pd.merge(pd_df, pd_df2, on='name')
    #bl.join(bl_dt, bl_dt2, 'name')

    print(repr(pd_df.amount.map(lambda x: x + 1)))
    print(repr(bl_df.amount.map(lambda x: x + 1)))
    print(repr(bl_dt.amount.map(lambda x: x + 1, 'int64')))

    print(repr(pd_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_df.rename(columns={'name': 'alias', 'amount': 'dollars'})))
    print(repr(bl_dt.relabel(name='alias', amount='dollars')))

    print(repr(pd_df.drop_duplicates()))
    print(repr(bl_df.drop_duplicates()))
    print(repr(bl_dt.distinct()))

    print(repr(pd_df.name.drop_duplicates()))
    print(repr(bl_df.name.drop_duplicates()))
    print(repr(bl_dt.name.distinct()))

    print(repr(pd_df.amount.mean()))
    print(repr(bl_df.amount.mean()))
    print(repr(bl_dt.amount.mean()))

    print(repr(pd_df))
    print(repr(bl_df))
    print(repr(bl_dt))

    print(repr(pd_df.amount.value_counts()), '\n')
    print(repr(bl_df.amount.value_counts()), '\n')
    print(repr(bl_dt.amount.count_values()), '\n')

    print(repr(pd_df.dtypes), '\n')
    print(repr(bl_df.dtypes), '\n')
    print(repr(bl_df.columns), '\n')
    print(repr(bl_dt.dshape), '\n')

    print(repr(pd_df.amount.dtypes), '\n')
    print(repr(bl_df.amount.dtypes), '\n')
    print(repr(bl_dt.amount.dshape), '\n')

    print(type(pd_df), type(bl_df), type(bl_dt), '\n')

    os.remove('output.csv')
    for fn_ in glob.glob('*.csv.gz'):
        os.remove(fn_)

    return
示例#12
0
from pipeline import *
from operations import *
from features import FEATURE_CLASSES


translations_and_scores = pump(scores='data/hy.basic.scores.csv')


q_features = multiply(translations_and_scores, 'q', *FEATURE_CLASSES)

r_features = multiply(translations_and_scores, 'r', *FEATURE_CLASSES)

feature_deltas = delta(q_features, 'q', r_features, 'r', *FEATURE_CLASSES)
# delta is not commutative, so consider also the reverse:
#   delta(r_features, q_features, *FEATURE_CLASSES)


#print translations_and_scores
#print q_features
#print r_features
#print feature_deltas

from blaze import merge, odo
everything = merge(translations_and_scores, feature_deltas)
odo(everything, 'data/features.csv')
示例#13
0
def crosstabs(data, columns=None, values=None,
              correction=False,
              pairs_top=10000,
              details=True):
    '''
    Identifies the strength of relationship between every pair of categorical
    columns in a DataFrame

    Parameters
    ----------
    data : Blaze data
        A data with at least 2 columns having categorical values.
    columns : list of column names in data
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name
        Optional column that contains weights to aggregate by summing up. By
        default, each row is counted as an observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction for
        continuity. The effect of the correction is to adjust each observed
        value by 0.5 towards the corresponding expected value. Defaults to False
        since Cramer's V (a more useful metric than chi-squared) must be computed
        without this correction.
    pairs_top: integer, Pick only top 10000 pairs by default
    details: boolean
        If True, will return observed and expected dataframes for pairs.
        Defaults to False.
    '''
    if columns is None:
        columns = types(data)['groups']

    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        agg_col = values if values in data.fields else column
        agg_func = bz.count(data[agg_col]) if agg_col == column else bz.sum(data[agg_col])
        data_grouped = bz.into(pd.DataFrame,
                               bz.by(bz.merge(data[index], data[column]),
                                     values=agg_func)
                               .sort('values')  # Generated SQL inefficient
                               .head(pairs_top))
        # BUG: bz.count: non-null count, gives 0 count for NULL groups
        # .nrows needs to fixed blaze/issues/1484
        # For now, we'll ignore NULL groups
        # Remove NULL groups
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            result = {(index, column): {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            if details:
                result = {
                        'index': index,
                        'column': column,
                        'observed': r['observed'].to_json(),
                        'expected': r['expected'].to_json(),
                        'stats': {param: r[param] for param in parameters}
                }
            else:
                result = {
                        'index': index,
                        'column': column,
                        'stats': {param: r[param] for param in parameters}
                }

        yield result
示例#14
0
def crosstabs(data, columns=None, values=None,
              correction=False,
              pairs_top=10000,
              details=True):
    '''
    Identifies the strength of relationship between every pair of categorical
    columns in a DataFrame

    Parameters
    ----------
    data : Blaze data
        A data with at least 2 columns having categorical values.
    columns : list of column names in data
        If not specified, uses ``autolyse.types(data)['groups']`` to identify
        all columns with categorical data.
    values : str, column name
        Optional column that contains weights to aggregate by summing up. By
        default, each row is counted as an observation.
    correction : boolean
        If True, and the degrees of freedom is 1, apply Yates' correction for
        continuity. The effect of the correction is to adjust each observed
        value by 0.5 towards the corresponding expected value. Defaults to False
        since Cramer's V (a more useful metric than chi-squared) must be computed
        without this correction.
    pairs_top: integer, Pick only top 10000 pairs by default
    details: boolean
        If True, will return observed and expected dataframes for pairs.
        Defaults to False.
    '''
    if columns is None:
        columns = types(data)['groups']

    parameters = ('p', 'chi2', 'dof', 'V')
    for index, column in itertools.combinations(columns, 2):
        agg_col = values if values in data.fields else column
        agg_func = bz.count(data[agg_col]) if agg_col == column else bz.sum(data[agg_col])
        data_grouped = bz.into(pd.DataFrame,
                               bz.by(bz.merge(data[index], data[column]),
                                     values=agg_func)
                               .sort('values')  # Generated SQL inefficient
                               .head(pairs_top))
        # BUG: bz.count: non-null count, gives 0 count for NULL groups
        # .nrows needs to fixed blaze/issues/1484
        # For now, we'll ignore NULL groups
        # Remove NULL groups
        data_grouped = data_grouped.dropna()
        if data_grouped.empty:
            result = {(index, column): {}}
        else:
            r = _crosstab(data_grouped[index],
                          column=data_grouped[column],
                          values=data_grouped['values'],
                          correction=correction)
            if details:
                result = {
                        'index': index,
                        'column': column,
                        'observed': r['observed'].to_json(),
                        'expected': r['expected'].to_json(),
                        'stats': {param: r[param] for param in parameters}
                }
            else:
                result = {
                        'index': index,
                        'column': column,
                        'stats': {param: r[param] for param in parameters}
                }

        yield result