Example #1
def find(cube, query, fields=None, date=None, most_recent=True):
    '''
    Run a PQL query against the cube's collection. When a date is given,
    the query is rewritten to target the timeline collection and answered
    with an aggregation pipeline; otherwise a plain find() is issued.
    '''
    logger.debug('Running Find')
    if date is not None:
        # we will be doing a timeline query, so rename the fields to their
        # nested 'fields.<name>' form; word boundaries keep one field name
        # from matching inside another
        all_fields = get_fields(cube, '__all__')
        for f in all_fields:
            query = re.sub(r'\b%s\b' % re.escape(f), 'fields.%s' % f, query)
        # add the date constraint
        query = query + ' and ' + _get_date_pql_string(date)
    pql_parser = pql.SchemaFreeParser()
    try:
        # FIXME: make it a schema aware parser
        spec = pql_parser.parse(query)
    except Exception as e:
        raise ValueError("Invalid Query (%s)" % str(e))

    c = get_cube(cube)
    _cube = c.get_collection(timeline=(date is not None))

    logger.debug('Query: %s' % spec)

    fields = get_fields(cube, fields)

    if date is not None:
        project_d = dict([(f, '$fields.%s' % f) for f in fields])
        project_d.update(dict(_id='$id', _start='$start', _end='$end'))
        if most_recent:
            docs = _cube.aggregate([{'$match': spec},
                                    {'$sort': {'start': -1}},
                                    {'$group': {'_id': '$id',
                                                'fields': {'$first':
                                                           '$fields'},
                                                'start': {'$first': '$start'},
                                                'end':  {'$first': '$end'},
                                                'id': {'$first': '$id'}}},
                                    {'$project': project_d}])
        else:
            docs = _cube.aggregate([{'$match': spec},
                                    {'$project': project_d}])
        docs = docs['result']
    else:
        docs = _cube.find(spec, fields)
        docs.batch_size(10000000)  # hard limit is 16M...
    docs = [d for d in docs]
    return docs
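
A minimal usage sketch of find(), assuming the module-level helpers it relies on (get_cube, get_fields, _get_date_pql_string, logger, re, pql) are already set up. The cube name, field names, and PQL expressions below are hypothetical placeholders, and the date value's format is an assumption about what _get_date_pql_string accepts:

# plain warehouse query (hypothetical 'bug' cube and fields)
open_bugs = find('bug', query='status == "OPEN"', fields='status, priority')

# timeline query: same PQL, but answered from the timeline collection and
# collapsed to the most recent version of each document
snapshot = find('bug', query='status == "OPEN"',
                fields='status, priority',
                date='2013-06-01',  # assumed format; depends on _get_date_pql_string
                most_recent=True)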
Example #2
def index_warehouse(cube, fields, force=False):
    '''
    NOTE: _id key index is generated automatically by mongo
    '''
    c = get_cube(cube)
    _cube = c.get_collection(admin=True)

    fields = get_fields(cube, fields)
    result = {}
    for field in fields:
        name = '%s-tokens' % field
        if force or c.get_field_property('index', field):
            logger.info(' %s... Indexing Warehouse (%s)%s' %
                        (YELLOW, field, ENDC))
            key = [(field, -1)]
            result[field] = _cube.ensure_index(key, name=name)
        else:
            result[field] = -1
    return result
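
A short usage sketch, again with a hypothetical 'bug' cube. Only fields whose 'index' property is set get indexed unless force=True; ensure_index() returns the index name, and skipped fields map to -1 in the result dict:

indexed = index_warehouse('bug', '__all__')           # respect per-field 'index' property
forced = index_warehouse('bug', 'status, priority',   # hypothetical fields
                         force=True)                  # index them regardless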
Example #3
def extract(cube, **kwargs):
    '''
    Run the cube's extract_func, once per requested field when fields are
    given, otherwise a single time for the whole cube.
    '''
    logger.info(' Starting Update operation!')
    logger.info(' %sCube: %s%s' % (YELLOW, cube, ENDC))
    c = get_cube(cube)

    logger.debug('%sExtract - Start%s' % (YELLOW, ENDC))

    _fields = kwargs.get('fields')
    fields = get_fields(cube, _fields)

    if fields:
        result = {}
        for field in fields:
            kwargs['field'] = field
            logger.debug('%sField: %s%s' % (YELLOW, field, ENDC))
            result[field] = c.extract_func(**kwargs)
            logger.info('Extract - Complete: (%s.%s): %s' %
                        (cube, field, result[field]))
    else:
        result = c.extract_func(**kwargs)
        logger.info('Extract - Complete: (%s): %s' % (cube, result))

    return result
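
A usage sketch with a hypothetical cube name; any extra keyword arguments (and, per field, the 'field' key) are passed straight through to the cube's extract_func:

# extract the cube's default fields in one extract_func call
result = extract('bug')

# extract two (hypothetical) fields, one extract_func call per field
result = extract('bug', fields='status, priority')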
Example #4
def fetch(cube, fields, skip=0, limit=0, ids=None):
    '''
    Return documents from the cube's collection, optionally restricted to a
    list of ids and paged with skip/limit; results are sorted by _id.
    '''
    logger.debug('Running Fetch (skip:%s, limit:%s, ids:%s)' % (
        skip, limit, len(ids) if ids else 0))
    logger.debug('... Fields: %s' % fields)

    c = get_cube(cube)
    _cube = c.get_collection()

    fields = get_fields(cube, fields)
    logger.debug('Return Fields: %s' % fields)

    sort = [('_id', 1)]

    if ids:
        spec = {'_id': {'$in': parse_ids(ids)}}
    else:
        spec = {}

    docs = _cube.find(spec, fields,
                      skip=skip, limit=limit,
                      sort=sort)
    docs.batch_size(10000000)  # hard limit is 16M...
    return [d for d in docs]
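
A usage sketch with hypothetical cube, field, and id values; whatever string format parse_ids() accepts is what the ids argument takes:

# page through the warehouse 100 documents at a time, sorted by _id
page1 = fetch('bug', fields='status, priority', limit=100)
page2 = fetch('bug', fields='status, priority', skip=100, limit=100)

# fetch specific documents by id (id format assumed; parse_ids does the parsing)
some = fetch('bug', fields='__all__', ids='100, 101, 102')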