Example #1
def _extract_func(cube, field, **kwargs):
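    '''
    Generic field extractor: iterate items from the cube's reader, resolve
    an id and a raw value for each item, typecast the raw value (optionally
    running a convert hook), and persist each doc with save_doc. Returns
    {'saved': count}, plus 'failed_ids' when any saves fail.
    '''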
    c = get_cube(cube)
    # id_x if None will become ObjectID()
    id_x = c.get_field_property('id_x', field)
    # raw_x if None will become field
    raw_x = c.get_field_property('raw_x', field, field)
    # convert if None will skip convert step
    convert = c.get_field_property('convert', field)
    # _type will be default if not set
    _type = c.get_field_property('type', field)

    saved = 0
    failed = []
    for item in c._reader:
        if not item:
            continue

        try:
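            # id_x (and raw_x below) may be a callable extractor or a plain
            # key; call it first and fall back to item lookup on TypeError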
            id_ = id_x(item)
        except TypeError:
            id_ = item[id_x]

        try:
            raw = raw_x(item)
        except TypeError:
            raw = item[raw_x]

        tokens = type_cast(raw, _type)
        if convert:
            tokens = convert(tokens)

        ok = save_doc(c.name, field, tokens, id_)
        saved += ok
        if not ok:
            failed.append(id_)

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    return result
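A minimal invocation sketch; the cube and field names below are placeholders, and get_cube, type_cast and save_doc are assumed to come from the surrounding module:

result = _extract_func('some_cube', 'some_field')
print('saved %s docs, %s failures' % (result['saved'], len(result.get('failed_ids', []))))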
Example #2
def _extract_func(cube, **kwargs):
    '''
    SQL import method
    '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)

    if id_delta:
        if force:
            raise RuntimeError("force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (ie, LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
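    # when set, _sql is a sequence: [0] extra SELECT expression,
    # [1] extra FROM tables, [2] JOIN clauses, [3] OR'd WHERE clauses,
    # [4] an optional GROUP BY expression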
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (
            table_column, table, field, db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]

        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '

        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '

        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))

        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None

    # force full update
    if force:
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)

    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is typecasted as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass

                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime('%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        _sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}

    while not _stop:
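        # page through the result set row_limit rows at a time; a short
        # (or empty) page means we have reached the end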
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                # use _field so we don't shadow the outer 'field' argument
                for _field in grouped[_id].iterkeys():
                    tokens = grouped[_id][_field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]

                    try:
                        ok = save_doc(c.name, _field, tokens, _id)
                    except Exception as e:
                        logger.error(
                            'Error saving (%s) %s: %s' % (tokens, _id, e))
                        ok = 0
                    saved += ok
                    if not ok:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' % (
                k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')

        if k < row_limit:
            _stop = True
        else:
            start += k
            if k != row_limit:  # theoretically, k == row_limit
                logger.warn(
                    "rows count seems incorrect! row_limit: %s, row returned: %s" % (
                        row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    else:
        if touch:
            # update the mtimestamp for when this field was last touched
            # to the moment we started updating
            c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result
Example #3
def _extract_func(cube, **kwargs):
    '''
    SQL import method
    '''
    c = get_cube(cube)
    field = kwargs.get('field')
    if not field:
        raise ValueError("Field argument required")
    force = int(kwargs.get('force', 0))
    id_delta = kwargs.get('id_delta', None)

    if id_delta:
        if force:
            raise RuntimeError(
                "force and id_delta can't be used simultaneously")
        else:
            touch = False
    else:
        touch = True

    db = c.get_field_property('db', field)
    table = c.get_field_property('table', field)
    db_table = '%s.%s' % (db, table)
    column = c.get_field_property('column', field)
    table_column = '%s.%s' % (table, column)

    # max number of rows to return per call (ie, LIMIT)
    row_limit = c.get_field_property('row_limit', field, c.row_limit)
    try:
        row_limit = int(row_limit)
    except (TypeError, ValueError):
        raise ValueError("row_limit must be a number")

    sql_where = []
    sql_groupby = ''
    _sql = c.get_field_property('sql', field)
    if not _sql:
        sql = 'SELECT %s, %s.%s FROM %s' % (table_column, table, field,
                                            db_table)
    else:
        sql = 'SELECT %s, %s FROM ' % (table_column, _sql[0])
        _from = [db_table]

        # FIXME: THIS IS UGLY! use a dict... or sqlalchemy
        if _sql[1]:
            _from.extend(_sql[1])
        sql += ', '.join(_from)
        sql += ' '

        if _sql[2]:
            sql += ' '.join(_sql[2])
        sql += ' '

        if _sql[3]:
            sql_where.append('(%s)' % ' OR '.join(_sql[3]))

        try:
            if _sql[4]:
                sql_groupby = _sql[4]
        except IndexError:
            pass

    delta_filter = []
    delta_filter_sql = None

    # force full update
    if force:
        _delta = False
    else:
        _delta = c.get_field_property('delta', field, True)

    if _delta:
        # delta is enabled
        # the following deltas are mutually exclusive
        if id_delta:
            delta_sql = "(%s IN (%s))" % (table_column, id_delta)
            delta_filter.append(delta_sql)
        elif c.get_field_property('delta_new_ids', field):
            # if delta_new_ids is on but there is no 'last_id',
            # then we need to do a FULL run...
            last_id = get_last_id(c.name, field)
            if last_id:
                # FIXME: any reason to ensure we know what the _id is typecasted as?
                try:
                    last_id = int(last_id)
                except (TypeError, ValueError):
                    pass

                if type(last_id) in [INT_TYPE, FLOAT_TYPE]:
                    last_id_sql = "%s > %s" % (table_column, last_id)
                else:
                    last_id_sql = "%s > '%s'" % (table_column, last_id)
                delta_filter.append(last_id_sql)

            mtime_columns = c.get_field_property('delta_mtime', field)
            if mtime_columns:
                if isinstance(mtime_columns, basestring):
                    mtime_columns = [mtime_columns]
                last_update_dt = last_known_warehouse_mtime(c.name, field)
                if last_update_dt:
                    last_update_dt = last_update_dt.strftime(
                        '%Y-%m-%d %H:%M:%S %z')
                    dt_format = "yyyy-MM-dd HH:mm:ss z"
                    for _column in mtime_columns:
                        _sql = "%s > parseTimestamp('%s', '%s')" % (
                            _column, last_update_dt, dt_format)
                        delta_filter.append(_sql)

    if delta_filter:
        delta_filter_sql = ' OR '.join(delta_filter)
        sql_where.append('(%s)' % delta_filter_sql)

    if sql_where:
        sql += ' WHERE %s ' % ' AND '.join(sql_where)

    if sql_groupby:
        sql += ' GROUP BY %s ' % sql_groupby

    if c.get_field_property('sort', field, True):
        sql += " ORDER BY %s ASC" % table_column

    # whether to query for distinct rows only or not; default, no
    if c.get_field_property('distinct', field, False):
        sql = re.sub('^SELECT', 'SELECT DISTINCT', sql)

    start = 0
    saved = 0
    _stop = False
    rows = []
    failed = []

    # FIXME: prefetch the next set of rows while importing to mongo
    logger.debug('... ... Starting SQL fetchall routine!')

    container = c.get_field_property('container', field)

    if touch:
        now = datetime.now(UTC)
        spec_mtime = {'cube': cube}
        update_mtime = {'$set': {field: {'mtime': now}}}
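        # this spec/update pair is upserted into c_etl_activity at the end,
        # but only when the run finishes with no failed ids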

    while not _stop:
        rows = c._sql_fetchall(sql, start, field, row_limit)
        k = len(rows)
        if k > 0:
            logger.debug('... ... Starting Processor')
            grouped = c.grouper(rows)
            logger.debug('... ... Saving docs now!')
            t0 = time.time()
            _id_k = 0
            for _id in grouped.iterkeys():
                _id_k += 1
                # use _field so we don't shadow the outer 'field' argument
                for _field in grouped[_id].iterkeys():
                    tokens = grouped[_id][_field]
                    if not tokens:
                        tokens = None
                    elif container and type(tokens) is not list:
                        tokens = [tokens]
                    elif not container and type(tokens) is list:
                        if len(tokens) > 1:
                            raise TypeError(
                                "Tokens contains too many values (%s); "
                                "(set container=True?)" % (tokens))
                        else:
                            tokens = tokens[0]

                    try:
                        ok = save_doc(c.name, _field, tokens, _id)
                    except Exception as e:
                        logger.error('Error saving (%s) %s: %s' %
                                     (tokens, _id, e))
                        ok = 0
                    saved += ok
                    if not ok:
                        failed.append(_id)
            t1 = time.time()
            logger.info('... ... Saved %i docs (%i/sec)' % (k, k / (t1 - t0)))
        else:
            logger.debug('... ... No rows; nothing to process')

        if k < row_limit:
            _stop = True
        else:
            start += k
            if k != row_limit:  # theoretically, k == row_limit
                logger.warn(
                    "rows count seems incorrect! row_limit: %s, row returned: %s"
                    % (row_limit, k))

    result = {'saved': saved}
    if failed:
        result.update({'failed_ids': failed})
    else:
        if touch:
            # update the mtimestamp for when this field was last touched
            # to the moment we started updating
            c.c_etl_activity.update(spec_mtime, update_mtime, upsert=True)
    return result