Example #1
0
    def __init__(self, *args, **kwargs):
        """Validate and store the XPath expression used to select items.

        Raises ConfigurationError when 'item_xpath' is absent from the
        source definition or present but empty.
        """
        super(StaticXmlExtractor, self).__init__(*args, **kwargs)

        try:
            item_xpath = self.source_definition['item_xpath']
        except KeyError:
            raise ConfigurationError('Missing \'item_xpath\' definition')

        if not item_xpath:
            raise ConfigurationError('The \'item_xpath\' is empty')

        self.item_xpath = item_xpath
    def __init__(self, *args, **kwargs):
        """Validate and store the URL of the static file to fetch.

        Raises ConfigurationError when 'file_url' is absent from the
        source definition or present but empty.
        """
        super(StaticFileBaseExtractor, self).__init__(*args, **kwargs)

        try:
            file_url = self.source_definition['file_url']
        except KeyError:
            raise ConfigurationError('Missing \'file_url\' definition')

        if not file_url:
            raise ConfigurationError('The \'file_url\' is empty')

        self.file_url = file_url
Example #3
0
    def __init__(self, *args, **kwargs):
        """Validate the pagination XPath and read the page limit.

        Raises ConfigurationError when 'next_page_xpath' is absent from
        the source definition or present but empty.
        """
        super(PagingHTMLExtractor, self).__init__(*args, **kwargs)

        try:
            next_page_xpath = self.source_definition['next_page_xpath']
        except KeyError:
            raise ConfigurationError('Missing \'next_page_xpath\' definition')

        if not next_page_xpath:
            raise ConfigurationError('The \'next_page_xpath\' is empty')

        self.next_page_xpath = next_page_xpath

        # Stop following "next page" links after this many pages
        # (defaults to 5 when not configured).
        self.next_page_max_count = self.source_definition.get(
            'next_page_max_count', 5)
    def start(self, *args, **kwargs):
        """Record the target index name, then delegate to the base loader.

        Raises ConfigurationError when 'new_index_name' is missing or
        empty in the keyword arguments.
        """
        self.index_name = kwargs.get('new_index_name')

        if self.index_name:
            return super(ElasticsearchLoader, self).start(*args, **kwargs)

        raise ConfigurationError('The name of the index is not provided')
Example #5
0
    def __init__(self, *args, **kwargs):
        """Validate and store the local 'path' and filename 'pattern'.

        Raises ConfigurationError when either option is absent from the
        source definition or present but empty.
        """
        super(LocalPathBaseExtractor, self).__init__(*args, **kwargs)

        # Validate both options before assigning anything, so no
        # attribute is set when the configuration is incomplete.
        for option in ('path', 'pattern'):
            if option not in self.source_definition:
                raise ConfigurationError('Missing \'%s\' definition' % (option,))
            if not self.source_definition[option]:
                raise ConfigurationError('The \'%s\' is empty' % (option,))

        self.path = self.source_definition['path']
        self.pattern = self.source_definition['pattern']
Example #6
0
    def __init__(self, *args, **kwargs):
        """Read the four required 'greenvalley_*' options into attributes.

        Each of base_url, username, key and hash is taken from the
        source definition under the 'greenvalley_' prefix. Raises
        ConfigurationError when any option is missing or empty.
        """
        super(GreenValleyBaseExtractor, self).__init__(*args, **kwargs)

        # Pre-declare the attributes so they exist even if validation
        # below raises part-way through.
        self.base_url = self.username = self.key = self.hash = None

        for name in ('base_url', 'username', 'key', 'hash'):
            source_key = 'greenvalley_%s' % (name,)
            try:
                value = self.source_definition[source_key]
            except KeyError:
                raise ConfigurationError('Missing \'%s\' definition' %
                                         (source_key,))
            if not value:
                raise ConfigurationError('The \'%s\' is empty' % (source_key,))
            setattr(self, name, value)
Example #7
0
    def run(self, *args, **kwargs):
        """Record index bookkeeping names, then delegate to the base loader.

        Stores the current/new index names and the alias from the task
        keyword arguments. Raises ConfigurationError when
        'new_index_name' is missing or empty.
        """
        self.current_index_name = kwargs.get('current_index_name')
        self.index_name = kwargs.get('new_index_name')
        self.alias = kwargs.get('index_alias')

        if self.index_name:
            return super(ElasticsearchLoader, self).run(*args, **kwargs)

        raise ConfigurationError('The name of the index is not provided')
Example #8
0
def get_current_index(index_alias):
    """Return the name of the index currently behind ``index_alias``.

    Raises ConfigurationError when no index exists for the alias.
    """
    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # next(iter(...)) instead of .keys()[0]: identical on Python 2 and
    # also works on Python 3, where dict views are not subscriptable.
    return next(iter(current_index_aliases))
Example #9
0
    def __init__(self, *args, **kwargs):
        """Validate the 'facebook' config block and store its fields.

        Each of api_version, app_id, app_secret and graph_url is stored
        on the instance under an 'fb_' prefix. Raises ConfigurationError
        when the 'facebook' block or any field is missing or empty.
        """
        super(FacebookExtractor, self).__init__(*args, **kwargs)

        if 'facebook' not in self.source_definition:
            raise ConfigurationError('Missing \'facebook\' definition')

        facebook_config = self.source_definition['facebook']
        for option in ('api_version', 'app_id', 'app_secret', 'graph_url'):
            if option not in facebook_config:
                raise ConfigurationError(
                    'Missing \'%s\' definition of facebook' % (option,))
            if not facebook_config[option]:
                raise ConfigurationError(
                    'The \'%s\' in facebook is empty' % (option,))
            setattr(self, 'fb_%s' % (option,), facebook_config[option])
Example #10
0
    def __init__(self, *args, **kwargs):
        """Validate and store the item and item-id XPath expressions.

        Raises ConfigurationError when 'item_xpath' or 'item_id_xpath'
        is absent from the source definition or present but empty. The
        optional 'default_namespace' is stored as-is (None when absent).
        """
        super(StaticHtmlExtractor, self).__init__(*args, **kwargs)

        # Validate and assign each required XPath in turn, keeping the
        # same missing/empty error messages as the other extractors.
        for option in ('item_xpath', 'item_id_xpath'):
            if option not in self.source_definition:
                raise ConfigurationError('Missing \'%s\' definition' % (option,))
            if not self.source_definition[option]:
                raise ConfigurationError('The \'%s\' is empty' % (option,))
            setattr(self, option, self.source_definition[option])

        # Optional namespace used when evaluating the XPaths.
        self.default_namespace = self.source_definition.get('default_namespace')
Example #11
0
    def run(self, *args, **kwargs):
        """Record index bookkeeping state, then delegate to the base loader.

        Stores the current/new index names, the alias, the combined
        index name and the document type from the task keyword
        arguments. Raises ConfigurationError when 'new_index_name' is
        missing or empty.
        """
        self.current_index_name = kwargs.get('current_index_name')
        self.index_name = kwargs.get('new_index_name')
        self.alias = kwargs.get('index_alias')
        self.new_index_names = kwargs.get('new_index_names',
                                          [settings.COMBINED_INDEX])

        # Prefer an index named '<COMBINED_INDEX>_...' from the supplied
        # names; fall back to the bare combined index when none matches.
        try:
            self.combined_index_name = [
                i for i in self.new_index_names
                if i.startswith('%s_' % (settings.COMBINED_INDEX, ))
            ][0]
        except IndexError:  # no 'as e': the exception object was unused
            self.combined_index_name = settings.COMBINED_INDEX

        self.doc_type = kwargs['source_definition'].get('doc_type', 'item')

        if not self.index_name:
            raise ConfigurationError('The name of the index is not provided')

        return super(ElasticsearchLoader, self).run(*args, **kwargs)
Example #12
0
    def __init__(self, *args, **kwargs):
        """Require a non-empty 'dump_path' in the source definition.

        Raises ConfigurationError when 'dump_path' is missing or empty.
        """
        super(StaticJSONDumpExtractor, self).__init__(*args, **kwargs)

        dump_path = self.source_definition.get('dump_path')
        if not dump_path:
            raise ConfigurationError('Missing \'dump_path\' definition')
Example #13
0
def setup_pipeline(source_definition):
    """Build and launch the extract/transform/enrich/load chains for a source.

    Ensures the Elasticsearch alias and a timestamped index behind it
    exist, records run state under a fresh run identifier in the Celery
    result backend, and then starts one Celery chain per item yielded by
    each configured pipeline's extractor. On failure the run identifier
    status is set to 'error' and the exception is re-raised so Celery
    can retry.
    """
    logger.debug('[%s] Starting pipeline for source: %s' % (
        source_definition['key'], source_definition.get('id')))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=source_definition.get('es_prefix', settings.DEFAULT_INDEX_PREFIX),
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # next(iter(...)) instead of .keys()[0]: identical on Python 2 and
    # also works on Python 3, where dict views are not subscriptable.
    current_index_name = next(iter(current_index_aliases))

    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
    }

    logger.debug('[%s] Starting run with identifier %s' % (
        source_definition['key'], params['run_identifier']))

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # we can have multiple pipelines. but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")

        # adjusted source definitions per pipeline. This way you can for
        # example change the index on a pipeline basis
        pipeline_definitions[pipeline['id']] = deepcopy(source_definition)
        pipeline_definitions[pipeline['id']].update(pipeline)

        # initialize the ETL classes, per pipeline
        pipeline_extractors[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['extractor'])

        pipeline_transformers[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['transformer'])

        pipeline_enrichers[pipeline['id']] = [
            (load_object(enricher) or {}) for enricher in
            pipeline_definitions[pipeline['id']].get('enrichers', [])]

        # 'loaders' (plural) takes precedence over the single 'loader'
        # key; falsy entries are skipped so a missing loader is ignored.
        pipeline_loaders[pipeline['id']] = list()
        for cls in pipeline_definitions[pipeline['id']].get('loaders', None) or \
                [pipeline_definitions[pipeline['id']].get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline['id']].append(load_object(cls))

    result = None
    for pipeline in pipelines:
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline['id']](
                    source_definition=pipeline_definitions[pipeline['id']]).run():
                step_chain = list()

                params['chain_id'] = uuid4().hex
                params['start_time'] = datetime.now()

                # Record the chain id so completion of the whole run can
                # be detected later.
                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Transformers
                if pipeline_transformers.get(pipeline['id']):
                    step_chain.append(pipeline_transformers[pipeline['id']].s(
                        *item,
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params)
                    )

                # Enrichers
                for enricher_task in pipeline_enrichers[pipeline['id']]:
                    step_chain.append(enricher_task.s(
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params))

                # Loaders
                # Multiple loaders to enable to save to different stores
                initialized_loaders = []
                for loader in pipeline_loaders[pipeline['id']]:
                    initialized_loaders.append(loader.s(
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params))
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            # raise SystemExit instead of exit(): the latter is provided
            # by the site module and is absent under 'python -S'.
            raise SystemExit
        except Exception as e:  # was Python-2-only 'except Exception, e:'
            logger.error('[{site_name}] Pipeline has failed. Setting status of '
                         'run identifier "{run_identifier}" to "error":\n{message}'
                         .format(run_identifier=params['run_identifier'],
                                 message=e,
                                 site_name=source_definition['key'],
                                 )
                         )

            celery_app.backend.set(params['run_identifier'], 'error')

            # Reraise the exception so celery can autoretry
            raise
Example #14
0
def setup_pipeline(source_definition):
    """Build and launch the extract/transform/enrich/load chain for a source.

    Ensures the Elasticsearch alias and a timestamped index behind it
    exist, then starts one Celery chain per item yielded by the
    extractor. The run identifier in the Celery backend is set to
    'running' at start, 'error' on failure (exception re-raised), and
    'done' when every item has been dispatched.
    """
    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # next(iter(...)) instead of .keys()[0]: identical on Python 2 and
    # also works on Python 3, where dict views are not subscriptable.
    current_index_name = next(iter(current_index_aliases))
    new_index_name = '{index_alias}_{now}'.format(
        index_alias=index_alias, now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
    )

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    enrichers = [(load_object(enricher[0])(), enricher[1]) for enricher in
                 source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(set_name=run_identifier_chains,
                                                value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item,
                source_definition=source_definition,
                **params
            )

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params
                )

            # Load
            item_chain |= loader.s(
                source_definition=source_definition,
                **params
            )

            item_chain.delay()
    except Exception:  # was a bare 'except:'; let SystemExit/KeyboardInterrupt propagate
        # NOTE(review): the message mentions deleting the index, but no
        # deletion happens here -- confirm whether cleanup was intended.
        logger.error('An exception has occured in the "{extractor}" extractor. '
                     'Deleting index "{index}" and setting status of run '
                     'identifier "{run_identifier}" to "error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))

        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')