Example #1
0
    def run(self, query=None, force=False, authenticate=False):
        '''Import documents from a MongoDB collection into the document store.

        Parameters
        ----------
        query : dict (default=None)
            MongoDB query passed to collection.find(); None selects all documents.
        force : bool (default=False)
            Passed through to _save_document as `forced` (overwrite existing docs).
        authenticate : bool (default=False)
            Whether to authenticate against MongoDB with the credentials from
            the configuration file.
        '''

        self.version = '.1'
        self.functiontype = 'importer'
        self.date = datetime.datetime(year=2017, month=4, day=4)

        # Avoid a mutable default argument; {} matches the old default behavior.
        if query is None:
            query = {}

        # Connection settings all come from the project configuration file.
        databasename = config.get('mongodb', 'databasename')
        collectionname = config.get('mongodb', 'collectionname')
        username = config.get('mongodb', 'username')
        password = config.get('mongodb', 'password')
        client = pymongo.MongoClient(config.get('mongodb', 'url'))
        db = client[databasename]
        if authenticate:
            db.authenticate(username, password)
        collection = db[collectionname]

        # Maps destination field -> source (MongoDB) field name.
        # NOTE: the original literal listed 'url' twice with the same value;
        # the duplicate key is removed (a dict literal keeps only the last one anyway).
        mapping = {
            'doctype': 'source',
            'url': 'url',
            '_id': 'rssidentifier',
            'publication_date': 'datum',
            'text': 'text',
            'teaser': 'teaser',
            'title': 'title',
            'byline': 'byline',
            'bylinesource': 'bylinesource',
            'category': 'section',
        }

        input_iterable = collection.find(query)

        for num, inputdoc in enumerate(input_iterable):
            document = {}
            for k, v in mapping.items():
                try:
                    document[k] = inputdoc[v]
                # Was a bare `except:`; only a missing source key is expected here.
                except KeyError:
                    logger.debug('key {} not found'.format(k))
            logger.info('processing {num}'.format(**locals()))
            self.doctype = document.get('doctype')
            document = self._add_metadata(document)
            self._verify(document)
            try:
                self._save_document(document, forced=force)
                logger.debug("Stored document {} in ES".format(num))
            except Exception as e:
                logger.warning(
                    "ACK, unable to import document number {num}: {e}".format(
                        **locals()))
Example #2
0
def _doctype_query_or_list(doctype_query_or_list,
                           force=False,
                           field=None,
                           task=None):
    '''
    This function helps other functions dynamically interpret the argument for document selection.
    It allows for either a list of documents, an elasticsearch query, a string-query or a doctype
    string to be provided and returns an iterable containing these documents.

    Parameters
    ----------
    doctype_query_or_list: list, string or dict
        specification of input document, either:
            a list, each element should be an elasticsearch document
            a dict, should be an elasticsearch query
            a string, which is either an exact match with doctype (checked against doctype mappings) or
                alternatively, a query_string for the elasticsearch database
    force: bool (default=False)
        whether existing fields should be re-computed. Used to subset to documents where field is missing.
    field: string (default=None)
        Field on which operations are done, used to check when force=False
    task: string (default=None)
        Function for which the documents are used. Argument is used only to generate the expected outcome
        fieldname, i.e. <field>_<function>

    Returns
    -------
    Iterable
    '''
    # isinstance() instead of `type(...) ==`: idiomatic and subclass-friendly.
    if isinstance(doctype_query_or_list, list):
        documents = doctype_query_or_list
    elif isinstance(doctype_query_or_list, str):
        # Exact doctype match: check the string against the known ES mappings.
        if doctype_query_or_list in core.database.client.indices.get_mapping()[
                config.get('elasticsearch',
                           'document_index')]['mappings'].keys():
            logger.info("assuming documents of given type should be processed")
            if force or not field:
                documents = core.database.scroll_query({
                    'filter': {
                        'match': {
                            'doctype': "%s" % doctype_query_or_list
                        }
                    }
                })
            elif not force and field:
                logger.info(
                    "force=False, ignoring documents where the result key exists (and has non-NULL value)"
                )
                # Only documents of the doctype that are still missing <field>_<task>.
                documents = core.database.scroll_query({
                    'filter': {
                        'and': [{
                            'match': {
                                'doctype': doctype_query_or_list
                            }
                        }, {
                            'missing': {
                                'field': '%s_%s' % (field, task)
                            }
                        }]
                    }
                })
        else:
            logger.info("assuming input is a query_string")
            if force or not field:
                documents = core.database.scroll_query({
                    'filter': {
                        'query_string': {
                            'query': doctype_query_or_list
                        }
                    }
                })
            elif not force and field:
                logger.info(
                    "force=False, ignoring documents where the result key exists (and has non-NULL value)"
                )
                documents = core.database.scroll_query({
                    'filter': {
                        'and': [{
                            'missing': {
                                'field': '%s_%s' % (field, task)
                            }
                        }, {
                            'query_string': {
                                'query': doctype_query_or_list
                            }
                        }]
                    }
                })
    else:
        # Dict (raw elasticsearch query) case.
        if not force and field and task and not doctype_query_or_list:
            field = '%s_%s' % (field, task)
            doctype_query_or_list.update(
                {'filter': {
                    'missing': {
                        'field': field
                    }
                }})
        # NOTE(review): this branch uses core.search_utils.scroll_query while the
        # branches above use core.database.scroll_query -- confirm both resolve
        # to the same implementation.
        documents = core.search_utils.scroll_query(doctype_query_or_list)
    return documents
Example #3
0
import argparse
import configparser
import datetime
import logging  # was missing: logging.basicConfig below requires it

from celery import Celery, group, chain, chord
from flask import Flask

import analysis  # helps celery recognize analysis tasks
import clients  # helps celery recognize client tasks
import core
import core.search_utils
import core.taskmanager
import processing  # helps celery recognize the processing tasks
import scrapers  # helps celery recognize the scraping tasks

from core.database import config

logging.basicConfig(level=config.get("inca", "loglevel"))

# The config file stores booleans as strings, hence the string comparison.
LOCAL_ONLY = config.get('inca', 'local_only') == "True"

api = Flask(__name__)

# Celery application; backend/broker are resolved from the configured
# dependency set (e.g. "<deps>.backend" / "<deps>.broker").
taskmaster = Celery(
    backend=config.get('celery',
                       '%s.backend' % config.get('inca', 'dependencies')),
    broker=config.get('celery',
                      '%s.broker' % config.get('inca', 'dependencies')),
)

# NOTE(review): core.celerybeat_schedule is used here but not explicitly
# imported above -- presumably `import core` makes it available; confirm.
taskmaster.conf.update(
    CELERYBEAT_SCHEDULE=core.celerybeat_schedule.get_scheduler())

# Task namespaces exposed through the API.
expose = ["scrapers", "processing", "analysis", "clients", "inca"]
Example #4
0
# -*- coding: utf-8 -*-
from core.processor_class import Processer
from core.database import config
# from core.basic_utils import dotkeys
import logging
import requests
from PIL import Image
import imagehash
import os
import sys

# True when running on CPython 3.2+ (checks major == 3 AND minor >= 2,
# so a hypothetical Python 4 would NOT satisfy it despite the name).
IS_PYTHON3 = sys.version_info[0] == 3 and sys.version_info[1] >= 2

logger = logging.getLogger(__name__)

# Root directory under which downloaded images are stored on disk.
IMAGEPATH = config.get('imagestore', 'imagepath')


def hash2filepath(myhash, base_path=None):
    '''
    Return a (directory, filename) tuple describing where an image is stored.

    The stringified hash is split into 4-character segments that form a nested
    directory tree below *base_path*, which keeps the number of files per
    directory manageable. The filename is the full hash plus a '.jpg' suffix.

    Parameters
    ----------
    myhash : object
        Image hash; converted with str() before use.
    base_path : str (default=None)
        Root directory for the tree; falls back to the module-level IMAGEPATH
        (keeps the original call signature backward-compatible).

    Returns
    -------
    tuple of (path, filename)
    '''
    if base_path is None:
        base_path = IMAGEPATH
    hashstr = str(myhash)
    path = os.path.join(base_path, hashstr[:4], hashstr[4:8], hashstr[8:12],
                        hashstr[12:])
    filename = hashstr + '.jpg'
    return path, filename


class download_images(Processer):
Example #5
0
File: inca.py Project: uless/inca
class Inca():
    """INCA main class for easy access to functionality

    methods
    ----
    Scrapers
        Retrieval methods for RSS websites. Most scrapers can run
        out-of-the-box without specifying any parameters. If no database is
        present, scrapers will return the data as a list.

        usage:
            docs = inca.scrapers.<scraper>()

    Rssscrapers
        Same as Scrapers, but based on the websites' RSS feeds.

    Clients
        API-clients to get data from various endpoints. You can start using client
        functionality by:
        1) Adding an application, using the `<service>_create_app` method
        2) Add credentials to that application, using `<service>_create_credentials`
        3) Then run a collection method, such as `twitter_timeline`!

        usage:
            inca.clients.<service>_create_app(name='default')
            inca.clients.<service>_create_credentials(app='default')
            docs = inca.clients.<service>_<functionname>(app='default', *args, **kwargs)

    Processing
        These methods change documents by adding fields. Such manipulations can
        be things such as POS-tags, Sentiment or something else.

        usage:
            modified_docs = inca.processing.<processor>(docs=<original_docs or query>, field=<field to manipulate>, *args, **kwargs)


    """

    # Celery application through which all registered tasks are discovered;
    # backend/broker are resolved from the configured dependency set.
    _taskmaster = Celery(
        backend=config.get('celery',
                           '%s.backend' % config.get('inca', 'dependencies')),
        broker=config.get('celery',
                          '%s.broker' % config.get('inca', 'dependencies')),
    )

    # Convenience handle to the database/search utilities.
    database = core.search_utils

    _prompt = "Placeholder"

    def __init__(self,
                 prompt="TLI",
                 distributed=False,
                 verbose=True,
                 debug=False):
        # NOTE(review): `distributed` is stored directly as _LOCAL_ONLY, which
        # looks inverted (distributed=True would set _LOCAL_ONLY=True) --
        # confirm the intended semantics before changing.
        self._LOCAL_ONLY = distributed
        self._prompt = getattr(make_interface, prompt).prompt

        # Attach every known task family as endpoints on the inner classes.
        self._construct_tasks('scrapers')
        self._construct_tasks('processing')

        #self._analysis_task_constructor()
        self._construct_tasks('analysis')
        self._construct_tasks('clients')
        self._construct_tasks('importers_exporters')
        self._construct_tasks('rssscrapers')

        if verbose:
            logger.setLevel('INFO')
            logger.info("Providing verbose output")
        if debug:
            logger.setLevel('DEBUG')
            logger.debug("Activating debugmode")

    # FIX: the original source defined `class analysis` twice with different
    # docstrings; the second definition silently shadowed the first, so only
    # one definition (with the effective docstring) is kept.
    class analysis():
        '''Perform and summarize analysis done on documents'''
        pass

    class scrapers():
        '''Scrapers for various (news) outlets'''
        pass

    class rssscrapers():
        '''RSS-based scrapers for various (news) outlets'''
        pass

    class processing():
        '''Processing options to operate on documents'''
        pass

    def _analysis_task_constructor(self):
        """Construct endpoints specifically for analysis tasks

        This function is used when analysis tasks are encountered. The Analysis
        sub-classes include functionality for fitting, predicting, plotting,
        updating and explaining results.

        """

        target_functions = [
            'fit', 'predict', 'plot', 'interpretation', 'quality'
        ]

        for k, v in self._taskmaster.tasks.items():
            functiontype = k.split('.', 1)[0]
            taskname = k.rsplit('.', 1)[1]
            if functiontype == "analysis":

                analysis_class = self._taskmaster.tasks[k]

                # Wrap the method so generators and plain callables are
                # exposed uniformly (binds `method` per iteration).
                def makefunc(method):
                    if inspect.isgeneratorfunction(method):

                        def endpoint(*args, **kwargs):
                            for i in method(*args, **kwargs):
                                yield i
                    else:

                        def endpoint(*args, **kwargs):
                            return method(*args, **kwargs)

                    return endpoint

                class analysis_placeholder:
                    pass

                analysis_placeholder.__doc__ = analysis_class.__doc__

                for method in target_functions:
                    endpoint = getattr(analysis_class, method)
                    setattr(analysis_placeholder, method, endpoint)

                setattr(getattr(self, "analysis"), taskname,
                        analysis_placeholder)

    class clients():
        '''Clients to access (social media) APIs'''
        pass

    class importers_exporters():
        '''Importing functions to ingest data '''
        pass

    def _construct_tasks(self, function):
        """Construct the appropriate endpoints from Celery tasks

        This function serves to create the appropriate functions in the Inca
        object by introspecting available functions from the celery taskmaster.
        Subclasses of Task should then be added automatically.

        Parameters
        ----
        function : string
            The type of function to add, such as 'scrapers' or 'processors'

        Returns
            None


        """
        for k, v in self._taskmaster.tasks.items():
            functiontype = k.split('.', 1)[0]
            taskname = k.rsplit('.', 1)[1]
            if functiontype == function:
                target_task = self._taskmaster.tasks[k]
                target_task.prompt = self._prompt

                # A "main" client class (named after its service) gets the
                # app/credential management endpoints instead of a runwrap.
                is_client_main_class = hasattr(
                    target_task, "service_name"
                ) and target_task.__name__ == target_task.service_name
                if is_client_main_class:
                    setattr(
                        getattr(self,
                                function), "{service_name}_create_app".format(
                                    service_name=target_task.service_name),
                        target_task.add_application)
                    setattr(
                        getattr(self,
                                function), "{service_name}_remove_app".format(
                                    service_name=target_task.service_name),
                        target_task.remove_application)
                    setattr(
                        getattr(self, function),
                        "{service_name}_create_credentials".format(
                            service_name=target_task.service_name),
                        target_task.add_credentials)
                else:
                    setattr(getattr(self, function), taskname,
                            target_task.runwrap)
                function_class = getattr(self, function)
                leaf_class = self._taskmaster.tasks[k]
                method = leaf_class.runwrap

                # Wrap runwrap so generator tasks and plain tasks are exposed
                # uniformly; binding `method` as an argument avoids the
                # late-binding closure pitfall inside the loop.
                def makefunc(method):
                    if inspect.isgeneratorfunction(method):

                        def endpoint(*args, **kwargs):
                            for i in method(*args, **kwargs):
                                yield i
                    else:

                        def endpoint(*args, **kwargs):
                            return method(*args, **kwargs)

                    return endpoint

                endpoint = makefunc(method)
                # Pick the most informative docstring available per task type.
                if function == 'scrapers' or function == 'rssscrapers':
                    docstring = self._taskmaster.tasks[k].get.__doc__
                elif function == "processing":
                    docstring = self._taskmaster.tasks[k].process.__doc__
                elif function == "importers_exporters":
                    t = self._taskmaster.tasks[k]
                    if hasattr(t, 'load'):
                        docstring = t.load.__doc__
                    else:
                        docstring = t.save.__doc__
                else:
                    docstring = self._taskmaster.tasks[k].__doc__
                endpoint.__doc__ = docstring
                endpoint.__name__ = leaf_class.__name__

                setattr(function_class, taskname, endpoint)

    def _summary(self):
        """Return a short text summary of the top-10 doctypes in the database."""
        summary = ''
        summary += '\nTop 10 document types currently in database:\n'
        contents = self.database.list_doctypes().items()
        # FIX: reuse `contents` instead of querying the database a second time.
        for k, v in sorted(contents,
                           key=lambda x: x[1],
                           reverse=True)[:10]:
            summary += "{k:30} : {v:10}\n".format(**locals())
        if len(contents) > 10:
            summary += "...\n"
        return summary