Пример #1
0
class FileHandler(Configurable):
    """Base configurable shared by components that work on one file.

    Options:
        path: location of the file inside the filesystem service (required,
            positional).
        eol: line separator, a newline by default.
        mode: mode handed to the filesystem ``open()`` call.
        encoding: encoding handed to ``open()``, ``utf-8`` by default.

    Services:
        fs: filesystem object the file is opened from.
    """

    path = Option(
        filesystem_path,
        required=True,
        positional=True,
        __doc__="Path to use within the provided filesystem.",
    )  # type: str
    eol = Option(
        str,
        default="\n",
        __doc__="Character to use as line separator.",
    )  # type: str
    mode = Option(
        str,
        __doc__="What mode to use for open() call.",
    )  # type: str
    encoding = Option(
        str,
        default="utf-8",
        __doc__="Encoding.",
    )  # type: str
    fs = Service(
        "fs",
        __doc__="The filesystem instance to use.",
    )  # type: str

    @ContextProcessor
    def file(self, context, *, fs):
        """Open the configured file and yield the handle.

        The handle is closed when the generator is resumed and leaves the
        ``with`` block.
        """
        with self.open(fs) as handle:
            yield handle

    def open(self, fs):
        """Return ``fs.open()`` for the configured path, mode and encoding."""
        return fs.open(self.path, self.mode, encoding=self.encoding)
Пример #2
0
class OdooReader(OdooBase):
    """Read records through ``config.search_read`` and yield them.

    Options:
        domain: first positional argument passed to ``search_read`` after
            ``self.model`` (``self.model`` is not declared in this class).
        fields: when set (and the context has no output type), used as the
            context output fields; rows are then yielded as tuples ordered
            like the context output fields.
        limit: when truthy, forwarded to ``search_read`` as ``limit``.
    """

    # A callable default gives every instance its own empty list instead of
    # one list object shared through the class-level Option.
    # NOTE(review): relies on Option invoking callable defaults - confirm
    # against the Option implementation in use.
    domain = Option(
        type=list,
        default=list,
    )
    fields = Option(
        type=list,
        default=list,
    )
    limit = Option(type=int, required=False)

    def read(self, context, *args, config, **kwargs):
        """Query ``config.search_read`` and yield one item per record.

        Args:
            context: execution context; its output fields are read and, when
                ``self.fields`` is set and no output type exists, written.
            *args: extra positional arguments appended after the domain.
            config: object exposing ``search_read(model, *args, **kwargs)``.
            **kwargs: extra keyword arguments forwarded to ``search_read``.

        Yields:
            The raw record when either the context has no output fields or
            ``self.fields`` is empty; otherwise a tuple holding, for each
            output field, ``record.get(field, False)``.
        """
        search_args = [self.domain, *args]
        search_kwargs = dict(kwargs)
        if self.limit:
            search_kwargs['limit'] = self.limit
        if self.fields and not context.output_type:
            context.set_output_fields(self.fields)
        fields = context.get_output_fields()
        results = config.search_read(self.model, *search_args, **search_kwargs)

        # Records are passed through untouched unless both the context and
        # this reader define fields, so skip building tuples in that case.
        if not fields or not self.fields:
            yield from results
            return

        for result in results:
            yield tuple(result.get(field, False) for field in fields)

    __call__ = read
Пример #3
0
class ExtractKeyedValue(Configurable):
    '''
    Given a `dict`, extract its `key` member (itself a dict) and yield a
    shallow copy of it.

    When `include_parent` is true (the default) the copy also gets a
    'parent_data' key holding the original dictionary. Nothing is yielded
    when the member is missing or empty.
    '''
    key = Option(str, required=True)
    include_parent = Option(bool, default=True)

    def __init__(self, *v, **kw):
        '''
        Sets the __name__ property to include the relevant options so that when the
        bonobo graph is serialized as a GraphViz document, different objects can be
        visually differentiated.
        '''
        super().__init__(*v, **kw)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    def __call__(self, data, *args, **kwargs):
        '''
        Yield a copy of `data[self.key]`, optionally linked back to `data`.

        Extra positional and keyword arguments are accepted and ignored.
        '''
        extracted = data.get(self.key)
        if not extracted:
            return
        child = dict(extracted.items())
        # Honour the declared option: it used to be ignored, so the parent
        # was always attached even when include_parent=False was requested.
        if self.include_parent:
            child['parent_data'] = data
        yield child
Пример #4
0
class GroupKeys(Configurable):
    """Move sets of top-level keys of a dict into named sub-dictionaries.

    `mapping` maps each new key to a spec dict with:
        'properties': iterable of source keys to gather from the input,
        'postprocess' (optional): a callable, or an iterable of callables,
            each called as `f(group, data)` and returning the new group.

    With `drop_empty` (default) falsy source values are left out of the group.
    All source keys are removed from the input once every group is built.
    """
    mapping = Option(dict)
    drop_empty = Option(bool, default=True)

    def __call__(self, data):
        consumed = set()
        for group_name, spec in self.mapping.items():
            source_keys = spec['properties']
            hooks = spec.get('postprocess')
            group = {}
            for name in source_keys:
                value = data.get(name)
                consumed.add(name)
                if not self.drop_empty or value:
                    group[name] = value
            if hooks:
                # A single callable is treated like a one-element chain.
                chain = [hooks] if callable(hooks) else hooks
                for hook in chain:
                    group = hook(group, data)
            data[group_name] = group
        # Source keys are dropped only after all groups have read them.
        for name in consumed:
            data.pop(name, None)
        return data
Пример #5
0
class Trace(Configurable):
    """Debugging node that prints selected records as they pass through.

    Each record is tagged with '__trace_id' (taken from the `trace_counter`
    service the first time it is seen) and '__trace_seq' (incremented on each
    later visit). Records whose id is in `ordinals` are pretty-printed; with
    `diff` enabled, an ndiff against the snapshot stored by the previous
    visit is printed instead when such a snapshot exists.
    """
    name = Option()
    diff = Option(default=False)
    ordinals = Option(default=(0,))
    trace_counter = Service('trace_counter')

    def __call__(self, thing: dict, trace_counter):
        id_key = '__trace_id'
        seq_key = '__trace_seq'
        if id_key in thing:
            thing[seq_key] += 1
        else:
            thing[id_key] = next(trace_counter)
            thing[seq_key] = 1
        trace_id = thing[id_key]
        seq = thing[seq_key]

        if trace_id not in self.ordinals:
            return thing

        # Bookkeeping keys are hidden from the printed snapshot.
        visible = {k: v for k, v in thing.items() if not k.startswith('__trace_')}
        formatted = pprint.pformat(visible)
        if formatted[0] == '{' and formatted[-1] == '}':
            # adding newlines and a trailing comma helps with making a sensible diff
            formatted = '{\n ' + formatted[1:-1] + ',\n}\n'

        if self.diff:
            previous = thing.get('__trace_%d_%d' % (trace_id, seq - 1))
            print('===========> %s #%d: sequence %d' % (self.name, trace_id, seq))
            if previous:
                delta = difflib.ndiff(
                    previous.splitlines(keepends=True),
                    formatted.splitlines(keepends=True),
                )
                sys.stdout.writelines(delta)
            else:
                print(formatted)
        else:
            print(formatted)

        # Keep this snapshot so a later Trace node can diff against it.
        thing['__trace_%d_%d' % (trace_id, seq)] = formatted
        return thing
Пример #6
0
class GroupRepeatingKeys(Configurable):
    """Collect numbered key families (`prefix_1`, `prefix_2`, ...) into lists.

    `mapping` maps each new key to a spec dict with:
        'prefixes': iterable of key prefixes; for i = 1, 2, ... the keys
            `f'{prefix}_{i}'` are gathered into one dict keyed by prefix,
        'postprocess' (optional): a callable, or an iterable of callables,
            each called as `f(group, data)`; a falsy result drops the group.

    Gathering stops at the first index for which one of the keys is missing.
    With `drop_empty` (default) groups whose values are all falsy are skipped.
    Every key that was read is deleted from the input afterwards.
    """
    mapping = Option(dict)
    drop_empty = Option(bool, default=True)

    def __call__(self, data):
        for key, mapping in self.mapping.items():
            # Materialise once so one-shot iterables can be re-read per index.
            prefixes = tuple(mapping['prefixes'])
            postprocess = mapping.get('postprocess')
            data[key] = []
            if not prefixes:
                # The counting loop below only ends on a missing key; with no
                # prefixes nothing is ever looked up and it would never end.
                continue
            to_delete = set()
            # NOTE(review): this also swallows a KeyError raised by a
            # postprocess callable, which silently ends the grouping.
            with suppress(KeyError):
                for i in itertools.count(1):
                    subd = {}
                    for prefix in prefixes:
                        k = f'{prefix}_{i}'
                        subd[prefix] = data[k]
                        to_delete.add(k)
                    if self.drop_empty and not any(subd.values()):
                        continue
                    if postprocess and subd:
                        if callable(postprocess):
                            postprocess = [postprocess]
                        for p in postprocess:
                            subd = p(subd, data)
                            if not subd:
                                break
                    if subd:
                        data[key].append(subd)
            for k in to_delete:
                del data[k]
        return data
Пример #7
0
class MethodBasedConfigurable(Configurable):
    """Configurable whose call behaviour is delegated to a ``handler`` Method.

    Declares ``handler`` as a ``Method`` and ``foo``/``bar`` as options
    (``foo`` being positional).
    """
    handler = Method()
    foo = Option(positional=True)
    bar = Option()

    def __call__(self, *args, **kwargs):
        # Forward every argument to the handler; its result is not returned,
        # so calling the instance always evaluates to None.
        self.handler(*args, **kwargs)
class OpendatasoftExtract(Configurable):
    """Resolve the download URL and version of an Opendatasoft dataset export.

    Options (all required, positional): `portal` (base URL), `dataset_id`
    and `format` (the `rel` of the wanted export link).
    Service: `http`, an object exposing `get(url)` whose responses provide
    `.ok` and `.json()`.
    """
    portal = Option(str, required=True, positional=True)
    dataset_id = Option(str, required=True, positional=True)
    format = Option(str, required=True, positional=True)

    http = Service('http')

    def __call__(self, http):
        """Yield one dict with the export `url` and the dataset `version`."""
        exports_url, str_date = self.get_metadata(http, self.portal, self.dataset_id)
        version = self.date2version(str_date)
        url = self.get_export_url(http, exports_url, self.format)
        yield {
            'url': url,
            'version': version,
        }

    def get_metadata(self, http, portal, dataset_id):
        """Fetch the dataset metadata.

        Returns:
            `[exports_url, str_date]`: the href of the link whose `rel` is
            'exports' and the `data_processed` metadata value.

        Raises:
            RuntimeError: the HTTP response is not ok.
            ValueError: the body is not JSON, an expected key is missing, or
                no 'exports' link is present.
        """
        url = f'{portal}/api/v2/catalog/datasets/{dataset_id}'
        result = http.get(url)
        if not result.ok:
            raise RuntimeError(f'Fails fetch metedata content from {url}')

        try:
            metadata = result.json()
        except ValueError as e:
            raise ValueError(f'Fails parse json metedata from {url}') from e

        try:
            str_date = metadata['dataset']['metas']['default']['data_processed']
            # A default is required: a bare next() would raise StopIteration,
            # which surfaces as an opaque RuntimeError once it reaches the
            # generator in __call__ (PEP 479).
            link = next((d for d in metadata['links'] if d['rel'] == 'exports'), None)
            exports_url = None if link is None else link['href']
        except KeyError as e:
            raise ValueError(f'Fails use metedata from {url}') from e

        if link is None:
            raise ValueError(f'Fails use metedata from {url}')

        return [exports_url, str_date]

    def date2version(self, str_date):
        """Turn the leading `YYYY-MM-DD` of an ISO date into `YYYY.MM.DD`."""
        # ISO date, just cut it
        return str_date[:10].replace('-', '.')

    def get_export_url(self, http, exports_url, format):
        """Return the href of the export link whose `rel` equals `format`.

        Raises:
            RuntimeError: the HTTP response is not ok, or the format is not
                listed among the export links.
            ValueError: the body is not JSON or an expected key is missing.
        """
        result = http.get(exports_url)
        if not result.ok:
            raise RuntimeError(f'Fails fetch export list from {exports_url}')

        try:
            exports = result.json()
        except ValueError as e:
            raise ValueError(f'Fails parse json export list from {exports_url}') from e

        try:
            # The None default makes the "not available" check below reachable.
            link = next((d for d in exports['links'] if d['rel'] == format), None)
        except KeyError as e:
            raise ValueError(f'Fails retrive export format {format} from {exports_url}') from e

        if not link:
            raise RuntimeError(f'Export format {format} from {exports_url} not available')

        return link['href']
Пример #9
0
class DateRangeNode(Configurable):
    """Source node yielding each item of ``date_range(start_date, end_date)``."""

    start_date = Option(positional=True, required=True)
    end_date = Option(positional=True, required=True)

    def __call__(self):
        # Delegate straight to the helper's iterable.
        yield from date_range(self.start_date, self.end_date)
Пример #10
0
class FileReader(Reader, FileHandler):
    """Component factory for file-like readers.

    Used directly, it reads a file and yields one row per line with the
    trailing "eol" characters removed. More specific readers (json, csv, ...)
    are usually built by extending it.
    """

    mode = Option(str, default='r', __doc__='''
        What mode to use for open() call.
    ''')  # type: str

    output_fields = Option(
        ensure_tuple,
        required=False,
        __doc__='''
        Specify the field names of output lines.
        Mutually exclusive with "output_type".
    '''
    )
    output_type = Option(
        required=False,
        __doc__='''
        Specify the type of output lines.
        Mutually exclusive with "output_fields".
    '''
    )

    @ContextProcessor
    def output(self, context, *args, **kwargs):
        """
        Apply the output_fields XOR output_type option to the context.

        Raises UnrecoverableError when both options are set.
        """
        fields, row_type = self.output_fields, self.output_type

        if fields and row_type:
            raise UnrecoverableError('Cannot specify both output_fields and output_type option.')

        if row_type:
            context.set_output_type(row_type)

        if fields:
            context.set_output_fields(fields)

        yield

    def read(self, file, *, fs):
        """
        Yield every line of the given file with any trailing characters
        found in ``self.eol`` stripped from its right end.
        """
        yield from (raw.rstrip(self.eol) for raw in file)

    __call__ = read
Пример #11
0
class FilterXPathEqual(Configurable):
    """Keep an element only if some node matched by `xpath` has text `value`.

    Returns NOT_MODIFIED on the first matching node, otherwise None.
    """
    xpath = Option(str, required=True)
    value = Option(str)

    def __call__(self, e):
        # any() stops at the first node whose text equals the expected value.
        matched = any(node.text == self.value for node in e.xpath(self.xpath))
        return NOT_MODIFIED if matched else None
Пример #12
0
class AddFieldNames(Configurable):
    """Pair positional values with field names and return them as a dict.

    `field_names` is either a sequence of names, or a dict of such sequences
    from which the entry for `key` is used (empty when absent).
    """
    key = Option(required=False)
    field_names = Option()

    def __call__(self, *data):
        values = data
        # A lone argument that is exactly a tuple or list is taken as the
        # row itself (exact type match on purpose; subclasses stay wrapped).
        if len(data) == 1 and type(data[0]) in (tuple, list):
            values = data[0]
        if isinstance(self.field_names, dict):
            names = self.field_names.get(self.key, [])
        else:
            names = self.field_names
        return dict(zip(names, values))
Пример #13
0
class OpenDataSoftAPI(Configurable):
    """Page through the records of an OpenDataSoft dataset and yield them.

    The request URL is built from `endpoint`, `scheme`, `netloc` and `path`
    plus the `dataset`, `timezone` and extra `kwargs` query parameters.
    Pages of `rows` records are fetched until an empty page is returned or,
    when `limit` is set, until the offset reaches it.
    """
    dataset = Option(str, required=True)
    endpoint = Option(str, default='{scheme}://{netloc}{path}')
    scheme = Option(str, default='https')
    netloc = Option(str, default='data.opendatasoft.com')
    path = Option(path_str, default='/api/records/1.0/search/')
    rows = Option(int, default=500)
    limit = Option(int, default=None)
    timezone = Option(str, default='Europe/Paris')
    kwargs = Option(dict, default=dict)

    @ContextProcessor
    def compute_path(self, context):
        """Yield the base URL (endpoint plus encoded query string)."""
        query = (
            ('dataset', self.dataset),
            ('timezone', self.timezone),
            *sorted(self.kwargs.items()),
        )
        base = self.endpoint.format(scheme=self.scheme, netloc=self.netloc, path=self.path)
        yield base + '?' + urlencode(query)

    @ContextProcessor
    def start(self, context, base_url):
        """Yield the mutable holder of the current record offset, from 0."""
        yield ValueHolder(0)

    def __call__(self, base_url, start, *args, **kwargs):
        """Fetch successive pages and yield one dict per record.

        Each yielded dict is the record's 'fields' mapping extended with a
        'geometry' entry (empty dict when the record has none).
        """
        while (not self.limit) or (self.limit > start):
            if self.limit:
                # Never request more than what is left before the limit.
                page_size = min(self.rows, self.limit - start)
            else:
                page_size = self.rows
            url = '{}&start={start}&rows={rows}'.format(base_url, start=start.value, rows=page_size)
            records = requests.get(url).json().get('records', [])

            if len(records) == 0:
                break

            for record in records:
                yield {**record.get('fields', {}), 'geometry': record.get('geometry', {})}

            start.value += self.rows
Пример #14
0
class HTTPGetExtract(Configurable):
    """Fetch `url` with the `http` service and yield the response content.

    The `content` option is declared but not read by `__call__`.
    """
    url = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')

    http = Service('http')

    def __call__(self, http):
        response = http.get(self.url)
        if response.ok:
            yield response.content
            return
        # Failed request: log the body, then abort with RuntimeError.
        logger.error(response.text)
        raise RuntimeError(f'Request fails: {self.url}')
Пример #15
0
class CurriedCSVReader(Configurable):
    '''
    This reader takes CSV filenames as input, and for each parses
    the CSV content and yields one item per row: a list of strings, or a
    dict keyed by `field_names` when that option is set.

    The row count is kept on the instance, so `limit` applies to the total
    number of rows read across all files handled by this instance.
    '''
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__=
        '''Limit the number of rows read (to allow early pipeline termination).''',
    )
    field_names = Option()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        '''
        Yield the rows of the CSV file at `path`, opened through `fs`.

        Nothing is read once `limit` rows have already been produced. With
        `field_names`, a row shorter than the name list raises IndexError.
        '''
        limit = self.limit
        count = self.count
        names = self.field_names
        if limit and count >= limit:
            return
        sys.stderr.write('============================== %s\n' % (path, ))
        try:
            # The declared mode/encoding options are now passed to open();
            # they used to be ignored in favour of the filesystem defaults.
            with fs.open(path, self.mode, encoding=self.encoding, newline='') as csvfile:
                for row in csv.reader(csvfile):
                    if limit and count >= limit:
                        break
                    count += 1
                    if names:
                        yield {name: row[i] for i, name in enumerate(names)}
                    else:
                        yield row
        finally:
            # Persist the count even when the consumer stops iterating early,
            # so the limit keeps holding for the next file.
            self.count = count

    __call__ = read
Пример #16
0
class Bobby(Configurable):
    """Configurable with two ``Method`` slots, two options and one context processor.

    ``foo`` is positional; ``bar`` is not required.
    """
    handler = Method()
    handler2 = Method()
    foo = Option(positional=True)
    bar = Option(required=False)

    @ContextProcessor
    def think(self, context):
        # Context processor yielding the constant string 'different'.
        yield 'different'

    def __call__(self, think, *args, **kwargs):
        # `think` is accepted but unused; each handler is called with a fixed
        # first argument followed by the remaining call arguments. The
        # handlers' results are discarded.
        self.handler('1', *args, **kwargs)
        self.handler2('2', *args, **kwargs)
Пример #17
0
class HTTPGet(Configurable):
    """Download the URL stored in a record and attach the body to it.

    `url` names the key holding the address to fetch; `content` names the
    key under which the response content is stored.
    """
    url = Option(str, required=False, default='url')
    content = Option(str, required=False, default='content')

    http = Service('http')

    def __call__(self, properties, http):
        target = properties[self.url]
        response = http.get(target)
        if not response.ok:
            # Failed request: log the body, then abort with RuntimeError.
            logger.error(response.text)
            raise RuntimeError(f'Request fails: {target}')
        properties[self.content] = response.content
        yield properties
Пример #18
0
class OverpassExtract(Configurable):
    """POST `query` to `overpass_url` and yield the response content."""
    query = Option(str, required=True, positional=True)
    overpass_url = Option(str, required=False, positional=False, default=OVERPASS_URL)

    http = Service('http')

    def __call__(self, http):
        response = http.post(self.overpass_url, data=self.query)
        if response.ok:
            yield response.content
            return
        # Failed query: log the body, then abort with RuntimeError.
        logger.error(response.text)
        raise RuntimeError('Overpass query fails')
Пример #19
0
class CurriedXMLReader(Configurable):
    '''
    Similar to XMLReader, this reader takes XML filenames as input, and for each parses
    the XML content and yields lxml.etree Element objects matching the given XPath
    expression.

    The element count is kept on the instance, so `limit` applies to the total
    number of elements yielded across all files handled by this instance.
    '''
    xpath = Option(str, required=True)
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__=
        '''Limit the number of rows read (to allow early pipeline termination).''',
    )
    verbose = Option(bool, default=False)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        '''
        Parse the XML file at `path` (opened through `fs`) and yield the
        elements matching `self.xpath`, stopping once `limit` is reached.
        '''
        limit = self.limit
        count = self.count
        if limit and count >= limit:
            return
        if self.verbose:
            sys.stderr.write('============================== %s\n' %
                             (path, ))
        # The whole document is parsed up front, so the handle can be
        # released here; `with` also closes it when parsing raises.
        with fs.open(path, self.mode, encoding=self.encoding) as file:
            root = lxml.etree.parse(file)
        try:
            for e in root.xpath(self.xpath):
                if limit and count >= limit:
                    break
                count += 1
                yield e
        finally:
            # Persist the count even when the consumer stops iterating early.
            self.count = count

    __call__ = read
Пример #20
0
class PreserveCSVFields(Configurable):
    """Store a 'name: value' text dump of a record under `key`.

    Fields are listed in `order` when given, otherwise in sorted key order;
    names missing from the record are rendered with an empty value.
    """
    key = Option(str, default='csv_line')
    order = Option(list, default=None)

    def __call__(self, data: dict):
        ordered_keys = self.order or sorted(data.keys())
        # One 'name: value' line per field, each newline-terminated.
        lines = [f'{k}: {data.get(k, "")}\n' for k in ordered_keys]
        data[self.key] = ''.join(lines)
        yield data
Пример #21
0
class MongoReader(Configurable):
    """Configurable bound to one collection of a MongoDB database.

    Options:
        database: database name (positional, defaults to 'scopus').
        collection: collection name (positional, defaults to '').

    Services:
        client: looked up under 'mongodb.client'; indexed by database name.
    """
    database = Option(str,
                      positional=True,
                      default='scopus',
                      __doc__='the mongodb database name')
    collection = Option(str,
                        positional=True,
                        default='',
                        __doc__='the mongodb collection name')
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        # Resolve the configured database, then the collection inside it.
        db = client[self.database]
        # NOTE(review): the collection is looked up but never queried and
        # nothing is returned or yielded - the method body looks truncated;
        # confirm the intended read logic against the original source.
        collection = db[self.collection]
Пример #22
0
class RecordCounter(Configurable):
    """Pass-through node counting records per `name` in the `counts` service.

    Every `self.mod` (100) records the running count is written to stderr
    on a single, carriage-return-refreshed line. The `verbose` option is
    declared but not read here.
    """
    counts = Service('counts')
    verbose = Option(bool, default=False)
    name = Option()

    def __init__(self, *args, **kwargs):
        # super() already binds the instance; passing `self` again would
        # inject it as an extra leading positional argument.
        super().__init__(*args, **kwargs)
        self.mod = 100

    def __call__(self, data, counts):
        """Increment `counts[self.name]` and return `data` unchanged."""
        counts[self.name] += 1
        count = counts[self.name]
        if count % self.mod == 0:
            print(f'\r{count} {self.name}', end='', file=sys.stderr)
        return data
Пример #23
0
class MongoWriter(Configurable):
    """Insert each incoming record into one MongoDB collection.

    Options:
        database: database name (positional, defaults to 'scopus').
        collection: collection name (positional, defaults to '').

    Services:
        client: looked up under 'mongodb.client'; indexed by database name.
    """
    database = Option(
        str,
        positional=True,
        default='scopus',
        __doc__='the mongo database',
    )
    collection = Option(
        str,
        positional=True,
        default='',
        __doc__='the mongo collection',
    )
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        # The record goes through fix_keys() before being inserted.
        target = client[self.database][self.collection]
        target.insert_one(fix_keys(args))
Пример #24
0
class CleanDateToSpan(Configurable):
    '''
    Given a key name, try to parse `input[key]` as a date or date range and
    build a `TimeSpan` object from the parsed date(s). On success the span is
    stored in `input[key + '_span']`.
    '''

    key = Option(str, required=True)
    optional = Option(bool, default=True)

    def __init__(self, *v, **kw):
        '''
        Include the configured key in __name__ so that nodes of this type can
        be told apart when the bonobo graph is rendered as a GraphViz document.
        '''
        super().__init__(*v, **kw)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    @staticmethod
    def string_to_span(value):
        '''
        Build a `model.TimeSpan` from a date string.

        Returns None (after printing a message) when anything goes wrong.
        '''
        stamp = "%Y-%m-%dT%H:%M:%SZ"
        try:
            date_from, date_to = date_cleaner(value)
            ts = model.TimeSpan()
            if date_from is not None:
                ts.begin_of_the_begin = date_from.strftime(stamp)
            if date_to is not None:
                ts.end_of_the_end = date_to.strftime(stamp)
        except Exception as e:
            print('*** Unknown date format %r: %s' % (value, e))
            return None
        return ts

    def __call__(self, data, *args, **kwargs):
        '''
        Return `data` with the '<key>_span' entry added, or NOT_MODIFIED when
        the key is absent or its value could not be parsed.
        '''
        if self.key not in data:
            if not self.optional:
                warnings.warn('*** key %r is not in the data object:' %
                              (self.key, ))
                pprint.pprint(data, stream=sys.stderr)
            return NOT_MODIFIED

        ts = self.string_to_span(data[self.key])
        if ts is None:
            return NOT_MODIFIED

        data['%s_span' % self.key] = ts
        return data
Пример #25
0
class IOFormatEnabled(Configurable):
    """Mixin converting call arguments and rows according to `ioformat`.

    Two formats are handled: `settings.IOFORMAT_ARG0` (one positional
    argument) and `settings.IOFORMAT_KWARGS` (keyword arguments only).
    """
    ioformat = Option(default=settings.IOFORMAT.get)

    def get_input(self, *args, **kwargs):
        """Return the single positional argument or the kwargs dict.

        Raises:
            ValueError: the arguments do not fit the configured format.
            NotImplementedError: the configured format is not handled.
        """
        fmt = self.ioformat

        if fmt == settings.IOFORMAT_ARG0:
            if len(args) != 1 or kwargs:
                raise ValueError(
                    'Wrong input formating: IOFORMAT=ARG0 implies one arg and no kwargs, got args={!r} and kwargs={!r}.'
                    .format(args, kwargs))
            return args[0]

        if fmt == settings.IOFORMAT_KWARGS:
            if args or not kwargs:
                raise ValueError(
                    'Wrong input formating: IOFORMAT=KWARGS ioformat implies no arg, got args={!r} and kwargs={!r}.'
                    .format(args, kwargs))
            return kwargs

        raise NotImplementedError('Unsupported format.')

    def get_output(self, row):
        """Return `row` as is (ARG0) or expanded into a `Bag` (KWARGS).

        Raises:
            NotImplementedError: the configured format is not handled.
        """
        fmt = self.ioformat

        if fmt == settings.IOFORMAT_ARG0:
            return row

        if fmt == settings.IOFORMAT_KWARGS:
            return Bag(**row)

        raise NotImplementedError('Unsupported format.')
Пример #26
0
class AddAuctionCatalog(Configurable):
    '''Attach a catalog model object to each record under the '_catalog' key.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        cno = data['catalog_number']

        # The sale type comes either from the shared `non_auctions` service
        # (catalogs branch, which lacks the flag) or from the record itself
        # (auction events branch); a value that is found is written back to
        # the service so the other branch can see it.
        known_type = non_auctions.get(cno, data.get('non_auction_flag'))
        if known_type:
            non_auctions[cno] = known_type
            sale_type = known_type
        else:
            sale_type = 'Auction'

        catalog = self.helper.catalog_text(cno, sale_type)
        cdata = {'uri': catalog.id}

        puid = data.get('persistent_puid')
        if puid:
            puid_id = self.helper.gri_number_id(puid)
            catalog.identified_by = puid_id
            cdata['identifiers'] = [puid_id]

        data['_catalog'] = add_crom_data(data=cdata, what=catalog)
        yield data
Пример #27
0
class RemoveKeys(Configurable):
    """Delete every key listed in `keys` from a record, ignoring absent ones."""
    keys = Option(set)

    def __call__(self, data: dict):
        for unwanted in self.keys:
            # pop() with a default is a no-op when the key is missing.
            data.pop(unwanted, None)
        return data
Пример #28
0
class AddDataDependentArchesModel(Configurable):
    '''
    Set the `_ARCHES_MODEL` key in the supplied `dict` to the appropriate arches model UUID
    and return it.

    The model is looked up in `models` by the class name of `data['_LOD_OBJECT']`.
    When no entry exists a message is printed and the placeholder
    `XXX-<class name>` is stored instead. Records without a `_LOD_OBJECT`
    get `models['LinguisticObject']`.
    '''
    models = Option()

    def __call__(self, data, *args, **kwargs):
        if '_LOD_OBJECT' not in data:
            data['_ARCHES_MODEL'] = self.models['LinguisticObject']
            return data

        typename = type(data['_LOD_OBJECT']).__name__
        # The earlier nested re-check of the same name could never succeed
        # once the first lookup had failed, so a single test is enough.
        if typename in self.models:
            data['_ARCHES_MODEL'] = self.models[typename]
        else:
            print(f'*** No Arches model available for {typename}')
            data['_ARCHES_MODEL'] = f'XXX-{typename}'
        return data
Пример #29
0
class PickleReader(FileReader, PickleHandler):
    """
    Reads a Python pickle object and yields its items as tuples.
    """

    mode = Option(str, default='rb')

    def read(self, file, context, *, fs):
        """
        Unpickle `file` and yield one tuple per item.

        A dict is iterated as (key, value) pairs, any other iterable item by
        item, and a non-iterable object as a single item. Unless the context
        already has an output type, the output fields are `self.fields` or,
        failing that, the first item (which is then not yielded).

        Raises ValueError when an item's length differs from the number of
        output fields.
        """
        # NOTE: pickle.load can execute arbitrary code; only read trusted files.
        data = pickle.load(file)

        # if the data is not iterable, then wrap the object in a list so it may be iterated
        if isinstance(data, dict):
            iterator = iter(data.items())
        else:
            try:
                iterator = iter(data)
            except TypeError:
                iterator = iter([data])

        if not context.output_type:
            fields = self.fields
            if not fields:
                # A bare next() on empty data would leak StopIteration out of
                # this generator (reported as RuntimeError); end quietly.
                missing = object()
                fields = next(iterator, missing)
                if fields is missing:
                    return
            context.set_output_fields(fields)
        fields = context.get_output_fields()
        fields_length = len(fields)

        for row in iterator:
            if len(row) != fields_length:
                raise ValueError(
                    'Received an object with {} items, expected {}.'.format(
                        len(row), fields_length))

            # Dict items are (key, value) tuples, which have no .values();
            # the previous row.values() call always failed for them.
            # NOTE(review): yielding the pair is the minimal fix - confirm
            # this is the intended row shape for dict pickles.
            yield tuple(row)

    __call__ = read
Пример #30
0
class Serializer(Configurable):
    """Serialize `data['_LOD_OBJECT']` with the record's factory.

    The factory's `toString(obj, compact)` result is stored under '_OUTPUT'.
    """
    compact = Option(default=True)

    def __call__(self, data: dict):
        data['_OUTPUT'] = data['_CROM_FACTORY'].toString(data['_LOD_OBJECT'], self.compact)
        return data