Example #1
def test_grouper():

    my_iterable = list(range(100))

    assert len(list(grouper(my_iterable, 10))) == 10

    my_iterable = list(range(100)) + [None]
    my_groups = list(grouper(my_iterable, 10))
    assert len(my_groups) == 11
    assert len(my_groups[10]) == 1
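
This test pins down one grouper contract: grouper(iterable, n) yields successive chunks of at most n items and does not pad the final, shorter chunk. A minimal sketch consistent with that behavior (an assumption for illustration, not necessarily the actual implementation used in these projects):

    from itertools import islice

    def grouper(iterable, n):
        """Yield lists of up to n items; the last list may be shorter."""
        # Repeatedly slice off n items until the iterator is exhausted;
        # iter() stops once the lambda returns the empty-list sentinel.
        iterator = iter(iterable)
        return iter(lambda: list(islice(iterator, n)), [])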
Example #2
    def get_items(self):
        """
        Custom get items to allow for incremental building for a whole set of stores
        """

        self.logger.info("Starting Website Builder")

        self.ensure_indexes()

        keys = self.get_keys()
        self.logger.info("Processing {} items".format(len(keys)))
        self.total = len(keys)

        # Chunk keys by chunk size for good data IO
        for chunked_keys in grouper(keys, self.chunk_size, None):
            chunked_keys = list(filter(None.__ne__, chunked_keys))

            docs = {
                d[self.materials.key]: d
                for d in self.materials.query(
                    criteria={self.materials.key: {"$in": chunked_keys}}
                )
            }
            self.add_thermo_docs(docs)
            self.add_aux_docs(docs)

            for d in docs.values():
                yield d
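
Note that this example (and Examples #5 and #8 below) calls grouper with a third argument and then strips None values from each chunk with filter(None.__ne__, ...). That calling convention matches the classic itertools "grouper" recipe, which pads the last chunk with a fillvalue; a sketch of that variant, assuming this is the signature those call sites were written against:

    from itertools import zip_longest

    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks, padding the last chunk with fillvalue."""
        # Classic itertools-recipe behavior: the final chunk is padded, which is
        # why call sites drop the padding with filter(None.__ne__, chunk) afterwards.
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)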
Example #3
    def get_groups_from_keys(self, keys) -> Set[Tuple]:
        """
        Get the groups by grouping_keys for these documents
        """

        grouping_keys = self.grouping_keys

        groups: Set[Tuple] = set()

        for chunked_keys in grouper(keys, self.chunk_size):
            docs = list(
                self.source.query(
                    criteria={self.source.key: {"$in": chunked_keys}},
                    properties=grouping_keys,
                )
            )

            sub_groups = set(
                tuple(get(d, prop, None) for prop in grouping_keys) for d in docs
            )
            self.logger.debug(f"Found {len(sub_groups)} subgroups to process")

            groups |= sub_groups

        self.logger.info(f"Found {len(groups)} groups to process")
        return groups
Example #4
File: builder.py Project: acrutt/maggma
    def run(self, log_level=logging.DEBUG):
        """
        Run the builder serially
        This is only intended for diagnostic purposes
        """
        # Set up logging
        root = logging.getLogger()
        root.setLevel(log_level)
        ch = TqdmLoggingHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        ch.setFormatter(formatter)
        root.addHandler(ch)

        self.connect()

        cursor = self.get_items()

        for chunk in grouper(tqdm(cursor), self.chunk_size):
            self.logger.info("Processing batch of {} items".format(
                self.chunk_size))
            processed_chunk = [self.process_item(item) for item in chunk]
            processed_items = [
                item for item in processed_chunk if item is not None
            ]
            self.update_targets(processed_items)

        self.finalize()
Example #5
    def get_items(self):

        self.logger.info("Starting {} Builder".format(self.__class__.__name__))

        self.ensure_indexes()

        if self.incremental:
            keys = source_keys_updated(source=self.source,
                                       target=self.target,
                                       query=self.query)
        else:
            keys = self.source.distinct(self.source.key, self.query)

        self.logger.info("Processing {} items".format(len(keys)))

        if self.projection:
            projection = list(
                set(self.projection + [self.source.key, self.source.lu_field]))
        else:
            projection = None

        self.total = len(keys)
        for chunked_keys in grouper(keys, self.chunk_size, None):
            chunked_keys = list(filter(None.__ne__, chunked_keys))
            for doc in list(
                    self.source.query(
                        criteria={self.source.key: {
                            "$in": chunked_keys
                        }},
                        properties=projection)):
                yield doc
Example #6
    def prechunk(self, number_splits: int) -> Iterator[Dict]:
        """
        Generic prechunk for map builder to perform domain-decomposition
        by the key field
        """
        self.ensure_indexes()
        keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)

        N = ceil(len(keys) / number_splits)
        for split in grouper(keys, N):
            yield {"query": {self.source.key: {"$in": list(split)}}}
Example #7
    def prechunk(self, number_splits: int) -> Iterator[Dict]:
        """
        Generic prechunk for group builder to perform domain-decomposition
        by the grouping keys
        """
        self.ensure_indexes()

        keys = self.get_ids_to_process()
        groups = self.get_groups_from_keys(keys)

        N = ceil(len(groups) / number_splits)
        for split in grouper(keys, N):
            yield {"query": dict(zip(self.grouping_keys, split))}
Example #8
    def get_items(self):
        # Borrowed from MapBuilder
        keys = self.propnet_store.distinct('task_id', criteria=self.criteria)
        containers = self.props + ['inputs']
        self.total = len(keys)
        for chunked_keys in grouper(keys, self.chunk_size, None):
            chunked_keys = list(filter(None.__ne__, chunked_keys))
            for doc in list(
                    self.propnet_store.query(
                        criteria={'task_id': {
                            "$in": chunked_keys
                        }},
                        properties=containers + ['task_id'],
                    )):
                yield doc
Example #9
    def run(self):
        """
        Run the builder serially

        Args:
            builder_id (int): the index of the builder in the builders list
        """
        self.connect()

        cursor = self.get_items()

        for chunk in grouper(cursor, self.chunk_size):
            self.logger.info("Processing batch of {} items".format(self.chunk_size))
            processed_items = [self.process_item(item) for item in chunk if item is not None]
            self.update_targets(processed_items)

        self.finalize(cursor)
Example #10
    def process(self, builder_id):
        """
        Run the builder using the builtin multiprocessing.
        Adapted from pymatgen-db

        Args:
            builder_id (int): the index of the builder in the builders list
        """
        builder = self.builders[builder_id]
        chunk_size = builder.chunk_size
        processing_builder = reload_msonable_object(builder)

        process_pool = Pool(self.num_workers, maxtasksperchild=chunk_size)
        cursor = builder.get_items()
        for items in grouper(
                process_pool.imap(processing_builder.process_item, cursor),
                chunk_size):
            self.logger.info("Completed {} items".format(chunk_size))
            builder.update_targets(items)

        builder.finalize(cursor)
Example #11
    def remove_docs(self, criteria: Dict, remove_s3_object: bool = False):
        """
        Remove docs matching the query dictionary

        Args:
            criteria: query dictionary to match
            remove_s3_object: whether to remove the actual S3 Object or not
        """
        if not remove_s3_object:
            self.index.remove_docs(criteria=criteria)
        else:
            to_remove = self.index.distinct(self.key, criteria=criteria)
            self.index.remove_docs(criteria=criteria)

            # Can remove up to 1000 items at a time via boto
            to_remove_chunks = list(grouper(to_remove, n=1000))
            for chunk_to_remove in to_remove_chunks:
                objlist = [{
                    "Key": self.sub_dir + obj
                } for obj in chunk_to_remove]
                self.s3_bucket.delete_objects(Delete={"Objects": objlist})
Example #12
    def process(self, builder_id):
        """
        Run the builder serially

        Args:
            builder_id (int): the index of the builder in the builders list
        """
        builder = self.builders[builder_id]
        chunk_size = builder.chunk_size

        # establish connection to the sources and targets
        builder.connect()

        cursor = builder.get_items()

        for chunk in grouper(cursor, chunk_size):
            self.logger.info("Processing batch of {} items".format(chunk_size))
            processed_items = [
                builder.process_item(item) for item in filter(None, chunk)
            ]
            builder.update_targets(processed_items)
Example #13
    def get_items(self):
        """
        Generic get items for Map Builder designed to perform
        incremental building
        """

        self.logger.info("Starting {} Builder".format(self.__class__.__name__))

        self.ensure_indexes()

        keys = self.target.newer_in(self.source,
                                    criteria=self.query,
                                    exhaustive=True)
        if self.retry_failed:
            failed_keys = self.target.distinct(
                self.target.key, criteria={"state": "failed"})
            keys = list(set(keys + failed_keys))

        self.logger.info("Processing {} items".format(len(keys)))

        if self.projection:
            projection = list(
                set(self.projection +
                    [self.source.key, self.source.last_updated_field]))
        else:
            projection = None

        self.total = len(keys)
        for chunked_keys in grouper(keys, self.chunk_size):
            chunked_keys = list(chunked_keys)
            for doc in list(
                    self.source.query(
                        criteria={self.source.key: {
                            "$in": chunked_keys
                        }},
                        properties=projection,
                    )):
                yield doc
Example #14
    def _request_with_fewer_props(self, n, k, reduce_batch_on_fail=False):
        """
        Requests the nth page of k results from the AFLUX API, using batches of properties.

        The algorithm divides the number of properties into x chunks, starting with x = 2,
        and requests each chunk. If one of the chunks fails, optionally, the batch size is
        reduced according to `_request_with_smaller_batch()`. If the chunk continues to fail,
        x is increased by 1, the properties are re-chunked and re-requested. This proceeds
        until each chunk contains only one property. If the query still fails, an error is raised.

        Args:
            n (int): page number of the results to return.
            k (int): number of datasets per page.
            reduce_batch_on_fail (bool): True causes batch size to decrease if a query fails
                to produce results prior to decreasing the chunk size. False does not decrease
                the batch size. Default: False

        Returns:
            dict: cumulative response from API
        """

        if len(self.responses) == 0:
            # We are making the very first request, finalize the query.
            self.finalize()

        collected_responses = defaultdict(dict)
        props = self.selects
        chunks = 2

        # Split up current query matchbook to recover filters
        matchbook_splitter = re.compile(r"(?!\'),(?<!\')")
        filter_identifier = re.compile(r"\(.+\)")
        current_matchbook = self._matchbook
        split_matchbook = matchbook_splitter.split(current_matchbook)
        orderby_kw = split_matchbook[0]      # Preserves orderby keyword
        filters = []
        for item in split_matchbook[1:]:
            if filter_identifier.search(item):
                filters.append(item)

        while len(props) // chunks >= 1:
            if len(props) / chunks < 2:
                chunks = len(props) + 1
            query_error = False
            for chunk in grouper(props, (len(props) // chunks) + 1):
                logger.debug('Requesting property chunk {} with {} records'.format(chunks, k))
                props_to_request = list(set(c for c in chunk if c is not None))

                # Exclude orderby keyword if it is not requested in this chunk.
                # If it is included, remove from requested properties to avoid duplication in URI
                orderby_prop = None
                orderby_str = None
                for prop in props_to_request:
                    if orderby_kw.startswith(prop):
                        if orderby_kw.startswith('$'):
                            orderby_str = orderby_kw[1:]
                        else:
                            orderby_str = orderby_kw
                        orderby_prop = prop
                        break

                if orderby_prop:
                    props_to_request.remove(orderby_prop)
                else:
                    if orderby_kw.startswith('$'):
                        orderby_str = orderby_kw
                    else:
                        orderby_str = '$' + orderby_kw
                matchbook_list = [orderby_str] + filters + props_to_request

                query = AflowAPIQuery(catalog=self.catalog, batch_reduction=reduce_batch_on_fail)
                query.finalize()
                query._matchbook = ",".join(matchbook_list)

                try:
                    query._request(n, k)
                except ValueError:      # pragma: no cover
                    query_error = True
                if not query_error:
                    response = query.responses[n]
                    for record_key, record in response.items():
                        collected_responses[record_key].update(record)
                else:       # pragma: no cover
                    break

            if query_error:     # pragma: no cover
                chunks += 1
            else:
                return collected_responses

        raise ValueError("The API failed to complete the request "      # pragma: no cover
                         "and reducing the number of properties failed to fix it.")
Example #15
def serial(builder: Builder):
    """
    Runs the builders using a single process
    """

    logger = logging.getLogger("SerialProcessor")

    builder.connect()

    cursor = builder.get_items()

    total = None
    if isinstance(cursor, GeneratorType):
        try:
            cursor = primed(cursor)
            if hasattr(builder, "total"):
                total = builder.total
        except StopIteration:
            pass

    elif hasattr(cursor, "__len__"):
        total = len(cursor)  # type: ignore
    elif hasattr(cursor, "count"):
        total = cursor.count()  # type: ignore

    logger.info(
        f"Starting serial processing: {builder.__class__.__name__}",
        extra={
            "maggma": {
                "event": "BUILD_STARTED",
                "total": total,
                "builder": builder.__class__.__name__,
                "sources": [source.name for source in builder.sources],
                "targets": [target.name for target in builder.targets],
            }
        },
    )
    for chunk in grouper(tqdm(cursor, total=total), builder.chunk_size):
        logger.info(
            "Processing batch of {} items".format(builder.chunk_size),
            extra={
                "maggma": {
                    "event": "UPDATE",
                    "items": len(chunk),
                    "builder": builder.__class__.__name__,
                }
            },
        )
        processed_chunk = [builder.process_item(item) for item in chunk]
        processed_items = [
            item for item in processed_chunk if item is not None
        ]
        builder.update_targets(processed_items)

    logger.info(
        f"Ended serial processing: {builder.__class__.__name__}",
        extra={
            "maggma": {
                "event": "BUILD_ENDED",
                "builder": builder.__class__.__name__
            }
        },
    )
    builder.finalize()
Example #16
    def get_items(self):
        """
        Retrieves AFLOW data using the AFLUX API according to the specifications in the query
        configurations.

        Yields:
            tuple: The first item is an `aflow.entries.Entry` containing the material data
                and the second item is a list of targets for the data ('data' and/or 'auid')

        """
        kws = self.keywords.copy()
        for kw in ('auid', 'aurl', 'compound', 'files'):
            try:
                kws.remove(kw)
            except KeyError:
                pass

        for config_ in self.query_configs:
            logger.debug("Catalog {} selecting {}".format(
                config_['catalog'],
                'all' if not config_['select'] else config_['select']))

            if config_['select']:
                kws_to_chunk = config_['select']
            else:
                kws_to_chunk = self.keywords

            k = config_['k']
            filter_vals = config_['filter']

            chunk_idx = 0
            chunk_size = 5
            total_chunks = len(kws_to_chunk) // chunk_size + 1

            for chunk in grouper(kws_to_chunk, chunk_size):
                chunk_idx += 1
                logger.debug("Property chunk {} of {}".format(
                    chunk_idx, total_chunks))
                props = [getattr(AFLOW_KWS, c) for c in chunk if c is not None]
                if len(props) == 0:
                    continue
                data_query = self._get_query_obj(config_['catalog'], k,
                                                 config_['exclude'],
                                                 filter_vals)
                data_query.select(*props)
                success = False
                while not success:
                    try:
                        for entry in data_query:
                            yield entry, config_['targets']
                        success = True
                    except ValueError:
                        if data_query.N == 0:  # Empty query
                            raise ValueError(
                                "Query returned no results. Query config:\n{}".
                                format(config_))
                        else:  # pragma: no cover
                            logger.warning('Server error. ' +
                                           'Resting...starting {}'.format(
                                               datetime.datetime.now()))
                            time.sleep(120)
Example #17
    def get_items(self) -> Iterable:
        """
        Gets items from source_stores for processing.
        Items are retrieved in chunks based on a subset of
        key values set by chunk_size but are unsorted.

        Returns:
            generator of items to process
        """
        self.logger.info("Starting {} get_items...".format(
            self.__class__.__name__))

        # get distinct key values
        if len(self.query_by_key) > 0:
            keys = self.query_by_key
        else:
            unique_keys = set()  # type: Set
            for store in self.sources:
                store_keys = store.distinct(field=store.key)
                unique_keys.update(store_keys)
                if None in store_keys:
                    self.logger.debug(
                        "None found as a key value for store {} with key {}".
                        format(store.collection_name, store.key))
            keys = list(unique_keys)
            self.logger.info("{} distinct key values found".format(len(keys)))
            self.logger.debug(
                "None found in key values? {}".format(None in keys))

        # for every key (in chunks), query from each store and
        # project fields specified by projection_mapping
        for chunked_keys in grouper(keys, self.chunk_size):
            chunked_keys = [k for k in chunked_keys if k is not None]
            self.logger.debug(
                "Querying by chunked_keys: {}".format(chunked_keys))

            unsorted_items_to_process = []
            for store, projection in zip(self.sources,
                                         self.projection_mapping):

                # project all fields from store if corresponding element
                # in projection_mapping is an empty dict,
                # else only project the specified fields
                properties: Union[List, None]
                if projection == {}:  # all fields are projected
                    properties = None
                    self.logger.debug(
                        "For store {} getting all properties".format(
                            store.collection_name))
                else:  # only specified fields are projected
                    properties = [v for v in projection.values()]
                    self.logger.debug(
                        "For {} store getting properties: {}".format(
                            store.collection_name, properties))

                # get docs from store for given chunk of key values,
                # rename fields if specified by projection mapping,
                # and put in list of unsorted items to be processed
                docs = store.query(criteria={store.key: {
                    "$in": chunked_keys
                }},
                                   properties=properties)
                for d in docs:
                    if properties is None:  # all fields are projected as is
                        item = deepcopy(d)
                    else:  # specified fields are renamed
                        item = dict()
                        for k, v in projection.items():
                            item[k] = get(d, v)

                    # remove unneeded fields and add key value to each item
                    # key value stored under target_key is used for sorting
                    # items during the process_items step
                    for k in ["_id", store.last_updated_field]:
                        if k in item.keys():
                            del item[k]
                    item[self.target.key] = d[store.key]

                    unsorted_items_to_process.append(item)

                self.logger.debug(
                    "Example fields of one output item from {} store sent to process_items: {}"
                    .format(store.collection_name, item.keys()))

            yield unsorted_items_to_process
Example #18
    def get_items(self) -> Tuple[List[Dict], List[Dict]]:
        """
        Gets all materials to associate with SNLs
        Returns:
            generator of materials and SNLs that could match
        """
        self.logger.info("Provenance Builder Started")

        self.logger.info("Setting indexes")
        self.ensure_indicies()

        # Find all formulas for materials that have been updated since this
        # builder was last run
        q = {**self.query, "property_name": ProvenanceDoc.property_name}
        updated_materials = self.provenance.newer_in(
            self.materials,
            criteria=q,
            exhaustive=True,
        )
        forms_to_update = set(
            self.materials.distinct(
                "formula_pretty", {"material_id": {
                    "$in": updated_materials
                }}))

        # Find all new SNL formulas since the builder was last run
        for source in self.source_snls:
            new_snls = self.provenance.newer_in(source)
            forms_to_update |= set(source.distinct("formula_pretty", new_snls))

        # Now reduce to the set of formulas we actually have
        forms_avail = set(self.materials.distinct("formula_pretty",
                                                  self.query))
        forms_to_update = forms_to_update & forms_avail

        self.logger.info(
            f"Found {len(forms_to_update)} new/updated systems to proces")

        self.total = len(forms_to_update)

        for formulas in grouper(forms_to_update, self.chunk_size):
            snls = []
            for source in self.source_snls:
                snls.extend(
                    source.query(
                        criteria={"formula_pretty": {
                            "$in": formulas
                        }}))

            mats = list(
                self.materials.query(
                    properties=[
                        "material_id",
                        "last_updated",
                        "structure",
                        "initial_structures",
                        "formula_pretty",
                    ],
                    criteria={"formula_pretty": {
                        "$in": formulas
                    }},
                ))

            form_groups = defaultdict(list)
            for snl in snls:
                form_groups[snl["formula_pretty"]].append(snl)

            mat_groups = defaultdict(list)
            for mat in mats:
                mat_groups[mat["formula_pretty"]].append(mat)

            for formula, snl_group in form_groups.items():

                mat_group = mat_groups[formula]

                self.logger.debug(
                    f"Found {len(snl_group)} snls and {len(mat_group)} mats")
                yield mat_group, snl_group