Exemplo n.º 1
0
def process_product(session_context, product_id, product=None, force_update=False):
    """ Processes a single product: prepares its product model and, when required, its terms
        and tf-idf templates.

        The terms and templates are only (re)generated when the product had no pre-existing
        product model or when `force_update` is True. If no product model could be generated,
        an error is logged and the terms/templates step is skipped altogether.

        The context filters cache of the session context is cleared at the end in every case.

        :param session_context: The session context. This function reads its `data_proxy` and
            `product_text_fields` and calls its `clear_context_filters_cache()`.
        :param product_id: The id of the intended product.
        :param product: The raw product, if the caller has already loaded it. When None, the
            product is fetched through the data proxy.
        :param force_update: If True, the product terms are regenerated even when a product
            model already existed.
        :raises ValueError: if `product` is None and no product with the given id is found.
    """
    log.info("Processing product [%s]" % product_id)
    start = time()

    if product is None:
        product = session_context.data_proxy.fetch_products([product_id]).get(product_id)
        if product is None:
            raise ValueError("No product exists in the db with id [%s]" % product_id)

    log.info("Product [{0}] loaded".format(product_id))

    product_model, has_pre_existing_product_model = prd.prepare_product_model(
        session_context, product, force_update=force_update)

    if product_model is None:
        # Without a product model there is no language and nothing to extract terms from,
        # so the terms/templates step below must not run.
        log.error("Error while processing product [%s]: product model was not generated" % product_id)
    else:
        language = product_model.get_attribute("language")

        if not has_pre_existing_product_model or force_update:
            # Flattened up front so that a dict (never None) reaches prepare_product_terms,
            # even when there are no text fields to iterate over.
            product_model_as_dict = utils.flatten_dict(product_model.to_dict())
            product_as_dict = None  # the raw product is flattened lazily, on first need

            for attribute in session_context.product_text_fields:
                if attribute in product_model_as_dict:
                    continue
                # Text field absent from the product model: take it from the raw product.
                if product_as_dict is None:
                    product_as_dict = utils.flatten_dict(product)
                value = product_as_dict.get(attribute)
                if value is not None:
                    product_model_as_dict[attribute] = text.parse_text_to_stems(language, value)

            _, _, tfidf_by_top_term_by_attribute = prd.prepare_product_terms(
                session_context, product_model_as_dict, reprocessing_product=has_pre_existing_product_model)

            pt_tfidf.update_templates(session_context, product_id, language, tfidf_by_top_term_by_attribute)

    log.info("---Done processing product [%s] (took %.6f seconds)" % (product_id, time() - start))

    session_context.clear_context_filters_cache()
Exemplo n.º 2
0
    def from_dict(product_id, product_model_dict, validator):
        """ Converts a product model in the form of a dict into an instance of ProductModel.
            It differs from the constructor in that from_dict() expects a product model, whereas
            the constructor expects a raw product.

            The given dict is passed through utils.flatten_dict() before being handed over
            to ProductModel.

            :param product_id: The id of the intended product.
            :param product_model_dict: A dict with the attributes of the product model.
            :param validator: an instance of a ProductModelFactory.
            :returns: a ProductModel instance.
        """
        product_model_values = utils.flatten_dict(product_model_dict)
        return ProductModel(validator, product_id, product_model_values)
Exemplo n.º 3
0
def __process_product_terms(session_context, page, products_list, language, flush_size):
    """ Processes the terms of one page of products, flushing the resulting TF records.

        The page covers the slice of `products_list` that starts at
        `page * session_context.page_size_batch_process_products` and spans at most that many ids.
        Text fields that are not persisted in the product models are read from the raw products
        and stemmed before the terms are prepared.

        A product is skipped when it has no product model, when stemming any of its non-persisted
        text attributes raises, or when prepare_product_terms() returns None for it. Each skipped
        product is counted exactly once.

        :param session_context: The session context; a new session is derived from it via
            `new_session()` and used throughout.
        :param page: The zero-based index of the page to be processed.
        :param products_list: The list with the ids of all products, of which this call
            processes a single page.
        :param language: The language passed on to the text stemmer.
        :param flush_size: The number of accumulated TF records that triggers a flush.
        :returns: a tuple (df_by_term, skipped), where df_by_term maps each term to the number of
            times it was reported among the new terms of the processed products, and skipped
            is the number of products of the page that were not processed.
    """
    session_context = session_context.new_session()
    start_idx = page * session_context.page_size_batch_process_products
    end_idx = min((page + 1) * session_context.page_size_batch_process_products, len(products_list))

    page_product_ids = products_list[start_idx:end_idx]
    total_products = len(page_product_ids)

    tf_records = []
    df_by_term = {}

    product_models_map = session_context.data_proxy.fetch_product_models(page_product_ids)
    product_dicts_map = {p_id: utils.flatten_dict(p_model.to_dict()) for p_id, p_model in product_models_map.items()}

    # Products of the page for which no product model was fetched are skipped from the start.
    skipped = total_products - len(product_dicts_map)

    non_persisted_text_fields = set(session_context.product_text_fields) - \
                                session_context.product_model_factory.persisted_attributes
    if non_persisted_text_fields:
        # Fetches the non-persisted text attributes from the raw products collection and stemmizes them.
        products_map = session_context.data_proxy.fetch_products(product_ids=page_product_ids,
                                                                 fields_to_project=list(non_persisted_text_fields))
        for p_id, product in products_map.items():
            if p_id not in product_dicts_map:
                # No product model for this raw product: it is already included in `skipped`,
                # so it must not be counted a second time here.
                continue

            flat_product = utils.flatten_dict(product)
            stemmed_attributes_map = {}
            attributes_ok = True
            for attribute in non_persisted_text_fields:
                value = flat_product.get(attribute)
                if value is None:
                    continue
                try:
                    stemmed_attributes_map[attribute] = text.parse_text_to_stems(language, value)
                except Exception as err:
                    # Best effort: the product is discarded, but the remaining attributes are
                    # still tried so that every offending value gets logged.
                    log.error('Exception: {0}'.format(str(err)))
                    log.error('Offending value: {0}'.format(value))
                    attributes_ok = False

            if attributes_ok:
                product_dicts_map[p_id].update(stemmed_attributes_map)
            else:
                skipped += 1
                del product_dicts_map[p_id]

    for product_dict in product_dicts_map.values():

        product_terms_results = prepare_product_terms(session_context, product_dict, batch_processing=True)
        if product_terms_results is None:
            skipped += 1
            continue
        new_tf_records, new_terms, _ = product_terms_results

        tf_records.extend(new_tf_records)

        for term in new_terms:
            df_by_term[term] = df_by_term.get(term, 0) + 1

        if len(tf_records) >= flush_size:
            # NOTE(review): presumably _flush_tf_records empties `tf_records` in place; if it does
            # not, the same records would be flushed again on later iterations -- TODO confirm.
            _flush_tf_records(session_context, tf_records)

    if len(tf_records) > 0:
        _flush_tf_records(session_context, tf_records)

    return df_by_term, skipped