def transform(
        self,
        *,
        corpus: WdcOffersCorpus,
        product_type_classifier: Optional[WdcProductTypeClassifier] = None,
        dimension_parser: Optional[WdcDimensionParser] = None,
        bucketer: Optional[WdcSizeBuckets] = None,
    ) -> Generator[Union[KgNode, KgEdge], None, None]:
        """
        Main functionality for transformer.

        Runs every corpus entry through the dimension parser and product
        type classifier, feeds each recognized (product type, dimensions)
        pair to the bucketer, then yields one edge for every ordered pair
        of generalized products.

        :param corpus: container for data points for easier parsing
        :param product_type_classifier: classifier used to determine the
            appropriate generic product category; defaults to
            WdcHeuristicProductTypeClassifier
        :param dimension_parser: parser used to determine dimensions;
            defaults to WdcParsimoniousDimensionParser
        :param bucketer: algorithm for finding generalized product type
            sizes and bucketing them accordingly; defaults to
            WdcHalfOrderSizeBuckets
        :return: KgEdge objects representing spatial relations between products
        """

        # Fall back to default collaborators when none are supplied.
        if not product_type_classifier:
            product_type_classifier = WdcHeuristicProductTypeClassifier()
        if not dimension_parser:
            dimension_parser = WdcParsimoniousDimensionParser()
        if not bucketer:
            bucketer = WdcHalfOrderSizeBuckets()

        self.__dimension_parser = dimension_parser
        self.__product_type_classifier = product_type_classifier
        self.__bucketer = bucketer

        # Parse the corpus and accumulate generalized product sizes.
        for entry in corpus.entries():
            for parsed_dimensions in self.__dimension_parser.parse(
                    entry=entry):
                for product_type in self.__product_type_classifier.classify(
                        entry=entry):
                    # Skip entries without a confidently classified type.
                    if not product_type or not product_type.expected:
                        continue
                    self.__bucketer.generalize(
                        wdc_product_type=product_type,
                        wdc_product_dimensions=parsed_dimensions.dimensions,
                    )

        # Emit one edge per ordered pair of generalized products.
        # Fix: removed leftover debug print() of each permutation, which
        # polluted stdout on every transform run.
        for left, right in itertools.permutations(
                self.__bucketer.averages.values(), 2):
            yield self.__find_predicate(left, right)
def test_wdc_rounded_size_bucket(wdc_large_offers_corpus: WdcOffersCorpus):
    """Buckets produced by WdcRoundedSizeBuckets equal the rounded volume."""
    bucketer = WdcRoundedSizeBuckets()
    classifier = WdcHeuristicProductTypeClassifier()
    parser = WdcParsimoniousDimensionParser()
    # Feed every recognized (product type, dimensions) pair to the bucketer.
    for entry in wdc_large_offers_corpus.entries():
        for parsed in parser.parse(entry=entry):
            for classified in classifier.classify(entry=entry):
                if classified and classified.expected:
                    bucketer.generalize(
                        wdc_product_type=classified,
                        wdc_product_dimensions=parsed.dimensions,
                    )
    # Each non-empty bucket must be numeric and equal the rounded volume.
    for generic in bucketer.averages.values():
        if not generic.bucket:
            continue
        assert type(generic.bucket) in (float, int)
        assert generic.bucket == int(round(generic.volume))
def test_heuristic_product_type_classifier(
        wdc_large_offers_corpus: WdcOffersCorpus):
    """Spot-check classifier output on selected corpus entries."""
    # entry index -> (expected category name, expected confidence string)
    expectations = {
        0: ("Electronics", "100.000%"),
        1: ("Jewelry", "100.000%"),
        2: ("Gourmet Food", "33.333%"),
        4: ("Pet Supplies", "66.667%"),
    }
    classifier = WdcHeuristicProductTypeClassifier()
    for index, entry in enumerate(wdc_large_offers_corpus.entries()):
        expected = expectations.get(index)
        if expected is None:
            continue
        expected_name, expected_confidence = expected
        item = next(classifier.classify(entry=entry))
        assert item.expected.name == expected_name
        assert f"{item.expected.confidence:.3%}" == expected_confidence
def test_wdc_half_order_size_buckets(wdc_large_offers_corpus: WdcOffersCorpus):
    """Half-order buckets stay within the logarithmic bucketing bound."""
    bucketer = WdcHalfOrderSizeBuckets()
    classifier = WdcHeuristicProductTypeClassifier()
    parser = WdcParsimoniousDimensionParser()
    for entry in wdc_large_offers_corpus.entries():
        for classified in classifier.classify(entry=entry):
            if not (classified and classified.expected):
                continue
            for parsed in parser.parse(entry=entry):
                bucketer.generalize(
                    wdc_product_type=classified,
                    wdc_product_dimensions=parsed.dimensions,
                )
    for generic in bucketer.averages.values():
        if not generic.bucket:
            continue
        assert type(generic.bucket) in (float, int)
        # Each bucket is either 1 or bounded by the log10-based formula.
        bound = (bucketer.num_buckets - 1) * (
            log(generic.volume * 10 / bucketer.max_volume) / log(10))
        assert generic.bucket == 1 or generic.bucket - 1 <= bound
def test_parsimonious_parser_large(wdc_large_offers_corpus: WdcOffersCorpus):
    """Spot-check parsed weights and accuracies on selected corpus entries."""
    # entry index -> (weight value, weight unit, expected accuracy)
    cases = {
        19: (1.0, "g", 0.75),
        20: (1.0, "g", 0.75),
        79: (1.7, "oz", 0.5),
        80: (1.7, "oz", 0.5),
    }
    for index, entry in enumerate(wdc_large_offers_corpus.entries()):
        case = cases.get(index)
        if case is None:
            continue
        expected_value, expected_unit, expected_accuracy = case
        parsed = WdcParsimoniousDimensionParser().parse(entry=entry)
        if not parsed:
            continue
        first = parsed[0]
        assert first.dimensions.weight.value == expected_value
        assert first.dimensions.weight.unit == expected_unit
        assert first.dimensions.accuracy(
            SOURCE_KEY[first.field]) == expected_accuracy
# Example #6
from mowgli_etl.pipeline.wdc.wdc_offers_corpus import WdcOffersCorpus
from mowgli_etl.pipeline.wdc.wdc_constants import WDC_ARCHIVE_PATH
from mowgli_etl.pipeline.wdc.parsimonious_parser.wdc_parsimonious_dimension_parser import (
    WdcParsimoniousDimensionParser as WPDP, )
import dataclasses

if __name__ == "__main__":
    # Scan the sample corpus and record the per-field minimum and maximum
    # dimension values (in English units).
    min_vals = dict()
    max_vals = dict()
    corpus = WdcOffersCorpus(wdc_json_file_path=WDC_ARCHIVE_PATH /
                             "offers_corpus_english_v2_1000.jsonl")
    dim_classifier = WPDP()
    for entry in corpus.entries():
        dimensions = dim_classifier.parse(entry=entry)
        for dimension in dimensions:
            # Normalize to English units before comparing values.
            dimension = dimension.dimensions.to_english()
            for field in dataclasses.fields(dimension):
                dim = getattr(dimension, field.name)
                if not dim:
                    continue
                if field.name not in min_vals:
                    min_vals[field.name] = dim.value
                else:
                    min_vals[field.name] = min(min_vals[field.name], dim.value)
                if field.name not in max_vals:
                    max_vals[field.name] = dim.value
                else:
                    # BUG FIX: previously took max(min_vals[...], dim.value),
                    # which could record a too-small maximum.
                    max_vals[field.name] = max(max_vals[field.name], dim.value)

    print("MINIMUM VALUES:\n")
    for key in sorted(min_vals):
        print(f"{key}: {min_vals[key]}")
    # Completed the truncated report loop and added the maximums section.
    print("\nMAXIMUM VALUES:\n")
    for key in sorted(max_vals):
        print(f"{key}: {max_vals[key]}")
# Example #7
def wdc_large_offers_corpus():
    """Build a corpus over the 1000-offer English sample archive."""
    corpus_path = WDC_ARCHIVE_PATH / "offers_corpus_english_v2_1000.jsonl"
    return WdcOffersCorpus(wdc_json_file_path=corpus_path)
# Example #8
def test_100_data_sample(wdc_large_offers_corpus: WdcOffersCorpus):
    """sample(100) yields exactly 100 entries."""
    sampled = sum(1 for _ in wdc_large_offers_corpus.sample(100))
    assert sampled == 100
# Example #9
def test_massive_data_sample(wdc_large_offers_corpus: WdcOffersCorpus):
    """Requesting far more samples than exist raises ValueError."""
    with pytest.raises(ValueError):
        for sampled in wdc_large_offers_corpus.sample(10**8):
            assert sampled is None