Exemplo n.º 1
0
    def handle(self, *args, **options):
        """Apply associations from ``entity_associations_by_name.json``.

        Each item of the loaded JSON is unpacked as ``(key, payload)`` and
        ``key`` as ``(store_id, entity_name)``. The entity is looked up by
        store and name; the item is skipped (with a printed message) when
        zero or several entities match, or when the referenced product or
        secondary product is not among the loaded products. Otherwise the
        entity's product, cell plan, product type, last association date and
        last association user are overwritten and the entity is saved.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open('entity_associations_by_name.json', 'r') as json_file:
            associations = json.load(json_file)

        products_dict = iterable_to_dict(Product, 'id')
        stores_dict = iterable_to_dict(Store, 'id')

        # NOTE(review): this unpacking assumes the JSON is a sequence of
        # [[store_id, entity_name], payload] pairs -- confirm with the export.
        for key, payload in associations:
            store_id, entity_name = key
            store = stores_dict[store_id]

            print('Finding entity with key: {} - {}'.format(
                store, entity_name))
            entities = Entity.objects.filter(store=store, name=entity_name)

            match_count = entities.count()

            if match_count == 0:
                print('No entity found')
                continue
            elif match_count > 1:
                # Ambiguous match: list the candidates and leave them alone.
                print('More than one entity found')
                for entity in entities:
                    print('* {} - {}'.format(entity.id, entity.url))
                continue

            entity = entities[0]
            print('Matching entity found: {}'.format(entity.id))

            product_id = payload['product']
            product = products_dict.get(product_id)

            if not product:
                print('No matching product found: {}'.format(product_id))
                continue

            secondary_product_id = payload['secondary_product']

            if secondary_product_id:
                secondary_product = products_dict.get(secondary_product_id)
                if not secondary_product:
                    # Report the id that was actually looked up (the original
                    # message printed the primary product id here).
                    print('No matching secondary product found: {}'.format(
                        secondary_product_id))
                    continue
            else:
                secondary_product = None

            entity.product_type_id = payload['product_type']
            entity.product = product
            entity.cell_plan = secondary_product
            # The payload carries only a date; store it as midnight UTC.
            entity.last_association = pytz.utc.localize(
                datetime.combine(dateparse.parse_date(payload['date']),
                                 datetime.min.time()))
            entity.last_association_user_id = payload['user']
            entity.save()
Exemplo n.º 2
0
    def conflicts(self):
        """Return groups of entities sharing the same store/product/cell plan.

        The groups are computed over ``self`` restricted to entities with a
        product and narrowed by ``get_available()``; a group is reported when
        it contains more than one entity.

        Returns:
            A list of dicts, ordered by store, product and cell plan, each
            with the keys ``'store'``, ``'product'``, ``'cell_plan'`` (``None``
            when the group has no cell plan) and ``'entities'``.
        """
        raw_conflicts = list(
            self.filter(product__isnull=False)
            .get_available()
            .values('store', 'product', 'cell_plan')
            .annotate(conflict_count=Count('pk'))
            .order_by('store', 'product', 'cell_plan')
            .filter(conflict_count__gt=1))

        if not raw_conflicts:
            # With no conflicts the Q() built below would stay empty, and an
            # empty Q matches every row: the whole Entity table would be
            # fetched and grouped only to return an empty list.
            return []

        store_ids = set()
        product_ids = set()

        entities_query = Q()
        for entry in raw_conflicts:
            store_ids.add(entry['store'])
            product_ids.add(entry['product'])
            if entry['cell_plan']:
                # Cell plans are resolved through the products lookup too.
                product_ids.add(entry['cell_plan'])

            entities_query |= Q(store=entry['store']) & \
                Q(product=entry['product']) & \
                Q(cell_plan=entry['cell_plan'])

        # NOTE(review): this lookup is not narrowed by get_available(), so a
        # group may list entities that were not counted above -- confirm
        # whether that is intended.
        entities = Entity.objects.filter(entities_query).select_related()

        entities_dict = {}
        for entity in entities:
            key = (entity.store_id, entity.product_id, entity.cell_plan_id)
            entities_dict.setdefault(key, []).append(entity)

        stores_dict = iterable_to_dict(Store.objects.filter(pk__in=store_ids))
        products_dict = iterable_to_dict(
            Product.objects.filter(pk__in=product_ids).select_related(
                'instance_model__model__category'))
        # Lets groups without a cell plan resolve to None below.
        products_dict[None] = None

        return [
            {
                'store': stores_dict[entry['store']],
                'product': products_dict[entry['product']],
                'cell_plan': products_dict[entry['cell_plan']],
                'entities': entities_dict[(entry['store'], entry['product'],
                                           entry['cell_plan'])],
            }
            for entry in raw_conflicts
        ]
    def bucket_results(self, entities, es_results, bucket_field='product_id'):
        """Group products into ordered buckets keyed by one of their specs.

        Args:
            entities: Iterable of mappings read through the ``'product'`` key
                (and, for DB orderings, through the ordering field).
            es_results: Search response whose hits carry a ``product_id`` in
                their ``_source``.
            bucket_field: Spec name used as the bucket key; a falsy value
                falls back to ``'product_id'``.

        Returns:
            An ``OrderedDict`` mapping each bucket value to an ``OrderedDict``
            of ``product -> {'ordering_value': ..., 'prices': ...}``.
        """
        ordering = self.ordering_or_default()
        prices_by_product_id = self.product_prices_dict(entities)

        product_ids = [row['product'] for row in entities]

        specs_by_product_id = {}
        for hit in es_results.to_dict()['hits']['hits']:
            specs_by_product_id[hit['_source']['product_id']] = hit['_source']

        instances_by_id = iterable_to_dict(
            Product.objects.filter(pk__in=product_ids))

        # Attach the search specs to every database instance.
        products_by_id = {}
        for product_id in product_ids:
            instance = instances_by_id[product_id]
            instance._specs = specs_by_product_id[product_id]
            products_by_id[product_id] = instance

        bucket_field = bucket_field or 'product_id'
        buckets = OrderedDict()

        if ordering not in self.DB_ORDERING_CHOICES:
            # The ordering came from the search results: follow their order
            # and strip the optional leading '-' to get the spec name.
            ordering_field = re.match(r'-?(.+)$', ordering).group(1)

            for es_product in es_results:
                product = products_by_id[es_product['product_id']]
                bucket = buckets.setdefault(
                    product.specs[bucket_field], OrderedDict())
                bucket[product] = {
                    'ordering_value': product.specs[ordering_field],
                    'prices': prices_by_product_id[product.id]
                }

            return buckets

        # The ordering came from the database: follow the entities' order.
        if ordering in self.PRICING_ORDERING_CHOICES:
            ordering_field = 'min_' + ordering
        else:
            ordering_field = ordering

        for row in entities:
            product = products_by_id[row['product']]
            bucket = buckets.setdefault(
                product.specs[bucket_field], OrderedDict())

            # Only the first row seen for a product defines its entry.
            if product in bucket:
                continue

            bucket[product] = {
                'ordering_value': row[ordering_field],
                'prices': prices_by_product_id[product.id]
            }

        return buckets
Exemplo n.º 4
0
    def products(self, request, pk, *args, **kwargs):
        """Return one page of the category's products plus aggregations.

        The query parameters are validated with the category's specs form.
        An invalid form yields a 400 response carrying the form errors.
        Otherwise the response contains ``'count'`` (total hits),
        ``'results'`` (serialized products of the requested page) and
        ``'aggs'`` (aggregations processed by the form).
        """
        category = self.get_object()
        form_class = category.specs_form()
        form = form_class(request.query_params)
        if not form.is_valid():
            return Response(form.errors, status=status.HTTP_400_BAD_REQUEST)

        es_products_search = form.get_es_products()

        paginator = ProductPagination()
        page = request.query_params.get(paginator.page_query_param, 1)
        try:
            page = int(page)
        except ValueError:
            page = 1
        # A page of zero or below would give a negative slice offset, which
        # the search object presumably rejects (elasticsearch-dsl does not
        # support negative slicing). Treat it like any other invalid page.
        page = max(page, 1)

        page_size = paginator.get_page_size(request)

        offset = (page - 1) * page_size
        upper_bound = page * page_size

        es_products_page = es_products_search[offset:upper_bound].execute()

        # Page contents

        product_ids = [
            es_product.product_id for es_product in es_products_page
        ]

        db_products = Product.objects.filter(
            pk__in=product_ids).select_related(
                'instance_model__model__category')
        db_products_dict = iterable_to_dict(db_products, 'id')

        # Keep the search ordering and attach the search document to each
        # database instance as its specs.
        products = []
        for es_product in es_products_page:
            db_product = db_products_dict[es_product.product_id]
            db_product._specs = es_product.to_dict()
            products.append(db_product)

        serializer = ProductSerializer(products,
                                       many=True,
                                       context={'request': request})

        # Overall aggregations

        aggs = form.process_es_aggs(es_products_page.aggs)

        return Response({
            'count': es_products_page.hits.total,
            'results': serializer.data,
            'aggs': aggs,
        })
Exemplo n.º 5
0
    def process_es_aggs(self, aggs):
        """Convert raw aggregations into per-field processed buckets.

        Every aggregation name is matched to the spec filter of the same
        name in ``self.category_specs_filters``; that filter's
        ``process_buckets`` is applied to the aggregation's
        ``['result']['buckets']``.

        Returns:
            A dict mapping each filter name to its processed buckets.
        """
        filters_by_name = iterable_to_dict(self.category_specs_filters, 'name')

        processed = {}
        for agg_name, agg_payload in aggs.to_dict().items():
            raw_buckets = agg_payload['result']['buckets']
            spec_filter = filters_by_name[agg_name]
            processed[spec_filter.name] = spec_filter.process_buckets(
                raw_buckets)

        return processed
Exemplo n.º 6
0
    def associate_related_cell_entities(self, user):
        """Associate same-store, same-name entities to this entity's product.

        Looks up the other entities sharing this entity's store and name,
        searches the cell plan category for plans whose association name
        matches their ``cell_plan_name`` and associates each entity that has
        a matching plan to ``self.product`` and that plan, unless it already
        points to both.

        Args:
            user: Passed through to ``entity.associate``.
        """
        from django.conf import settings

        assert self.cell_plan_name
        assert self.product

        print('Associating related entities for: {}'.format(self))

        siblings = Entity.objects.filter(
            store=self.store, name=self.name).exclude(pk=self.pk)
        sibling_plan_names = [sibling.cell_plan_name for sibling in siblings]

        plan_category = Category.objects.get(pk=settings.CELL_PLAN_CATEGORY)

        # The field name contains a dot, so it has to be passed as a dict.
        terms_filter = {'association_name.keyword': sibling_plan_names}
        plan_search = plan_category.es_search().filter('terms', **terms_filter)
        es_plans = plan_search[:100].execute()

        plan_ids = [es_plan.product_id for es_plan in es_plans]
        plans_by_id = iterable_to_dict(
            Product.objects.filter(pk__in=plan_ids))

        plans_by_name = {
            es_plan.association_name: plans_by_id[es_plan.product_id]
            for es_plan in es_plans
        }

        print('Related entities found:')
        for sibling in siblings:
            print('* {}'.format(sibling))

            if sibling.cell_plan_name not in plans_by_name:
                print('No matching cell plan found')
                continue

            plan = plans_by_name[sibling.cell_plan_name]
            print('Matching plan found: {}'.format(plan))

            # Skip entities that already carry this exact association.
            if sibling.product != self.product or sibling.cell_plan != plan:
                sibling.associate(user, self.product, plan)
    def product_prices_dict(self, entities):
        """Collect the price entries of each product.

        Args:
            entities: Iterable of mappings with the keys ``'currency'``,
                ``'product'`` and the four ``'min_*_price*'`` values.

        Returns:
            A dict mapping each product id to a list of price dicts, one per
            entry and in input order, with the currency resolved through the
            loaded ``Currency`` lookup.
        """
        currencies_by_id = iterable_to_dict(Currency)
        prices_by_product_id = {}

        for row in entities:
            currency = currencies_by_id[row['currency']]
            product_id = row['product']

            price_entry = {
                'currency': currency,
                'min_normal_price': row['min_normal_price'],
                'min_offer_price': row['min_offer_price'],
                'min_normal_price_usd': row['min_normal_price_usd'],
                'min_offer_price_usd': row['min_offer_price_usd'],
            }

            prices_by_product_id.setdefault(product_id, []).append(price_entry)

        return prices_by_product_id
Exemplo n.º 8
0
    def update_entities(self,
                        discover_urls_concurrency=None,
                        products_for_url_concurrency=None,
                        use_async=None, update_log=None):
        """Scrape the products and synchronize the related entities.

        Every existing entity of ``self.wtbentity_set`` is updated with the
        scraped product sharing its key (or with ``None`` when there is no
        such product); scraped products without an entity get a new one.

        Args:
            discover_urls_concurrency: Forwarded to ``scraper.products``.
            products_for_url_concurrency: Forwarded to ``scraper.products``.
            use_async: Forwarded to ``scraper.products``.
            update_log: Optional log object. Its status is set to
                ``IN_PROCESS`` at the start and to ``ERROR`` or ``SUCCESS``
                afterwards, and a registry file is stored for it.

        Raises:
            Exception: Whatever ``scraper.products`` raises is re-raised
                after being recorded in ``update_log``.
        """
        assert self.storescraper_class

        scraper = self.scraper

        if update_log:
            update_log.status = update_log.IN_PROCESS
            update_log.save()

        def registry_filename():
            # Name of the registry file, built from this object and the
            # localized creation date of the log.
            timestamp = timezone.localtime(
                update_log.creation_date).strftime('%Y-%m-%d_%X')
            return 'logs/scrapings/{}_{}.json'.format(self, timestamp)

        def log_update_error(exception):
            # Store the exception text as the registry file of the log.
            if not update_log:
                return

            update_log.status = update_log.ERROR
            target_name = registry_filename()
            error_storage = PrivateS3Boto3Storage()
            error_file = ContentFile(str(exception).encode('utf-8'))
            update_log.registry_file = error_storage.save(
                target_name, error_file)
            update_log.save()

        # First pass of product retrieval
        try:
            scraped_products_data = scraper.products(
                discover_urls_concurrency=discover_urls_concurrency,
                products_for_url_concurrency=products_for_url_concurrency,
                use_async=use_async
            )
        except Exception as error:
            log_update_error(error)
            raise

        scraped_products = scraped_products_data['products']
        pending_by_key = iterable_to_dict(scraped_products, 'key')

        existing_entities = self.wtbentity_set.select_related()

        categories_by_name = iterable_to_dict(Category, 'storescraper_name')

        # Products matched to an entity are popped, so what is left over
        # afterwards are the products that need a new entity.
        for entity in existing_entities:
            entity.update_with_scraped_product(
                pending_by_key.pop(entity.key, None))

        for new_product in pending_by_key.values():
            WtbEntity.create_from_scraped_product(
                new_product,
                self,
                categories_by_name[new_product.category]
            )

        if not update_log:
            return

        update_log.status = update_log.SUCCESS

        serialized_products = [
            product.serialize() for product in scraped_products
        ]

        storage = PrivateS3Boto3Storage()
        record_file = ContentFile(
            json.dumps(serialized_products, indent=4).encode('utf-8'))

        update_log.registry_file = storage.save(
            registry_filename(), record_file)

        update_log.save()
Exemplo n.º 9
0
    def handle(self, *args, **options):
        """Apply associations from ``entity_associations.json`` by URL.

        The JSON maps entity URLs to association data read through the keys
        ``'product'``, ``'secondary_product'``, ``'user'``,
        ``'product_type'``, ``'is_visible'`` and ``'date'``. A URL is skipped
        (with a printed message) when no entity or several entities use it,
        when the entity already matches the data, or when a referenced
        product is not among the loaded products. Otherwise the entity is
        updated and saved.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left to the garbage collector.
        with open('entity_associations.json', 'r') as json_file:
            associations = json.load(json_file)

        total_entity_count = len(associations)

        products_dict = iterable_to_dict(Product, 'id')
        categories_dict = iterable_to_dict(Category, 'id')

        # Several entities may share a URL, so group them in lists.
        entities_by_url = {}
        for entity in Entity.objects.all():
            entities_by_url.setdefault(entity.url, []).append(entity)

        for idx, (url, association_data) in enumerate(
                associations.items(), start=1):
            print('{} / {}: {}'.format(idx, total_entity_count, url))

            if url not in entities_by_url:
                print('No entity found')
                continue

            candidates = entities_by_url[url]
            if len(candidates) > 1:
                print('More than one entity found for URL')
                continue
            entity = candidates[0]

            unchanged = (
                entity.product_id == association_data['product']
                and entity.cell_plan_id ==
                association_data['secondary_product']
                and entity.last_association_user_id ==
                association_data['user']
                and entity.category_id == association_data['product_type']
                and entity.is_visible == association_data['is_visible'])
            if unchanged:
                print('No changes found, skipping')
                continue

            if association_data['product'] and association_data['product'] \
                    not in products_dict:
                print('Product not found ' +
                      str(association_data['product']))
                continue

            if association_data['secondary_product'] and \
                    association_data['secondary_product'] not in products_dict:
                print('Product not found ' +
                      str(association_data['secondary_product']))
                continue

            entity.product_id = association_data['product']
            entity.cell_plan_id = association_data['secondary_product']
            entity.last_association_user_id = association_data['user']
            if association_data['date']:
                # The data carries only a date; store it as midnight UTC.
                entity.last_association = pytz.utc.localize(
                    datetime.combine(
                        dateparse.parse_date(association_data['date']),
                        datetime.min.time()))
            else:
                entity.last_association = None
            entity.category = categories_dict[
                association_data['product_type']]
            entity.is_visible = association_data['is_visible']
            print('Saving entity')
            entity.save()
Exemplo n.º 10
0
    def update_with_scraped_products(self,
                                     categories,
                                     scraped_products,
                                     discovery_urls_without_products,
                                     update_log=None):
        """Synchronize the entities with an already scraped product list.

        Entities of ``self.entity_set`` that belong to ``categories`` or
        whose key matches a scraped product are updated with the product
        sharing their key (or with ``None`` when there is none); scraped
        products without an entity get a new one.

        Args:
            categories: Categories covered by the scraping.
            scraped_products: Scraped products exposing ``key``,
                ``category``, ``currency``, ``is_available()`` and
                ``serialize()``.
            discovery_urls_without_products: Collection that is counted and
                serialized into the registry file.
            update_log: Optional log object that receives the ``SUCCESS``
                status, the product counters and the registry file.
        """
        from solotodo.models import Currency, Entity

        assert self.is_active

        pending_by_key = iterable_to_dict(scraped_products, 'key')
        entities_to_be_updated = self.entity_set.filter(
            Q(category__in=categories)
            | Q(key__in=pending_by_key.keys())).select_related()

        categories_by_name = iterable_to_dict(Category, 'storescraper_name')
        currencies_by_code = iterable_to_dict(Currency, 'iso_code')

        # Products matched to an entity are popped, so what is left over
        # afterwards are the products that need a new entity.
        for entity in entities_to_be_updated:
            scraped = pending_by_key.pop(entity.key, None)

            category = None
            currency = None
            if scraped:
                category = categories_by_name[scraped.category]
                currency = currencies_by_code[scraped.currency]

            entity.update_with_scraped_product(scraped, category, currency)

        for scraped in pending_by_key.values():
            Entity.create_from_scraped_product(
                scraped,
                self,
                categories_by_name[scraped.category],
                currencies_by_code[scraped.currency],
            )

        if not update_log:
            return

        update_log.status = update_log.SUCCESS
        update_log.available_products_count = sum(
            1 for product in scraped_products if product.is_available())
        update_log.unavailable_products_count = sum(
            1 for product in scraped_products if not product.is_available())
        update_log.discovery_urls_without_products_count = len(
            discovery_urls_without_products)

        scraping_record = {
            'categories': [
                category.storescraper_name for category in categories
            ],
            'discovery_urls_without_products':
            discovery_urls_without_products,
            'products': [product.serialize() for product in scraped_products]
        }

        storage = PrivateS3Boto3Storage()
        record_file = ContentFile(
            json.dumps(scraping_record, indent=4).encode('utf-8'))

        timestamp = timezone.localtime(
            update_log.creation_date).strftime('%Y-%m-%d_%X')
        target_name = 'logs/scrapings/{}_{}.json'.format(self, timestamp)
        update_log.registry_file = storage.save(target_name, record_file)

        update_log.save()