def handle(self, *args, **options):
    """Re-apply saved entity associations keyed by (store_id, entity name).

    Reads 'entity_associations_by_name.json', locates the single matching
    Entity per key and copies the payload's product/cell plan/association
    metadata onto it. Keys with zero or multiple matching entities, or with
    unknown product ids, are reported and skipped.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via json.load(open(...))).
    with open('entity_associations_by_name.json', 'r') as f:
        associations = json.load(f)

    products_dict = iterable_to_dict(Product, 'id')
    stores_dict = iterable_to_dict(Store, 'id')

    # NOTE(review): this unpacking assumes the JSON document is a list of
    # [key, payload] pairs, where key is [store_id, entity_name].
    for key, payload in associations:
        store_id, entity_name = key
        store = stores_dict[store_id]
        print('Finding entity with key: {} - {}'.format(
            store, entity_name))
        entities = Entity.objects.filter(store=store, name=entity_name)
        match_count = entities.count()

        # The (store, name) key must identify exactly one entity.
        if match_count == 0:
            print('No entity found')
            continue
        elif match_count > 1:
            print('More than one entity found')
            for entity in entities:
                print('* {} - {}'.format(entity.id, entity.url))
            continue

        entity = entities[0]
        print('Matching entity found: {}'.format(entity.id))

        product_id = payload['product']
        product = products_dict.get(product_id, None)
        if not product:
            print('No matching product found: {}'.format(product_id))
            continue

        secondary_product_id = payload['secondary_product']
        if secondary_product_id:
            secondary_product = products_dict.get(secondary_product_id, None)
            if not secondary_product:
                # Bug fix: the original formatted product_id here, hiding
                # which secondary product id actually failed to resolve.
                print('No matching secondary product found: {}'.format(
                    secondary_product_id))
                continue
        else:
            secondary_product = None

        entity.product_type_id = payload['product_type']
        entity.product = product
        entity.cell_plan = secondary_product
        # Stored date is a bare ISO date; anchor it at midnight UTC.
        entity.last_association = pytz.utc.localize(
            datetime.combine(dateparse.parse_date(payload['date']),
                             datetime.min.time()))
        entity.last_association_user_id = payload['user']
        entity.save()
def conflicts(self):
    """Return groups of available entities that share the same
    (store, product, cell_plan) triple.

    Each result entry is a dict with the resolved Store / Product /
    cell plan instances plus the list of conflicting Entity objects.
    """
    raw_conflicts = self.filter(product__isnull=False) \
        .get_available() \
        .values('store', 'product', 'cell_plan') \
        .annotate(conflict_count=Count('pk')) \
        .order_by('store', 'product', 'cell_plan') \
        .filter(conflict_count__gt=1)

    store_ids = set()
    product_ids = set()
    entities_query = Q()

    for entry in raw_conflicts:
        store_ids.add(entry['store'])
        product_ids.add(entry['product'])
        if entry['cell_plan']:
            product_ids.add(entry['cell_plan'])

        entities_query |= Q(store=entry['store']) & \
            Q(product=entry['product']) & \
            Q(cell_plan=entry['cell_plan'])

    # Guard added: with no conflicts, entities_query is an empty Q() and
    # Entity.objects.filter(Q()) would scan the whole entity table for
    # nothing. The observable result ([]) is unchanged.
    if not store_ids:
        return []

    entities = Entity.objects.filter(entities_query).select_related()

    # Group the conflicting entities by their identifying triple.
    entities_dict = {}
    for entity in entities:
        key = (entity.store_id, entity.product_id, entity.cell_plan_id)
        entities_dict.setdefault(key, []).append(entity)

    stores_dict = iterable_to_dict(Store.objects.filter(pk__in=store_ids))
    products_dict = iterable_to_dict(
        Product.objects.filter(pk__in=product_ids).select_related(
            'instance_model__model__category'))
    # Allow cell_plan=None lookups to resolve to None directly.
    products_dict[None] = None

    result = []
    for entry in raw_conflicts:
        result.append({
            'store': stores_dict[entry['store']],
            'product': products_dict[entry['product']],
            'cell_plan': products_dict[entry['cell_plan']],
            'entities': entities_dict[(entry['store'], entry['product'],
                                       entry['cell_plan'])]
        })

    return result
def bucket_results(self, entities, es_results, bucket_field='product_id'):
    """Group pricing entries into ordered buckets keyed by a spec field.

    Returns an OrderedDict mapping bucket value -> OrderedDict of
    product instance -> {'ordering_value', 'prices'}. Insertion order
    reflects the active ordering: DB orderings iterate ``entities`` in
    their given order; ES orderings iterate ``es_results`` instead.

    NOTE(review): assumes each entry in ``entities`` is a dict with at
    least a 'product' key, and that every product id appears in the ES
    hits — TODO confirm against callers.
    """
    ordering = self.ordering_or_default()
    product_id_to_prices = self.product_prices_dict(entities)

    product_ids = [entity['product'] for entity in entities]

    # Raw ES documents, keyed by product id, used to attach specs.
    product_id_to_specs = {
        entry['_source']['product_id']: entry['_source']
        for entry in es_results.to_dict()['hits']['hits']
    }

    product_id_to_instance = iterable_to_dict(
        Product.objects.filter(pk__in=product_ids))

    # Attach the ES spec document to each DB product instance so that
    # product.specs works without extra queries.
    product_id_to_full_instance = {}

    for product_id in product_ids:
        full_instance = product_id_to_instance[product_id]
        full_instance._specs = product_id_to_specs[product_id]
        product_id_to_full_instance[product_id] = full_instance

    if not bucket_field:
        bucket_field = 'product_id'

    bucketed_results = OrderedDict()

    if ordering in self.DB_ORDERING_CHOICES:
        # Pricing orderings map onto the aggregated 'min_*' entry fields.
        if ordering in self.PRICING_ORDERING_CHOICES:
            ordering_field = 'min_' + ordering
        else:
            ordering_field = ordering

        for entry in entities:
            product = product_id_to_full_instance[entry['product']]
            bucket = product.specs[bucket_field]

            if bucket not in bucketed_results:
                bucketed_results[bucket] = OrderedDict()

            # Only the first entry per product is kept (entities are
            # assumed pre-sorted by the DB ordering).
            if product not in bucketed_results[bucket]:
                bucketed_results[bucket][product] = {
                    'ordering_value': entry[ordering_field],
                    'prices': product_id_to_prices[product.id]
                }
    else:
        # Ordering was based on ES
        # Strip a leading '-' (descending marker) to get the spec field.
        ordering_field = re.match(r'-?(.+)$', ordering).groups()[0]

        for es_product in es_results:
            product = product_id_to_full_instance[es_product['product_id']]
            bucket = product.specs[bucket_field]

            if bucket not in bucketed_results:
                bucketed_results[bucket] = OrderedDict()

            bucketed_results[bucket][product] = {
                'ordering_value': product.specs[ordering_field],
                'prices': product_id_to_prices[product.id]
            }

    return bucketed_results
def products(self, request, pk, *args, **kwargs):
    """Paginated product listing for a category, filtered via its specs form.

    Validates the query params against the category's dynamic specs form,
    runs the resulting ES search for one page, hydrates the matching DB
    products with their ES spec documents, and returns count / results /
    aggs. Invalid form input yields a 400 with the form errors.
    """
    category = self.get_object()
    form_class = category.specs_form()
    form = form_class(request.query_params)

    if form.is_valid():
        es_products_search = form.get_es_products()

        # Manual pagination: the page is sliced on the ES query itself
        # rather than on a materialized queryset.
        paginator = ProductPagination()

        page = request.query_params.get(paginator.page_query_param, 1)
        try:
            page = int(page)
        except ValueError:
            # Non-numeric page parameters silently fall back to page 1.
            page = 1

        page_size = paginator.get_page_size(request)

        offset = (page - 1) * page_size
        upper_bound = page * page_size

        es_products_page = es_products_search[offset:upper_bound].execute()

        # Page contents
        product_ids = [es_product.product_id
                       for es_product in es_products_page]

        db_products = Product.objects.filter(
            pk__in=product_ids).select_related(
            'instance_model__model__category')
        db_products_dict = iterable_to_dict(db_products, 'id')

        # Rebuild the page in ES result order, attaching the ES document
        # as the product's specs.
        products = []
        for es_product in es_products_page:
            db_product = db_products_dict[es_product.product_id]
            db_product._specs = es_product.to_dict()
            products.append(db_product)

        serializer = ProductSerializer(products, many=True,
                                       context={'request': request})

        # Overall aggregations
        aggs = form.process_es_aggs(es_products_page.aggs)

        return Response({
            'count': es_products_page.hits.total,
            'results': serializer.data,
            'aggs': aggs,
        })
    else:
        return Response(form.errors, status=status.HTTP_400_BAD_REQUEST)
def process_es_aggs(self, aggs):
    """Convert raw ES aggregations into per-field processed buckets.

    For each aggregated field, delegates its raw 'result' buckets to the
    matching category spec filter's process_buckets() and keys the output
    by that field's name.
    """
    fields_by_name = iterable_to_dict(self.category_specs_filters, 'name')

    processed_aggs = {}
    for field_name, raw_field_agg in aggs.to_dict().items():
        spec_field = fields_by_name[field_name]
        processed_aggs[spec_field.name] = spec_field.process_buckets(
            raw_field_agg['result']['buckets'])

    return processed_aggs
def associate_related_cell_entities(self, user):
    """Propagate this entity's product association to sibling entities
    (same store, same name) whose cell plan names match known plans.

    Looks up cell plan products in ES by 'association_name', then calls
    entity.associate(...) on each sibling whose association would change.
    Requires this entity to already have a cell plan name and a product.
    """
    from django.conf import settings

    assert self.cell_plan_name
    assert self.product

    print('Associating related entities for: {}'.format(self))

    # Siblings: entities in the same store with the same name, excluding
    # this one.
    other_entities = Entity.objects.filter(
        store=self.store, name=self.name).exclude(pk=self.pk)

    other_entities_cell_plan_names = [
        e.cell_plan_name for e in other_entities
    ]

    cell_plan_category = Category.objects.get(
        pk=settings.CELL_PLAN_CATEGORY)

    filter_parameters = {
        'association_name.keyword': other_entities_cell_plan_names
    }

    # NOTE(review): result set is capped at 100 matches — confirm that is
    # enough for all sibling cell plan names.
    matching_cell_plans = cell_plan_category.es_search() \
        .filter('terms', **filter_parameters)[:100] \
        .execute()

    cell_plan_ids = [
        cell_plan.product_id for cell_plan in matching_cell_plans
    ]

    cell_plans = Product.objects.filter(pk__in=cell_plan_ids)
    cell_plans_dict = iterable_to_dict(cell_plans)

    # Re-key the dict from product id to the ES association_name so we
    # can look plans up by an entity's cell_plan_name.
    cell_plans_dict = {
        cell_plan.association_name: cell_plans_dict[cell_plan.product_id]
        for cell_plan in matching_cell_plans
    }

    print('Related entities found:')
    for entity in other_entities:
        print('* {}'.format(entity))
        if entity.cell_plan_name in cell_plans_dict:
            cell_plan = cell_plans_dict[entity.cell_plan_name]
            print('Matching plan found: {}'.format(cell_plan))
            # Only re-associate when something would actually change.
            if entity.product != self.product or \
                    entity.cell_plan != cell_plan:
                entity.associate(user, self.product, cell_plan)
        else:
            print('No matching cell plan found')
def product_prices_dict(self, entities):
    """Group per-entity minimum price data by product id.

    Each entry in ``entities`` is expected to carry 'product', 'currency'
    and the four 'min_*' price keys. Returns a dict mapping product id to
    a list of price dicts (currency instance plus the min price fields),
    preserving the input order within each product.
    """
    currencies_by_id = iterable_to_dict(Currency)
    prices_by_product = {}

    for price_entry in entities:
        price_fields = {
            'currency': currencies_by_id[price_entry['currency']],
            'min_normal_price': price_entry['min_normal_price'],
            'min_offer_price': price_entry['min_offer_price'],
            'min_normal_price_usd': price_entry['min_normal_price_usd'],
            'min_offer_price_usd': price_entry['min_offer_price_usd'],
        }
        prices_by_product.setdefault(
            price_entry['product'], []).append(price_fields)

    return prices_by_product
def update_entities(self, discover_urls_concurrency=None,
                    products_for_url_concurrency=None,
                    use_async=None, update_log=None):
    """Scrape this brand's products and sync its WTB entities.

    Runs the storescraper, updates every existing WtbEntity with its
    scraped counterpart (or None when it disappeared), and creates new
    entities for products not previously seen. When ``update_log`` is
    given, its status is tracked through IN_PROCESS/ERROR/SUCCESS and the
    serialized scraping output (or the error) is uploaded to S3.

    Raises whatever exception the scraper raises, after logging it.
    """
    assert self.storescraper_class

    scraper = self.scraper

    if update_log:
        update_log.status = update_log.IN_PROCESS
        update_log.save()

    # First pass of product retrieval

    def log_update_error(exception):
        # Persist the scraper failure to the update log and upload the
        # exception text in place of the usual scraping record.
        if update_log:
            update_log.status = update_log.ERROR
            desired_filename = 'logs/scrapings/{}_{}.json'.format(
                self, timezone.localtime(
                    update_log.creation_date).strftime('%Y-%m-%d_%X'))
            storage = PrivateS3Boto3Storage()
            real_filename = storage.save(
                desired_filename, ContentFile(
                    str(exception).encode('utf-8')))
            update_log.registry_file = real_filename
            update_log.save()

    try:
        scraped_products_data = scraper.products(
            discover_urls_concurrency=discover_urls_concurrency,
            products_for_url_concurrency=products_for_url_concurrency,
            use_async=use_async
        )
    except Exception as e:
        log_update_error(e)
        raise

    scraped_products = scraped_products_data['products']
    scraped_products_dict = iterable_to_dict(scraped_products, 'key')

    entities_to_be_updated = self.wtbentity_set.select_related()

    categories_dict = iterable_to_dict(Category, 'storescraper_name')

    # Update pass: pop each matched scraped product so that whatever
    # remains in the dict afterwards is genuinely new.
    for entity in entities_to_be_updated:
        scraped_product_for_update = scraped_products_dict.pop(
            entity.key, None)
        entity.update_with_scraped_product(scraped_product_for_update)

    # Creation pass: leftover scraped products have no matching entity.
    for scraped_product in scraped_products_dict.values():
        WtbEntity.create_from_scraped_product(
            scraped_product, self,
            categories_dict[scraped_product.category]
        )

    if update_log:
        update_log.status = update_log.SUCCESS

        serialized_scraping_info = [p.serialize() for p in scraped_products]

        storage = PrivateS3Boto3Storage()
        scraping_record_file = ContentFile(json.dumps(
            serialized_scraping_info, indent=4).encode('utf-8'))

        desired_filename = 'logs/scrapings/{}_{}.json'.format(
            self, timezone.localtime(update_log.creation_date).strftime(
                '%Y-%m-%d_%X'))
        real_filename = storage.save(desired_filename, scraping_record_file)
        update_log.registry_file = real_filename
        update_log.save()
def handle(self, *args, **options):
    """Re-apply stored entity associations keyed by entity URL.

    Reads 'entity_associations.json' (a mapping of URL -> association
    data), finds the single entity for each URL, and updates its product,
    cell plan, category, visibility and association metadata. Entities
    that are missing, ambiguous, already up to date, or reference unknown
    products are reported and skipped.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked it via json.load(open(...))).
    with open('entity_associations.json', 'r') as f:
        associations = json.load(f)

    total_entity_count = len(associations)

    products_dict = iterable_to_dict(Product, 'id')
    categories_dict = iterable_to_dict(Category, 'id')

    # Index every entity by URL; duplicates are kept so we can detect
    # and skip ambiguous URLs below.
    entities_by_url = {}
    for entity in Entity.objects.all():
        if entity.url not in entities_by_url:
            entities_by_url[entity.url] = [entity]
        else:
            entities_by_url[entity.url].append(entity)

    for idx, url in enumerate(associations):
        print('{} / {}: {}'.format(idx + 1, total_entity_count, url))
        association_data = associations[url]

        if url not in entities_by_url:
            print('No entity found')
            continue

        entity = entities_by_url[url]

        if len(entity) > 1:
            print('More than one entity found for URL')
            continue

        entity = entity[0]

        # Skip entities whose stored state already matches the payload.
        if entity.product_id == association_data['product'] \
                and entity.cell_plan_id == association_data[
                    'secondary_product'] \
                and entity.last_association_user_id == \
                association_data['user'] \
                and entity.category_id == \
                association_data['product_type'] \
                and entity.is_visible == \
                association_data['is_visible']:
            print('No changes found, skipping')
            continue

        if association_data['product'] and association_data['product'] \
                not in products_dict:
            print('Product not found ' + str(association_data['product']))
            continue

        if association_data['secondary_product'] and \
                association_data['secondary_product'] not in products_dict:
            print('Product not found ' +
                  str(association_data['secondary_product']))
            continue

        entity.product_id = association_data['product']
        entity.cell_plan_id = association_data['secondary_product']
        entity.last_association_user_id = association_data['user']

        # The stored date is a bare ISO date; anchor it at midnight UTC.
        if association_data['date']:
            entity.last_association = pytz.utc.localize(
                datetime.combine(
                    dateparse.parse_date(association_data['date']),
                    datetime.min.time()))
        else:
            entity.last_association = None

        entity.category = categories_dict[
            association_data['product_type']]
        entity.is_visible = association_data['is_visible']

        print('Saving entity')
        entity.save()
def update_with_scraped_products(self, categories, scraped_products,
                                 discovery_urls_without_products,
                                 update_log=None):
    """Sync this store's entities against a finished scraping run.

    Existing entities in the scraped categories (or matching a scraped
    key) are updated with their scraped counterpart — or with None when
    the product disappeared. Leftover scraped products become new
    entities. When ``update_log`` is given, it is marked SUCCESS, counters
    are filled in, and the serialized scraping output is uploaded to S3.
    """
    from solotodo.models import Currency, Entity

    assert self.is_active

    scraped_products_dict = iterable_to_dict(scraped_products, 'key')

    # Candidates for update: entities in the scraped categories plus any
    # whose key matches a scraped product regardless of category.
    entities_to_be_updated = self.entity_set.filter(
        Q(category__in=categories) |
        Q(key__in=scraped_products_dict.keys())).select_related()

    categories_dict = iterable_to_dict(Category, 'storescraper_name')
    currencies_dict = iterable_to_dict(Currency, 'iso_code')

    # Update pass: pop each matched scraped product so that whatever
    # remains in the dict afterwards is genuinely new.
    for entity in entities_to_be_updated:
        scraped_product_for_update = scraped_products_dict.pop(
            entity.key, None)

        if scraped_product_for_update:
            category = categories_dict[scraped_product_for_update.category]
            currency = currencies_dict[scraped_product_for_update.currency]
        else:
            # No scraped counterpart: signal the entity it is gone.
            category = None
            currency = None

        entity.update_with_scraped_product(scraped_product_for_update,
                                           category, currency)

    # Creation pass: leftover scraped products have no matching entity.
    for scraped_product in scraped_products_dict.values():
        Entity.create_from_scraped_product(
            scraped_product, self,
            categories_dict[scraped_product.category],
            currencies_dict[scraped_product.currency],
        )

    if update_log:
        update_log.status = update_log.SUCCESS

        update_log.available_products_count = len(
            list(filter(lambda x: x.is_available(), scraped_products)))
        update_log.unavailable_products_count = len(
            list(filter(lambda x: not x.is_available(), scraped_products)))
        update_log.discovery_urls_without_products_count = len(
            discovery_urls_without_products)

        serialized_scraping_info = {
            'categories': [c.storescraper_name for c in categories],
            'discovery_urls_without_products':
                discovery_urls_without_products,
            'products': [p.serialize() for p in scraped_products]
        }

        storage = PrivateS3Boto3Storage()
        scraping_record_file = ContentFile(
            json.dumps(serialized_scraping_info, indent=4).encode('utf-8'))

        desired_filename = 'logs/scrapings/{}_{}.json'.format(
            self, timezone.localtime(
                update_log.creation_date).strftime('%Y-%m-%d_%X'))
        real_filename = storage.save(desired_filename,
                                     scraping_record_file)
        update_log.registry_file = real_filename
        update_log.save()