Exemplo n.º 1
0
def get_store_deps(params):
    """
    Extract the departments of a retailer store.

    Requests the store endpoint (``url_store`` formatted with the store's
    external id) through a proxied ByRequest client, reports a 'worker'
    start event to the monitor and converts every element of the response
    with ``extract_info``.

    Args:
        params (dict): must contain 'external_id', 'ms_id' and 'store_id'.

    Returns:
        list: one ``extract_info`` result per element of the response;
        empty when the request fails or an unexpected error occurs.
    """
    dep_list = []
    br_stats = {}
    ws_id = None
    # Tracks whether the 'worker' start event was already sent, so the
    # error path does not report the same start twice.
    start_reported = False
    try:
        store_id = params['external_id']
        url = url_store.format(store_id)
        # Prepare request
        br = ByRequest(attempts=1)
        br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        logger.debug('[ByRequest] Requesting {}'.format(url))
        response = br.get(url, return_json=True)
        br_stats = br.stats
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        start_reported = True

        if response:
            # Add departments
            for dep in response:
                dep_list.append(extract_info(dep))

        else:
            err_st = 'Could not get response for {}'.format(url)
            logger.error(err_st)
            stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=0, reason=str(err_st))

    except Exception as e:
        err_st = "Unexpected error in get_store_deps: {}".format(e)
        if not start_reported:
            # The failure happened before the start event was sent; send it
            # now so the error event has a worker id to attach to.
            ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
        logger.debug(params)
    return dep_list
Exemplo n.º 2
0
def get_stores_from_coords(lat, lng, gral_data=None):
    """
    Fetch the stores available at a coordinate and stream their locations.

    The coordinates are normalised with ``frmt_coord``, the endpoint is
    requested through a proxied ByRequest client and, when the response is
    a list, the stores are extracted with ``extract_stores``. Every entry
    under a store's 'locations' key is converted with ``create_st_dict``,
    merged with ``gral_data`` and sent through ``stream_info``.

    Args:
        lat: latitude, passed through ``frmt_coord``.
        lng: longitude, passed through ``frmt_coord``.
        gral_data (dict, optional): extra key/values merged into every
            cleaned store before it is streamed. Defaults to no extra data.

    Returns:
        list: the stores returned by ``extract_stores``; empty when the
        response is not a list.
    """
    # Use None as the default instead of a shared mutable dict.
    if gral_data is None:
        gral_data = {}

    url_coord = "https://services.mxgrability.rappi.com/api/base-crack/principal?lat={}&lng={}&device=2"
    br = ByRequest(attempts=2)
    br.add_proxy(OXYLABS, attempts=5, name="Oxylabs")
    stores_ls = []
    lat = frmt_coord(lat)
    lng = frmt_coord(lng)
    url = url_coord.format(lat, lng)
    logger.debug('[ByRequest] Requesting {}'.format(url))
    resp = br.get(url, return_json=True)
    if isinstance(resp, list):
        logger.debug('Got response')
        stores_ls = extract_stores(resp)
    else:
        logger.error('Not a valid response, check if the site changed')
    for raw_st in stores_ls:
        try:
            for loc in raw_st.get('locations', []):
                clean_store = create_st_dict(loc)
                if isinstance(clean_store, dict):
                    clean_store.update(gral_data)
                    stream_info(clean_store)
        except Exception as ex:
            err_st = 'Error with store {}'.format(raw_st)
            # NOTE(review): `errors` is not defined in this function;
            # presumably a module-level list - confirm it exists.
            errors.append(MonitorException(code=3, reason=err_st))
            # Log the cause as well; it was previously discarded.
            logger.error('{} ({})'.format(err_st, ex))

    logger.info('Found {} stores'.format(len(stores_ls)))
    return stores_ls
Exemplo n.º 3
0
 def __init__(self):
     """Create the proxied HTTP client, the endpoint URLs and empty crawl state."""
     # HTTP client: one direct attempt, then the Oxylabs proxy with three.
     self.br = ByRequest(attempts=1)
     self.br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')

     # The token starts unset.
     self._access_token = None

     # API endpoints.
     self.url_auth = 'https://services.mxgrability.rappi.com/api/auth/guest_access_token'
     self.url_content = 'https://services.mxgrability.rappi.com/api/dynamic/context/content'

     # Public site URLs and templates.
     self.base_url = 'https://www.rappi.com.mx/'
     self.url_product = 'https://www.rappi.com.mx/product/{}'
     self.url_image = 'https://images.rappi.com.mx/products/{}'

     # Containers and counter start empty / at zero.
     self.dep_list, self.cat_list, self.product_list = [], [], []
     self.total_products = 0
0
# -*- coding: utf-8 -*-
import datetime
from pprint import pprint
from worker import app
import pandas as pd

from ByHelpers import applogger
from ByHelpers.rabbit_engine import (MonitorException, stream_info,
                                     stream_monitor)
from ByRequests.ByRequests import ByRequest

from config import OXYLABS, SRV_GEOLOCATION, CELERY_QUEUE

# Module-level logger obtained from the ByHelpers applogger helper.
logger = applogger.get_logger()

# Module-level ByRequest client created with attempts=2; the OXYLABS proxy
# is registered on it with attempts=5 under the name "Oxylabs".
br = ByRequest(attempts=2)
br.add_proxy(OXYLABS, attempts=5, name="Oxylabs")

stores_dict = {
    'la_comer': {
        'name': 'La Comer',
        'key': 'rappi_la_comer'
    },
    'costco': {
        'name': 'Costco',
        'key': 'rappi_costco'
    },
    'chedraui': {
        'name': 'Chedraui',
        'key': 'rappi_chedraui'
    },
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
import json
import os
import random
import sys

from ByHelpers import applogger
from ByRequests.ByRequests import ByRequest
from ByHelpers.rabbit_engine import stream_monitor

from config import *
from worker import crawl_store, start_stores, logger

# Module-level ByRequest client created with attempts=2 (no proxy is
# registered on it in this section).
br = ByRequest(attempts=2)

# URLs
# Geolocation service host, built from the SRV_GEOLOCATION config value.
geoloc_host = 'http://' + str(SRV_GEOLOCATION)
# Store-by-retailer endpoint; the %s placeholder is presumably filled with a
# retailer key - verify against the caller.
stores_endp_url = geoloc_host + '/store/retailer?key=%s'

# Variables
retailer_key = 'rappi'
retailer_name = 'Rappi'

# Retailer keys listed for processing (usage is outside this section).
retailers_to_get = [
    'rappi_farmazone', 'rappi_farmacias_similares', 'rappi_benavides',
    'rappi_farmaciasguadalajara'
]


def call_scraper(params, ms_id):
    """ Call to crawl async elements
Exemplo n.º 6
0
class StoreCrawler():
    def __init__(self):
        self.br = ByRequest(attempts=1)
        self.br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        self._access_token = None
        self.url_auth = 'https://services.mxgrability.rappi.com/api/auth/guest_access_token'
        self.url_content = 'https://services.mxgrability.rappi.com/api/dynamic/context/content'
        self.base_url = 'https://www.rappi.com.mx/'
        self.url_image = 'https://images.rappi.com.mx/products/{}'
        self.url_product = 'https://www.rappi.com.mx/product/{}'
        self.dep_list = []
        self.cat_list = []
        self.product_list = []
        self.total_products = 0

    def perform_request(self, url, headers={}, json={}, method='GET', return_json=True, require_auth=False):
        logger.debug('[ByRequest] Requesting {}'.format(url))
        proxies = {
            "http": OXYLABS,
            "https": OXYLABS,
        }
        global_headers = self.br.headers
        global_headers.update(headers)
        if require_auth:
            global_headers.update({'Authorization': f'Bearer {self.access_token}'})
        response = requests.request(method, url, headers=global_headers, json=json, proxies=proxies)
        # response = self.br.request(method, url, headers=global_headers, json=json)        
        if response.status_code >= 300:
            if require_auth:
                self.get_auth()
                global_headers.update({'Authorization': f'Bearer {self.access_token}'})
            response = requests.request(method, url, headers=global_headers, json=json, proxies=proxies)
            # response = self.br.request(method, url, headers=global_headers, json=json)
        if response and return_json:
            try:
                response = response.json()
            except Exception as e:
                response = {}
        return response
    
    def get_auth(self):
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/json'
        }
        body = {
            "headers": {
                "normalizedNames": {},
                "lazyUpdate": None
            },
            "grant_type": "guest"
        }
        response = self.perform_request(self.url_auth, method='POST', headers=headers, json=body)
        if response:
            access_token = response.get('access_token')
            self._access_token = access_token
        return access_token

    @property
    def access_token(self):
        if not self._access_token:
            self.get_auth()
        return self._access_token

    def get_store_departments(self, params):
        """
            Method to extract the departments for given store
        """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        local_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {
                    "lat": str(params['coords']['lat']),
                    "lng": str(params['coords']['lng']),
                    "parent_store_type": None,
                    "store_type": None
                },
                "limit": 10,
                "offset": 0,
                "context": "store_home",
                "stores": [int(store_id)]
            }
            response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)

            if response:
                # Add departments
                for element in response['data']['components']:
                    if element.get('name') == 'aisles_icons_carousel':
                        for cat in element['resource'].get('aisle_icons', []):
                            self.dep_list.append(self.extract_info(cat))
                        resource_products = element['resource'].get('products', [])
                        local_products.extend(resource_products)

            else:
                err_st = 'Could not get department response for {}'.format(url)
                logger.error(err_st)
            self.product_list.extend(local_products) 
            logger.info('Found {} departments in {} [{}]'.format(len(self.dep_list), params['retailer_key'], store_id))       
        except Exception as e:
            err_st = "Unexpected error in get_store_departments: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return self.dep_list

    def get_store_categories(self, params):
        """
            Method to extract the categories for given store
        """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        local_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {},
                "limit": 100,
                "offset": 0,
                "context": "aisles_tree",
                "stores": [int(store_id)]
            }
            # body = {
            #     "state": {
            #         "aisle_id": "0",
            #         "parent_id": "0"
            #     },
            #     "limit": 10,
            #     "context": "sub_aisles",
            #     "stores": [int(store_id)]
            # }

            #for i in range(0, len(self.dep_list), 10):
            if True:
                #body["offset"] = i
                response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)

                if response:
                    # Add categories
                    for cat in response['data']['components']:
                        if cat.get('name') == 'aisles_tree':
                            self.cat_list.append(self.extract_info(cat['resource']))
                            resource_products = cat['resource'].get('products', [])
                            local_products.extend(resource_products)

                else:
                    err_st = 'Could not get categories response for store {} - {}'.format(params['retailer_key'], store_id)
                    logger.error(err_st)
                    logger.debug(pformat(body))

            self.product_list.extend(local_products)    
            logger.info('Found {} categories in {} [{}]'.format(len(self.cat_list), params['retailer_key'], store_id))       
        except Exception as e:
            err_st = "Unexpected error in get_store_categories: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return self.cat_list


    def get_category_products(self, category_dict, params):
        """
            Method to extract the products for given category
        """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        more_items = True
        offset = 0
        category_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {
                    "aisle_id": str(category_dict['id'])
                },
                "limit": 10,
                "context": "aisle_detail",
                "stores": [int(store_id)]
            }

            while more_items:
                local_products = []
                body["offset"] = offset
                response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)

                if response:
                    # Add products
                    for element in response['data']['components']:
                        if element.get('name') == 'aisle_detail':
                            resource_products = element['resource'].get('products')
                            local_products.extend(resource_products)

                else:
                    err_st = 'Could not get product response for category {} - {}'.format(category_dict['name'], category_dict['id'])
                    logger.error(err_st)
                    logger.debug(pformat(body))

                    # retry
                offset += 10

                if len(local_products) < 60:
                    more_items = False

                category_products.extend(local_products)
                
            logger.info('Found {} products in {} | {} [{}]'.format(len(category_products), category_dict['name'], params['retailer_key'], store_id)) 
            self.product_list.extend(category_products)

        except Exception as e:
            err_st = "Unexpected error in get_category_products: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return category_products


    def extract_info(self, raw_dep_dict):
        children = raw_dep_dict.get('categories', [])
        if children:
            children = [self.extract_info(child) for child in children]
        n_dict = {
            'name' : raw_dep_dict.get('name'),
            'id': raw_dep_dict.get('id'),
            'children': children,
            'children_count': raw_dep_dict.get('children_count', len(children)),
            'total_products': raw_dep_dict.get('quantity_products')
        }
        return n_dict

    def process_product(self, params, raw_prod):
        clean_product = {
            'route_key' : params['route_key'],
            'retailer' : params['retailer_key'],
            'name' : raw_prod.get('name'),
            'id' :  raw_prod.get('product_id'),
            'url' : self.url_product.format(raw_prod.get('id')),
            'gtin' : raw_prod.get('ean'),
            'date' : str(datetime.datetime.utcnow()),
            'description' : raw_prod.get('description'),
            'brand' : raw_prod.get('trademark'),
            'provider' : '',
            'ingredients' : [],
            'images' : [
                self.url_image.format(raw_prod.get('image'))
            ],
            'raw_attributes' : [
                {
                    'key'  : 'content',
                    'value': raw_prod.get('quantity'),
                    'unit' : raw_prod.get('unit_type')
                }
            ],
            'raw_ingredients' : '',
            'price' : float(raw_prod.get('price')) if raw_prod.get('price') is not None else None,
            'price_original' : float(raw_prod.get('real_price')) if raw_prod.get('real_price') is not None else None,
            'discount' : float(raw_prod.get('discount')) if raw_prod.get('discount') is not None else None,
            'promo' : '',
            'location' : {
                'store' : [
                    params['store_uuid']
                ]
            }
        }
        return clean_product
            
    def send_products(self, params):
        logger.info('Found {} products to send {} [{}]'.format(len(self.product_list), params['retailer_key'], params['external_id']))
        for product in self.product_list:
            clean_product = self.process_product(params, product)
            try:
                stream_info(clean_product)
                self.total_products += 1
            except Exception as e:
                err_str = 'Could not send product_id {}: {}'.format(clean_product.get('id'), e)
                logger.error(err_str)
        logger.info('Sent {} products {} [{}]'.format(self.total_products, params['retailer_key'], params['external_id']))
        

    def crawl_store(self, params):
        # self.get_store_departments(params)
        # time.sleep(1)
        self.get_store_categories(params)
        time.sleep(0.5)
        for category_dict in self.cat_list:
            for subcategory_dict in category_dict['children']:
                self.get_category_products(subcategory_dict, params)
            time.sleep(random.randint(2, 4))
        self.send_products(params)
        return self.total_products
Exemplo n.º 7
0
def crawl_cat(dep_name, scat, params, page=1, next_id=None, run_all=True):
    """
    Crawl one page of a sub-category, stream its products and report to the monitor.

    When the response carries a 'next_id' and ``run_all`` is set, the next
    page is scheduled asynchronously on CELERY_QUEUE.

    Args:
        dep_name (str): department name, used as the first category label.
        scat (dict): sub-category; needs 'id' and 'name'.
        params (dict): needs 'external_id', 'retailer_key', 'ms_id' and
            'store_id'.
        page (int): page number, used for logging and for the next call.
        next_id: pagination cursor appended to the URL when not None.
        run_all (bool): follow the pagination when True.

    Returns:
        list: the cleaned products of this page (possibly partial or empty
        when an error occurs).
    """
    br_stats = {}
    errors = []
    # Bound before the try block so the final return is always defined,
    # even when the request itself raises.
    prods_ls = []
    prod_raw_ls = []

    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')

    # Url creation
    url = url_cat.format(scat['id'], params['external_id'], LIMIT)
    if next_id is not None:
        url = url + '&next_id={}'.format(next_id)

    logger.debug('[ByRequest] Requesting {}'.format(url))

    try:
        response = br.get(url, return_json=True)
        br_stats = br.stats
        # From here on next_id refers to the cursor of the following page.
        next_id = None
        cat_ls = [dep_name, scat['name']]

        # Product list extraction
        if isinstance(response, dict):
            next_id = response.get('next_id')
            result = response.get('results', [])
            for res in result:
                prod_raw_ls.extend(res.get('products', []))
        else:
            err_st = 'Could not get response from {}'.format(url)
            logger.error(err_st)
            errors.append(MonitorException(code=0, reason=err_st))

        # Check if there are more products to crawl
        n_prod = len(prod_raw_ls)
        logger.info('Found {} products, page {} for {} | {}'.format(str(n_prod).ljust(3), str(page).ljust(2), params['retailer_key'], ' | '.join(cat_ls)))

        if (next_id is not None) and run_all:
            logger.debug('Found next page...')
            crawl_cat.apply_async(args=(dep_name, scat, params, page+1, next_id), queue=CELERY_QUEUE)
        for prod in prod_raw_ls:
            try:
                prod_clean = process_prod(prod, params)
                if prod_clean:
                    prod_clean.update({
                        'categories': cat_ls,
                    })
                    prods_ls.append(prod_clean)
                    stream_info(prod_clean)
                else:
                    err = 'Could not get product'
                    logger.error(err)
                    # Raised so the product is recorded by the handler below.
                    raise Exception(err)
            except Exception:
                err_st = 'Error with product: {}'.format(prod)
                logger.error(err_st)
                errors.append(MonitorException(code=2, reason=err_st))

        if errors:
            ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'])

    except Exception as e:
        err_st = "Unexpected error in crawl_cat: {}".format(e)
        ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
    return prods_ls