import hashlib
from datetime import datetime
from typing import Dict

# `extract_domain` and `extract_path` are project-local URL helpers.


def get_filename_from_url(url: str) -> Dict[str, str]:
    """Build the storage location for a crawled page body.

    The layout is ``md5(domain)/md5(url[domain:])`` for the folder and a
    date-stamped ``YYYY-MM-DD.body`` file name.
    """
    response: Dict[str, str] = dict()
    domain = extract_domain(url=url)
    path = extract_path(url=url)
    folder = hashlib.md5(domain.encode()).hexdigest()
    file = hashlib.md5(path.encode()).hexdigest()
    response.update({
        'folder': f'{folder}/{file}',
        'file': datetime.today().strftime("%Y-%m-%d") + '.body'
    })
    return response
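# Usage sketch (hypothetical URL; the hashes and date are illustrative only):
if __name__ == "__main__":
    location = get_filename_from_url('https://www.olx.com.br/anuncio/casa-123')
    print(location['folder'])  # '<md5(domain)>/<md5(path)>'
    print(location['file'])    # e.g. '2020-06-01.body'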
import boto3
from boto3.dynamodb import conditions
from typing import Dict

dynamodb = boto3.resource('dynamodb')


def get_xpaths(table: str, url: str) -> Dict[str, str]:
    """Fetch the XPath configuration for a URL's domain from DynamoDB.

    Queries the given table ``table`` through its ``domain-index`` GSI,
    using the domain extracted from ``url`` as the key.

    Args:
        table: The DynamoDB table name to get the XPath object from.
        url: The URL whose domain is used to make the query.

    Returns:
        The first item containing the XPath configuration from DynamoDB.
    """
    _table = dynamodb.Table(table)
    domain: str = extract_domain(url)
    response = _table.query(
        IndexName='domain-index',
        KeyConditionExpression=conditions.Key('domain').eq(domain)
    )
    # Raises IndexError if no configuration exists for the domain.
    return response['Items'][0]
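# Usage sketch (hypothetical table name and URL; assumes AWS credentials
# are configured and the 'domain-index' GSI exists on the table):
if __name__ == "__main__":
    mapping = get_xpaths(table='xpath-config',
                         url='https://www.olx.com.br/anuncio/casa-123')
    print(mapping.get('parser_title'))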
from base64 import b64decode
from datetime import datetime
from json import loads
from os import environ
from typing import Dict, Optional
from uuid import uuid4

# `DynamoUtils` and `extract_domain` are project-local helpers.


def run(event, context) -> Dict[str, str]:
    """Entry point of the price verifier.

    Receives a dict from the Scrape Spider with the crawled information
    and checks whether there is any price variation; if there is, it
    sends the variation to the Finisher along with the scrape
    information.

    The received dict may look like this:

        {
            'executionId': HASH,
            'dynamo': {
                'table': ...,
                ...
            }
        }
    """
    response: Dict[str, str] = {'id': event['executionId']}
    price_variation: Dict[str, str] = {'id': str(uuid4())}
    current_date: datetime = datetime.now()
    # The crawled content arrives as base64-encoded JSON.
    content: Dict[str, str] = loads(
        b64decode(event['dynamo']['content'].encode()).decode())
    price_verifier = DynamoUtils(environ['PRICE_VARIATION'])
    check: Optional[Dict[str, str]] = None
    check_objects = price_verifier.get({
        'index': 'url-index',
        'key': 'url',
        'value': content['url']
    })
    # Find the record flagged as the most recent check for this URL.
    for item in check_objects:
        if item['last_check'] == 'true':
            check = item
    if check:
        print(
            f"The following URL already exists on the database: [{content['url']}]"
        )
        if content['price'] == check['price']:
            print('The last checked price is the same as today.')
        else:
            price_variation.update({
                'year': current_date.strftime('%Y'),
                'month': current_date.strftime('%m'),
                'day': current_date.strftime('%d'),
                'last_check': 'true',
                'url': content['url'],
                'price': content['price'],
                'domain': check['domain']
            })
            # Flip the previous record's last_check flag on DynamoDB to
            # 'false' so only the new record counts as current.
            updater_response = price_verifier.update({
                'partition_key': 'id',
                'sort_key': 'url',
                'id': check['id'],
                'url': check['url'],
                'target': 'last_check',
                'value': 'false'
            })
            price_verifier.put(item=price_variation)
            print('Updater Response:', updater_response)
            print('New Price Item:', price_variation)
    else:
        price_variation.update({
            'year': current_date.strftime('%Y'),
            'month': current_date.strftime('%m'),
            'day': current_date.strftime('%d'),
            'last_check': 'true',
            'url': content['url'],
            'price': content['price'],
            'domain': extract_domain(content['url'])
        })
        price_verifier.put(item=price_variation)
        print('Added the URL to the Price Variation table:', price_variation)
    return response
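# Invocation sketch (hypothetical IDs and payload; the real event comes
# from the Scrape Spider, and running it requires the PRICE_VARIATION
# environment variable plus DynamoDB access):
if __name__ == "__main__":
    from base64 import b64encode
    from json import dumps

    crawled = {'url': 'https://www.olx.com.br/anuncio/casa-123',
               'price': '350000'}
    event = {
        'executionId': 'abc123',
        'dynamo': {
            'table': 'scrape-results',
            'content': b64encode(dumps(crawled).encode()).decode()
        }
    }
    print(run(event, context=None))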
def _get(self, content: str, url: str) -> dict:
    """Extract the listing data from the JSON embedded in the page head.

    Note:
        This is exclusive to OLX, since it is the only website that
        keeps the listing JSON inside the ``<head>`` tag.

    Args:
        content: The HTML ``<head>`` content.
        url: The URL the content was crawled from.

    Returns:
        A dict with the crawled content.
    """
    parser: Selector = Selector(text=content)
    item: Dict[str, str] = dict()
    # Names of the local variables collected into the final item below.
    items: List[str] = [
        'url', 'date', 'domain', 'title', 'category', 'price', 'body',
        'rooms', 'bathrooms', 'suites', 'garages', 'features', 'city',
        'zipcode', 'neighbourhood', 'address', 'latitude', 'longitude',
        'privative_area', 'total_area', 'ground_area', 'images'
    ]
    domain: str = extract_domain(url)
    date = datetime.today().strftime("%Y-%m-%d")
    # The embedded payload uses JavaScript literals, so they are mapped
    # to their Python equivalents before evaluation.
    head_json = eval(
        parser.xpath(self.mapping['parser_json']).extract_first().replace(
            'null', 'None').replace('true', 'True').replace(
                'false', 'False'))['ad']
    body: str = head_json['body']
    title: str = parser.xpath(self.mapping['parser_title']).extract_first()
    location: Dict[str, str] = head_json['location']
    city: str = location['municipality']
    zipcode: str = location['zipcode']
    neighbourhood: str = location['neighbourhood']
    address: str = location['address']
    latitude: str = location['mapLati']
    longitude: str = location['mapLong']
    privative_area: Optional[str] = None
    total_area: Optional[str] = None
    ground_area: Optional[str] = None
    price: Optional[str] = None
    if 'priceValue' in head_json:
        price = head_json['priceValue']
    rooms: Optional[str] = None
    garages: Optional[str] = None
    bathrooms: Optional[str] = None
    category: Optional[str] = None
    suites: Optional[str] = None
    features: list = list()
    for _property in head_json['properties']:
        if _property['name'] == 'price' and not price:
            price = _property['value']
        elif _property['name'] == 'rooms':
            rooms = _property['value']
        elif _property['name'] == 'garage_spaces':
            garages = _property['value']
        elif _property['name'] == 'bathrooms':
            bathrooms = _property['value']
        elif _property['name'] == 'category':
            category = _property['value']
        elif _property['name'] == 'size':
            if _property['label'] == 'Área útil':
                privative_area = _property['value']
            elif _property['label'] == 'Tamanho':
                ground_area = _property['value']
            elif _property['label'] == 'Área total':
                total_area = _property['value']
            else:
                # Fall back to the raw size value for unknown labels.
                total_area = _property['value']
        elif 'features' in _property['name']:
            for feature in _property['values']:
                features.append(feature['label'])
    images: list = list()
    for image in head_json['images']:
        try:
            images.append({
                'src': image['original'],
                'alt': image['originalAlt']
            })
        except KeyError:
            # Skip images missing either the src or the alt text.
            pass
    # Collect every local variable listed in ``items`` into the response.
    for variable in items:
        item[variable] = eval(variable)
    print('OLX head_json["ad"]', head_json)
    print('FINAL OLX ITEM', item)
    return item
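# A minimal sketch of a safer alternative, assuming the <head> payload is
# plain JSON: ``json.loads`` understands null/true/false natively, which
# removes both the string replacements and the ``eval`` above. The helper
# name is hypothetical, not part of this module.
import json


def _parse_head_json(raw: str) -> dict:
    """Parse the OLX <head> JSON payload without evaluating it as Python."""
    return json.loads(raw)['ad']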
def _get(self, content: str, url: str) -> dict:
    """Crawl the listing information from the given HTML ``content``
    using the XPath dict ``self.mapping``. It crawls fields such as the
    title, price and bedrooms.

    Note:
        It differs from the _crawl function since the crawled content
        and the dict structure are not the same.

    Args:
        content: The HTML content.
        url: The URL the content was crawled from.

    Returns:
        A dict with the crawled content.
    """
    parser: Selector = Selector(text=content)
    item: Dict[str, str] = dict()
    # Names of the local variables collected into the final item below.
    items: List[str] = [
        'url', 'date', 'domain', 'title', 'category', 'price', 'body',
        'rooms', 'bathrooms', 'suites', 'garages', 'features', 'city',
        'zipcode', 'neighbourhood', 'address', 'latitude', 'longitude',
        'privative_area', 'total_area', 'ground_area', 'images'
    ]
    city: Optional[str] = None
    zipcode: Optional[str] = None
    neighbourhood: Optional[str] = None
    address: Optional[str] = None
    latitude: Optional[str] = None
    longitude: Optional[str] = None
    body: str = parser.xpath(self.mapping['parser_body']).extract_first()
    title: str = parser.xpath(self.mapping['parser_title']).extract_first()
    category: str = parser.xpath(
        self.mapping['parser_category']).extract_first()
    price: str = parser.xpath(self.mapping['parser_price']).extract_first()
    rooms: str = parser.xpath(self.mapping['parser_rooms']).extract_first()
    suites: str = parser.xpath(
        self.mapping['parser_suites']).extract_first()
    garages: str = parser.xpath(
        self.mapping['parser_garages']).extract_first()
    bathrooms: str = parser.xpath(
        self.mapping['parser_bathrooms']).extract_first()
    privative_area: str = parser.xpath(
        self.mapping['parser_privative_area']).extract_first()
    total_area: str = parser.xpath(
        self.mapping['parser_total_area']).extract_first()
    ground_area: str = parser.xpath(
        self.mapping['parser_ground_area']).extract_first()
    location: str = parser.xpath(
        self.mapping['parser_location']).extract_first()
    features: List[str] = parser.xpath(
        self.mapping['parser_features']).extract()
    images_src: List[str] = parser.xpath(
        self.mapping['parser_images_src']).extract()
    images_alt: List[str] = parser.xpath(
        self.mapping['parser_images_alt']).extract()
    # Pairs src and alt by position; assumes both lists have equal length.
    images: List[Dict[str, str]] = [{
        'src': images_src[i],
        'alt': images_alt[i]
    } for i in range(len(images_src))]
    domain: str = extract_domain(url)
    date = datetime.today().strftime("%Y-%m-%d")
    # Geo-enabled domains expose latitude/longitude XPaths; otherwise the
    # plain location string is used as the address.
    if self.mapping['options_location_use_geo'] == 'true':
        latitude = parser.xpath(
            self.mapping['parser_location_latitude']).extract_first()
        longitude = parser.xpath(
            self.mapping['parser_location_longitude']).extract_first()
    elif self.mapping['options_location_use_geo'] == 'false':
        address = location
    # Collect every local variable listed in ``items`` into the response.
    for variable in items:
        item[variable] = eval(variable)
    return item
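# A minimal sketch of a safer image pairing, assuming the two XPath lists
# can differ in length: ``zip_longest`` pads missing alt texts with ''
# instead of raising IndexError the way positional indexing would. The
# helper name is hypothetical, not part of this module.
from itertools import zip_longest


def _pair_images(srcs: list, alts: list) -> list:
    """Pair each image src with its alt text, tolerating missing alts."""
    return [{'src': src, 'alt': alt}
            for src, alt in zip_longest(srcs, alts, fillvalue='')]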