def parse_single_price(path, rule):
    """
    for item page, to parse price information

    The raw bytes are decoded with the encoding guessed by chardet; when that
    guess fails with a UnicodeDecodeError, UTF-8 is tried instead.

    :param path: path of the raw data file
    :param rule: rule for parsing; the keys read here are 'selector',
        'filter', 'filter_re', 'filter_in_attr', 'filter_attr', 'children',
        'in_attr' and 'attr'
    :return: price (first element of the pair returned by price_formatter),
        or None when the file cannot be decoded, nothing matches the rule,
        or the configured attribute is absent on the matched element
    """
    with open(path, 'rb') as f:
        content = f.read()
    encoding = chardet.detect(content)['encoding']
    try:
        content = content.decode(encoding)
    except UnicodeDecodeError:
        try:
            content = content.decode('utf-8')
        except Exception as e:
            LOGGER.warning('url pattern: ' + str(e))
            return None
    except Exception as e:
        # Also reached when chardet could not guess an encoding (None).
        LOGGER.warning('url pattern: ' + str(e))
        return None

    def _passes_filter(index, this):
        """Tell whether one selected element matches rule['filter_re']."""
        element = pq(this)
        if rule['filter_in_attr']:
            candidate = element.attr(rule['filter_attr'])
        else:
            candidate = element.html()
        # attr()/html() yield None for a missing attribute or an empty node;
        # re.match would raise TypeError on that, so treat it as "no match".
        if candidate is None:
            return False
        return re.match(rule['filter_re'], candidate) is not None

    doc = pq(content)
    selection = doc(rule['selector'])
    if rule['filter']:
        selection = selection.filter(_passes_filter)
    if rule['children']:
        selection = selection.children()
    if not selection:
        return None

    if rule['in_attr']:
        raw_price = selection.attr(rule['attr'])
    else:
        raw_price = selection.text()
    # A missing attribute gives None; there is nothing to format in that case.
    if raw_price is None:
        return None
    price, _currency = price_formatter(raw_price)
    return price
def microdata_filter(site_id):
    """
    Look for schema.org Product microdata in the crawled page of a site and
    return the price of the first product that carries one.

    :param site_id: identifier appended to config.URL_CRAWLED_DATA_DIR to
        locate the raw data file
    :return: success_flag, price, currency, event_time; when the file is
        missing or no priced product is found: False, None, None, None
    """
    schema_product_type = 'http://schema.org/Product'
    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id)
    if not os.path.exists(data_file_path):
        return False, None, None, None

    with open(data_file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
        # detect() consumed the whole stream; rewind so that the microdata
        # parser reads the document instead of starting at end-of-file.
        f.seek(0)
        items = microdata.get_items(f, encoding)
    if not items:
        return False, None, None, None

    products = []
    for item in items:
        item = json.loads(item.json())
        # Items without an itemtype have no 'type' entry; only the first
        # declared type is compared, as before.
        item_types = item.get('type') or []
        offers = (item.get('properties') or {}).get('offers')
        if item_types[:1] != [schema_product_type] or not offers:
            continue

        product_price = None
        product_currency = None
        # Both lookups are best-effort: a malformed offer is reported and
        # leaves the corresponding value as None.
        try:
            product_price = offers[0].get('properties').get('price')[0]
        except Exception as e:
            print(e)
        try:
            product_currency = offers[0].get('properties').get(
                'priceCurrency')[0]
        except Exception as e:
            print(e)

        if product_price:
            products.append({
                'price': price_formatter(product_price)[0],
                'currency': product_currency,
            })

    if not products:
        return False, None, None, None
    product = products[0]
    event_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return True, product.get('price'), product.get('currency'), event_time
def _first_tag_pattern_price(matches, tag, containers):
    """
    Build the 'tag pattern' record for the first usable element of a match.

    :param matches: PyQuery selection produced by the selector of ``tag``
    :param tag: selector entry; its 'selector' and 'weight' are copied into
        the record
    :param containers: allowed tag names for the element holding the price
    :return: record dict, or None when no element yields a non-zero price
    """
    for index in range(len(matches)):
        node = matches.eq(index)
        # The tag name is taken from the serialized element's opening tag.
        opening = re.match(r'(<)(\w+)(\s*|/>)(\.*)', str(node))
        container = opening.group(2) if opening else None
        if not container or container not in containers:
            continue
        price, currency = price_formatter(node.text())
        if price and float(price) != 0:
            return {
                'method': 'tag pattern',
                'price': price,
                'currency': currency,
                'rule': {
                    'selector': tag.get('selector'),
                    'weight': tag.get('weight'),
                },
                'event_time': datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
            }
    return None


def parse_price(file, containers, selectors):
    """
    parse price

    Each selector is applied to the document; the first matched element
    whose tag name is in ``containers`` and whose text formats to a non-zero
    price produces one entry of the price list.

    :param file: raw data file path
    :param containers: containers in price_tag_selectors.json
    :param selectors: selectors in price_tag_selectors.json
    :return: tuple of three values:
        - True if error (file could not be decoded or parsed), else False
        - list of price information, [{'method': 'tag pattern', 'price',
          'currency', 'rule': {'selector', 'weight'}, 'event_time'}, ...]
          (None on error)
        - selectors failed to parse price (None on error)
    """
    with open(file, 'rb') as f:
        content = f.read()
    encoding = chardet.detect(content)['encoding']
    try:
        content = content.decode(encoding)
    except UnicodeDecodeError:
        # The guessed encoding was wrong; retry with UTF-8.
        try:
            content = content.decode('utf-8')
        except Exception as e:
            print('[FAIL] tag pattern:', e)
            LOGGER.warning('tag pattern: ' + str(e))
            return True, None, None
    except Exception as e:
        print('[FAIL] tag pattern:', e)
        LOGGER.warning('tag pattern: ' + str(e))
        return True, None, None

    try:
        doc = pq(content)
    except Exception as e:
        LOGGER.warning('tag pattern: ' + str(e))
        return True, None, None

    price_list = []
    selector_list = []
    for tag in selectors:
        record = _first_tag_pattern_price(
            doc(tag.get('selector')), tag, containers)
        if record is None:
            selector_list.append(tag)
        else:
            price_list.append(record)
    return False, price_list, selector_list
def _parse_list_field(item, field_rule):
    """
    Extract one field (description or price) of a single list entry.

    :param item: PyQuery object wrapping one list entry
    :param field_rule: sub-rule with the keys 'selector', 'filter',
        'filter_re', 'filter_in_attr', 'filter_attr', 'children', 'in_attr'
        and 'attr'
    :return: attribute value or text of the selected node, or None when
        nothing is selected
    """
    node = None
    # Every match overwrites the previous one, so the last match is used.
    for match in item.items(field_rule['selector']):
        node = match.eq(0)
    if not node:
        return None

    def _passes_filter(index, this):
        """Tell whether one element matches field_rule['filter_re']."""
        element = pq(this)
        if field_rule['filter_in_attr']:
            candidate = element.attr(field_rule['filter_attr'])
        else:
            candidate = element.html()
        # None (missing attribute / empty node) cannot be matched by re.
        if candidate is None:
            return False
        return re.match(field_rule['filter_re'], candidate) is not None

    if field_rule['filter']:
        node = node.filter(_passes_filter)
    if field_rule['children']:
        node = node.children()
    if not node:
        return None
    if field_rule['in_attr']:
        return node.attr(field_rule['attr'])
    return node.text()


def parse_list(path, rule):
    """
    for list page, to parse name and price information

    The description and the price of every entry are extracted with the same
    pipeline, each driven by its own sub-rule ('item_description' and
    'item_price'), so the description filter uses its own 'filter_in_attr'.

    :param path: path of the raw data file
    :param rule: rule for parsing; the keys read here are 'selector',
        'item_selector', 'item_description' and 'item_price'
    :return: ret, list of information [{'name', 'price'}, ...]; None when the
        file cannot be decoded or rule['selector'] matches nothing
    """
    with open(path, 'rb') as f:
        content = f.read()
    encoding = chardet.detect(content)['encoding']
    try:
        content = content.decode(encoding)
    except UnicodeDecodeError:
        # The guessed encoding was wrong; retry with UTF-8.
        try:
            content = content.decode('utf-8')
        except Exception as e:
            LOGGER.warning('url pattern: ' + str(e))
            return None
    except Exception as e:
        LOGGER.warning('url pattern: ' + str(e))
        return None

    doc = pq(content)
    if not doc(rule['selector']):
        return None

    ret = []
    for item in doc(rule['item_selector']).items():
        description = _parse_list_field(item, rule['item_description'])
        raw_price = _parse_list_field(item, rule['item_price'])
        price = None
        # Only format a value that was actually extracted.
        if raw_price is not None:
            price, _currency = price_formatter(raw_price)
        ret.append({'name': description, 'price': price})
    return ret
def _load_microdata_items(path):
    """Return the microdata items found in the raw HTML file at ``path``."""
    with open(path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
        # detect() consumed the whole stream; rewind so that the microdata
        # parser reads the document instead of starting at end-of-file.
        f.seek(0)
        return microdata.get_items(f, encoding)


def microdata_filter(site):
    """
    filter site contains microdata follows the schema in
    'http://schema.org/Product'

    A freshly crawled file ('<site_id>_new') is preferred: when it contains
    microdata it replaces the previous file, otherwise it is discarded (if a
    previous file exists) and the previous file is parsed instead.

    :param site: dict, {'site_id', 'product_site_id', 'url', 'product_name'}
    :return: success_flag, price, currency, event_time
    """
    success_flag = False
    if not check_site_by_id(site.get('site_id')):
        return success_flag, None, None, None

    schema_product_type = 'http://schema.org/Product'
    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site.get('site_id'))
    new_data_file_path = data_file_path + '_new'
    has_new = os.path.exists(new_data_file_path)
    has_old = os.path.exists(data_file_path)
    if not has_new and not has_old:
        LOGGER.warning('microdata: cannot crawl data from this url')
        return False, None, None, None

    items = _load_microdata_items(new_data_file_path) if has_new else None
    if items:
        # The new crawl is usable: it becomes the current data file.
        os.replace(new_data_file_path, data_file_path)
    else:
        if has_new and has_old:
            os.remove(new_data_file_path)
        if has_old:
            items = _load_microdata_items(data_file_path)

    products = []
    for item in items or []:
        item = json.loads(item.json())
        # Items without an itemtype have no 'type' entry; only the first
        # declared type is compared, as before.
        item_types = item.get('type') or []
        properties = item.get('properties') or {}
        offers = properties.get('offers')
        if item_types[:1] != [schema_product_type] or not offers:
            continue

        # NOTE(review): the flag is raised as soon as a Product with offers
        # is seen, even if no price can be read from it - confirm callers
        # expect (True, None, None, None) in that case.
        success_flag = True
        product_name = None
        product_price = None
        product_currency = None
        # The three lookups are best-effort: a missing or malformed property
        # is logged and leaves the corresponding value as None.
        try:
            product_name = properties.get('name')[0]
        except Exception as e:
            LOGGER.warning('microdata: ' + str(e))
        try:
            product_price = offers[0].get('properties').get('price')[0]
        except Exception as e:
            LOGGER.warning('microdata: ' + str(e))
        try:
            product_currency = offers[0].get('properties').get(
                'priceCurrency')[0]
        except Exception as e:
            LOGGER.warning('microdata: ' + str(e))

        if product_price:
            products.append({
                'name': product_name,
                'price': price_formatter(product_price)[0],
                'currency': product_currency,
            })

    if not products:
        LOGGER.info('[FAIL] microdata: not found')
        return success_flag, None, None, None
    if len(products) == 1:
        product = products[0]
    else:
        product = find_best_match(products, site.get('product_name'))

    LOGGER.info('[RESULT] microdata: ' + str(product.get('currency')) + ' ' +
                str(product.get('price')))
    return success_flag, product.get('price'), product.get('currency'), \
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')