def build_power_prediction_model():
    util.loginfo("==============================================================")
    # Duplicate "$and" keys in a Python dict literal silently overwrite each
    # other, so the engine filter was being dropped; merge both into one list.
    ads = get_ads_dataset(
        {
            "$and": [
                {"engine": {"$ne": "0"}},
                {"engine": {"$ne": 0}},
                {"power": {"$ne": 0}},
                {"power": {"$ne": "0"}},
            ]
        },
        ["model", "engine", "power"], "power")
    build_model(ads, "power")
    util.loginfo("======================== Done ===============================")
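# Note on the merged "$and" above (illustrative, standalone): Python keeps only
# the last value when a dict literal repeats a key, so the original
#     {"$and": [engine filters], "$and": [power filters]}
# silently discarded the engine filters before the query ever reached MongoDB:
#
#     >>> {"$and": [{"engine": {"$ne": 0}}], "$and": [{"power": {"$ne": 0}}]}
#     {'$and': [{'power': {'$ne': 0}}]}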
def add_list_to_db(item_list):
    db = get_db()
    try:
        # insert() is deprecated in pymongo 3.x; insert_many() is the modern
        # equivalent for a list of documents.
        db.products.insert_many(item_list)
        util.loginfo('ad(s) inserted successfully')
    except Exception as e:
        util.logerr(str(e))
def build_price_prediction_model():
    util.loginfo("==============================================================")
    ads = get_ads_dataset(
        {
            "body_type": {"$ne": ""},
            "fuel": {"$ne": ""},
            "price": {"$ne": ""},
            "color": {"$ne": ""},
            "year": {"$ne": ""},
            "mileage": {"$ne": ""}
        },
        [
            "brand", "model", "year", "body_type", "transmission",
            "location", "specs", "color", "mileage"
        ],
        "price")
    build_model(ads, "price")
    util.loginfo("======================== Done ===============================")
def classify(self, ad, is_raw_ad=False):
    votes = {
        'SAL': 0, 'PRT': 0, 'TRN': 0, 'SVC': 0, 'EXP': 0,
        'REQ': 0, 'ACC': 0, 'EXC': 0, 'OTH': 0, 'INV': 0
    }
    if is_raw_ad:
        # Build one free-text blob from the ad. For haraj.com.sa the last path
        # segment of the ad link carries extra signal, so append it too; the
        # conditional is parenthesized so that only this suffix (not the whole
        # blob) depends on the source.
        ad = {
            'ad_text': ad['description'] + " " + ad['title'] + " " +
                       ad['source'].replace('.com', '').replace('.sa', '').replace('.ksa', '') +
                       ((" " + ad['ad_page_link'][:-1].rsplit('/', 1)[-1])
                        if ad['source'] == 'haraj.com.sa' else "")
        }
        util.logdebug("-------------------------------------------------------------------------")
        util.logdebug(ad['ad_text'])
        util.logdebug("-------------------------------------------------------------------------")
    else:
        ad_txt = ''
        for w in ad:
            if ad[w] == True:
                ad_txt += ' ' + w
        print(ad_txt)
    for classifier_name, classifier in self.trained_classifiers:
        if is_raw_ad:
            vote = classifier.classify(
                self.generate_feature(ad, self.words_sets[classifier_name]))
        else:
            vote = classifier.classify(ad)
        if vote == '':
            continue
        util.logdebug(classifier_name + " voted: " + vote)
        votes[vote] += 1
    util.logdebug("final votes from all classifiers across all categories: " + str(votes))
    vote = max(votes, key=votes.get)
    confidence = float(votes[vote]) / float(len(self.trained_classifiers))
    util.loginfo("final vote is: " + vote)
    util.loginfo("confidence is: " + str(confidence))
    if is_raw_ad:
        return vote, confidence
    return vote
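    # Usage sketch (illustrative; field values are made up): a reader hands a
    # freshly scraped ad dict to classify() with is_raw_ad=True, which builds
    # the text blob itself and returns (vote, confidence); with an already
    # featurized ad it returns just the vote.
    #
    #     raw_ad = {
    #         'source': 'haraj.com.sa',
    #         'title': '...',
    #         'description': '...',
    #         'ad_page_link': 'https://haraj.com.sa/11912345/',
    #     }
    #     vote, confidence = classifier.classify(raw_ad, is_raw_ad=True)
    #     if vote == 'SAL' and confidence >= 0.3:
    #         pass  # treat as a car-for-sale ad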
def add_ad_to_db(item):
    db = get_db()
    try:
        db.products.insert_one(item)
        util.loginfo('ad inserted successfully')
    except Exception as e:
        util.logerr(str(e))
def deactivate(item):
    db = get_db()
    try:
        # update() is deprecated in pymongo 3.x; update_one() targets the
        # single matching ad.
        db.products.update_one(
            {'source': item.get('source'), 'ad_id': item.get('ad_id')},
            {'$set': {'active': False}})
        util.loginfo('ad deactivated successfully')
    except Exception as e:
        util.logerr(str(e))
def delete(item):
    db = get_db()
    try:
        db.products.delete_many(
            {'source': item.get('source'), 'ad_id': item.get('ad_id')})
        util.loginfo('ad deleted successfully')
    except Exception as e:
        util.logerr(str(e))
def get_models():
    db = get_db()
    models = []  # avoid a NameError on the return below if the query fails
    try:
        models = list(db.models.find({'active': True}))
        util.loginfo('models retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return models
def get_brands():
    db = get_db()
    brands = []  # avoid a NameError on the return below if the query fails
    try:
        brands = list(db.brands.find({'active': True}))
        util.loginfo('brands retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return brands
def get_ad_by_query(q):
    db = get_db()
    ads = []  # avoid a NameError on the return below if the query fails
    try:
        ads = list(db.products.find(q))
        util.loginfo('ads retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return ads
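def _demo_get_ad_by_query():
    """Illustrative only, never called (name is hypothetical): fetch all
    active ads from one source for a given model, using the document shape
    the readers store. Field values are made up."""
    return get_ad_by_query({
        'source': 'haraj.com.sa',
        'model': 'Camry',
        'active': True,
    })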
def get_ad():
    db = get_db()
    ad = None  # avoid a NameError on the return below if the query fails
    try:
        ad = db.products.find_one()
        util.loginfo('ad retrieved successfully')
    except Exception as e:
        util.logerr(str(e))
    return ad
def feature_selection(self, number_of_features_to_remove, raw_list_of_ads,
                      best_classifier, best_words_set, best_accuracy):
    # Greedy backward elimination on ad samples from the training database:
    # drop a random word-feature, retrain over several shuffled 70/30 splits,
    # and keep the removal only if the median accuracy improves.
    util.loginfo("---------------- feature selection ----------------")
    util.logdebug("accuracy trying to beat is " + str(best_accuracy))
    number_of_features_removed = 0
    counter = 0
    # The removal cap was hardcoded to 3 here while the parameter was only
    # checked at the bottom of the loop; use the parameter in both places.
    while (counter < len(best_words_set) - 1
           and number_of_features_removed < number_of_features_to_remove):
        removed = best_words_set.pop(random.randint(0, len(best_words_set) - 1))
        util.logdebug("feature trying to test its value is " + u''.join(removed))
        accuracy_ = []
        for i in range(0, 11):
            util.logdebug("shuffling for " + str(i))
            random.shuffle(raw_list_of_ads)
            raw_train_rows = int(math.floor(0.7 * len(raw_list_of_ads)))
            raw_train_set = raw_list_of_ads[:raw_train_rows]
            raw_test_set = raw_list_of_ads[raw_train_rows:]
            train_list_of_ads = [(self.generate_feature(ad, best_words_set), cat)
                                 for (ad, cat) in raw_train_set]
            test_list_of_ads = [(self.generate_feature(ad, best_words_set), cat)
                                for (ad, cat) in raw_test_set]
            new_classifier = best_classifier.train(train_list_of_ads)
            # Evaluate the freshly trained classifier, not the stale one.
            new_accuracy = nltk.classify.accuracy(new_classifier, test_list_of_ads)
            accuracy_.append(new_accuracy)
        if np.median(accuracy_) > best_accuracy:
            util.loginfo("---------- accuracies captured when removing " +
                         removed + " are " + str(accuracy_))
            util.loginfo("---------- new median accuracy " + str(np.median(accuracy_)))
            util.loginfo("---------- feature removed is " + u''.join(removed))
            best_accuracy = np.median(accuracy_)
            best_classifier = new_classifier
            number_of_features_removed += 1
        else:
            # The removal hurt; put the feature back.
            best_words_set.append(removed)
        counter += 1
    util.loginfo("accuracy : " + str(best_accuracy))
    return best_classifier, best_words_set
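    # Aside on the acceptance criterion (illustrative): a removal is kept only
    # when the median of the 11 shuffled-split accuracies beats the incumbent,
    # which damps the variance any single 70/30 split would introduce:
    #
    #     >>> import numpy as np
    #     >>> np.median([0.81, 0.84, 0.79])
    #     0.81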
def build_price_prediction_model():
    util.loginfo("==============================================================")
    ads = get_ads_dataset(
        {
            "year": {"$ne": 0},
            "mileage": {"$ne": ""}
        },
        ["brand", "model", "year", "mileage"], "price")
    return build_model(ads)
def update(items):
    db = get_db()
    for item in items:
        try:
            # update() is deprecated in pymongo 3.x; update_one() targets the
            # single matching ad.
            db.products.update_one(
                {
                    'source': item.get('source'),
                    'ad_id': item.get('ad_id')
                },
                {
                    '$set': {
                        'language_override': item.get('language_override'),
                        'ad_page_link': item.get('ad_page_link'),
                        'last_update': item.get('last_update'),
                        'title': item.get('title'),
                        'description': item.get('description'),
                        'brand': item.get('brand'),
                        'model': item.get('model'),
                        'year': item.get('year'),
                        'body_type': item.get('body_type'),
                        'mileage': item.get('mileage'),
                        'engine': item.get('engine'),
                        'power': item.get('power'),
                        'specs': item.get('specs'),
                        'transmission': item.get('transmission'),
                        'fuel': item.get('fuel'),
                        'condition': item.get('condition'),
                        'color': item.get('color'),
                        'price': item.get('price'),
                        'image_link': item.get('image_link'),
                        'location': item.get('location'),
                        'active': item.get('active'),
                        'keyFeatures': item.get('keyFeatures'),
                        'features': item.get('features'),
                        'tags': item.get('tags'),
                        'variants': item.get('variants'),
                        'ad_cat': item.get('ad_cat')
                    }
                })
            util.loginfo("ad(s) updated successfully")
        except Exception as e:
            util.logerr(str(e))
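def _demo_update_usage():
    """Illustrative only, never called (name is hypothetical): the readers pass
    fully built ad dicts to update(), and (source, ad_id) selects the document.
    Note that update() $set-s every known field via item.get(), so a partial
    dict like this one would overwrite the missing fields with None; real
    callers always pass complete ads. Field values are made up."""
    update([{
        'source': 'olx.sa.com',
        'ad_id': '123456',
        'price': '45,000',
        'active': True,
    }])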
def checkIfModelsExist(self):
    import os.path
    util.loginfo("-------------- checking if models exist --------------")
    model_dir = settings[settings['ENVIRONMENT']]['ML_FOLDER_PATH'] + 'classifier/model/'
    for classifier_name, classifier in self.classifiers_list:
        if not os.path.isfile(model_dir + classifier_name + '.pickle'):
            return False
        if not os.path.isfile(model_dir + 'words_set_' + classifier_name + '.txt'):
            return False
    if not os.path.isfile(model_dir + 'stop_words_set.txt'):
        return False
    return True
def __init__(self, classifier_list=None):
    # A mutable default argument would be shared across instances, so the
    # default ensemble is built inside the body instead.
    if classifier_list is None:
        classifier_list = [
            ('linear_svc_2', SklearnClassifier(LinearSVC())),
            ('log_reg_1_2', SklearnClassifier(LogisticRegression())),
            ('log_reg_2_2', SklearnClassifier(LogisticRegression())),
            ('log_reg_3_2', SklearnClassifier(LogisticRegression())),
            ('multinom_nb_1_2', SklearnClassifier(MultinomialNB())),
            ('multinom_nb_2_2', SklearnClassifier(MultinomialNB())),
            ('multinom_nb_3_2', SklearnClassifier(MultinomialNB())),
            ('linear_svc_3', SklearnClassifier(LinearSVC())),
            ('log_reg_1_3', SklearnClassifier(LogisticRegression())),
            ('log_reg_2_3', SklearnClassifier(LogisticRegression())),
            ('log_reg_3_3', SklearnClassifier(LogisticRegression())),
            ('multinom_nb_1_3', SklearnClassifier(MultinomialNB())),
            ('multinom_nb_2_3', SklearnClassifier(MultinomialNB())),
            ('multinom_nb_3_3', SklearnClassifier(MultinomialNB()))
        ]
    self.classifiers_list = classifier_list
    self.trained_classifiers = []
    self.words_sets = {}
    self.stop_words = []
    self.ad_cats = [
        'SAL', 'PRT', 'TRN', 'SVC', 'EXP', 'REQ',
        'ACC', 'EXC', 'OTH', 'INV'
    ]
    if self.checkIfModelsExist():
        util.loginfo("---------------------- reading models ------------------------")
        self.read_models()
    else:
        util.loginfo("-------------- models not there, start training --------------")
        self.train()
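    # Instantiation sketch (illustrative; the enclosing class name is assumed
    # from the "voteclassifier" log lines elsewhere in this codebase):
    # constructing the default 14-model ensemble either loads the pickled
    # models from ML_FOLDER_PATH or kicks off a full training run.
    #
    #     classifier = VoteClassifier()
    #     vote, confidence = classifier.classify(raw_ad, is_raw_ad=True)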
def build_specs_prediction_model():
    util.loginfo("==============================================================")
    # As in build_power_prediction_model, the duplicate "$and" keys silently
    # dropped the engine filter; merge both lists into a single "$and".
    ads = get_ads_dataset(
        {
            "body_type": {"$ne": ""},
            "fuel": {"$ne": ""},
            "specs": {"$ne": ""},
            "$and": [
                {"engine": {"$ne": "0"}},
                {"engine": {"$ne": 0}},
                {"power": {"$ne": 0}},
                {"power": {"$ne": "0"}},
            ]
        },
        [
            "brand", "model", "engine", "power", "body_type",
            "fuel", "price", "specs"
        ],
        "specs")
    build_model(ads, "specs")
    util.loginfo("======================== Done ===============================")
def update_lang():
    db = dao.get_db_production()
    products = db.products.find({'language_override': {'$exists': False}})
    for product in products:
        try:
            if detect(product['description']) == 'ar':
                language_override = 'arabic'  # langdetect returns ISO 639-1 codes ('ar'),
            else:
                language_override = 'english'  # while language_override stores the full name mongo expects
        except Exception:
            util.loginfo("Couldn't detect language, falling back to English, ad_id " +
                         str(product['ad_id']))
            language_override = 'english'
        brands = dao.get_brands()
        brand_name = None
        try:
            for brand in brands:
                if brand['_id'] == product['brand']:
                    if language_override == "arabic":
                        brand_name = brand.get("name_ar")
                    else:
                        brand_name = brand.get("name")
                    break
        except Exception:
            util.loginfo("Skipping one item, error parsing brand, ad_id " +
                         str(product['ad_id']))
            continue
        if brand_name is None:
            # The original tested the loop variable, which is never None after
            # a full pass; track whether a matching brand was found instead.
            util.loginfo("Skipping one item, brand is not found, ad_id " +
                         str(product['ad_id']))
            continue
        product['language_override'] = language_override
        product['tags'] = brand_name
        # collection.save() is deprecated; replace the whole document by _id.
        db.products.replace_one({'_id': product['_id']}, product)
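# Hedged aside (not part of the original migration): langdetect's detector is
# stochastic, so detect() can flip between runs on short or mixed-language
# descriptions. Seeding the factory once, before any detect() call, makes the
# migration deterministic:
#
#     from langdetect import DetectorFactory
#     DetectorFactory.seed = 0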
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "carmudi.com.sa"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_data = util.download_file(
            settings[SOURCE]['base_url'] +
            "/ar/api/web_listing?appliedFilter=%7B%22count%22:20,%22page%22:" +
            str(i) + "%7D",
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_data is None:
            continue
        if len(json.loads(page_data)['products']) == 0:
            continue
        for item in json.loads(page_data)['products']:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            # 1- ad_data_link, mandatory
            try:
                ad_data_link = ("/ar/api/product_details?lang=ar&product_id=" +
                                str(item['id']))
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_data_link")
                skipped += 1
                continue
            ad_data_content = util.download_file(
                settings[SOURCE]['base_url'] + ad_data_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
            if ad_data_content is None:
                util.loginfo("Skipping one item, error downloading ad_page, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            ad_json = json.loads(ad_data_content)['products']
            # 2- ad_update_date, mandatory: strip the English ordinal suffix
            # from the day before parsing, e.g. "21st Aug 2017" -> "21 Aug 2017"
            try:
                for c in ('th', 'nd', 'st', 'rd'):
                    ad_json['created_at'] = ad_json['created_at'].replace(c, '')
                ad_update_date = datetime.strptime(ad_json['created_at'], "%d %b %Y")
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 3- ad_id, mandatory
            try:
                ad_id = ad_json['id']
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_id, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 4- title, mandatory
            try:
                title = ad_json['make'] + " " + ad_json['model'] + " " + ad_json['year']
            except Exception:
                util.loginfo("Skipping one item, error parsing title, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 5- description, mandatory
            try:
                # strip all HTML tags
                description = BeautifulSoup(ad_json['description'], 'html.parser').text
            except Exception:
                util.loginfo("Skipping one item, error parsing body, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 6- brand, mandatory because the ad page link depends on it
            try:
                brand = util.find_brand(ad_json['make'], brands)
            except Exception:
                util.loginfo("Skipping one item, error parsing brand, ad_data_link " + ad_data_link)
                continue
            if brand is None:
                util.loginfo("Skipping one item, brand is not found, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 7- model, mandatory
            try:
                model, found = util.find_model(ad_json['model'].capitalize(), models)
            except Exception:
                util.loginfo("Skipping one item, error parsing model, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 8- year, mandatory
            try:
                year = int(util.toArabicNumerals(ad_json['year']))
            except Exception:
                util.loginfo("Skipping one item, error parsing year, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # Fields 9-17 are optional: fall back to an empty/zero default so
            # corrupt values can't sneak through to shofle_web.
            try:
                body_type = ad_json['body_type']  # 9- body_type
            except Exception:
                body_type = ""
            try:
                mileage = re.findall(r'\d+', ad_json['mileage'])[0]  # 10- mileage
            except Exception:
                mileage = 0
            try:
                engine = re.findall(r'\d+', ad_json['engine'])[0]  # 11- engine
            except Exception:
                engine = 0
            try:
                power = re.findall(r'\d+', ad_json['power'])[0]  # 12- power
            except Exception:
                power = 0
            try:
                specs = ad_json['specs']  # 13- specs [GCC, EU, etc.]
            except Exception:
                specs = ""
            try:
                transmission = ad_json['transmission']  # 14- transmission
            except Exception:
                transmission = ""
            try:
                fuel = ad_json['fuel']  # 15- fuel [gasoline/diesel]
            except Exception:
                fuel = ""
            try:
                condition = ad_json['condition']  # 16- condition [used/new]
            except Exception:
                condition = ""
            try:
                color = ad_json['color']  # 17- color
            except Exception:
                color = ""
            # 18- price, mandatory here: skip the ad if it cannot be parsed
            try:
                price = ad_json['price']
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable), price)
                ).strip('.. ').replace(',', ''))
            except Exception:
                util.loginfo("Skipping one item, error parsing price, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 19- image_link, mandatory: skip the ad if it is missing
            try:
                image_link = ("https://s3-us-west-2.amazonaws.com/carmudi-site/products/360/" +
                              ad_json['images'][0])
            except Exception:
                util.loginfo("Skipping one item, error parsing image_link, ad_data_link " + ad_data_link)
                skipped += 1
                continue
            # 20- location, optional
            try:
                location = ad_json['location'].capitalize()
            except Exception:
                location = ""
            # 21- ad_page_link
            ad_page_link = ("/ar/product-detail/" + str(year) + "-" +
                            brand['name'].lower() + "-" + model.lower() + "-in-" +
                            location.lower() + "-" + str(price) + "-" + str(ad_id))
            try:
                if detect(description) == 'ar':
                    language_override = 'arabic'  # langdetect returns ISO 639-1 codes ('ar'),
                else:
                    language_override = 'english'  # while language_override stores the full name mongo expects
            except Exception:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link " + ad_page_link)
                language_override = 'english'
            brand_name = ""
            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': settings[SOURCE]['base_url'] + ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],
                'features': [],
                'tags': brand_name,
                'variants': [{'image': image_link, 'price': price_numeric_value}]
            }
            # is_raw_ad=True means the classifier extracts features itself; an
            # already featurized ad has the form ([feat1, feat2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        ad_to_save_or_update['ad_cat'] = vote
                    else:
                        if brand == "":
                            del ad_to_save_or_update['brand']
                        ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_data_link " + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_data_link " + ad_page_link)
                skipped += 1
                continue
            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)
            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_data_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1
            else:
                util.loginfo("adding " + SOURCE + ad_data_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1
            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + "s before reading next item")
                sleep(wait)
    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "olx.sa.com"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(
            settings[SOURCE]['base_url'] + "/vehicles/cars/?page=" + str(i),
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue
        catalog_listing_items = BeautifulSoup(
            page_content, 'html.parser').find_all('div', class_="ads__item")
        # TODO add ad title, ad link at its source, update date, make, model, year, location, mileage, body type, seller type, options,
        # TODO add transmission, asked price, cash or installment, license validity, number of prev owners, main image, additional images, description
        # TODO decide which are mandatory and which are optional; when optional, decide on the default value to fill in
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                # TODO language should be configurable
                ad_page_link = item.find('a', class_="ads__item__title").attrs['href']
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue
            ad_page_content = util.download_file(
                ad_page_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
            if ad_page_content is None:
                continue
            try:
                ad_key_details = BeautifulSoup(
                    ad_page_content,
                    'html.parser').find_all('div', class_="clr offerbody")[0]
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_key_details, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                ad_update_date = ad_key_details.find_all(
                    'span', class_="pdingleft10 brlefte5")[0].next.strip().split(" ")
                ad_update_date = datetime.strptime(
                    ad_update_date[7][:-1] + "-" +
                    util.find_month(ad_update_date[6]) + "-" +
                    ad_update_date[5], "%Y-%m-%d")
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                ad_id = ad_key_details.find_all('span', class_="rel inlblk")[0].next.strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_id, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                title = BeautifulSoup(
                    item.find_all('a', class_="ads__item__title")[0].text.strip(),
                    'html.parser').text
            except Exception:
                util.loginfo("Skipping one item, error parsing title, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                # strip all HTML tags
                description = BeautifulSoup(
                    ad_key_details.find_all('p', class_="pding10 lheight20 large")[0].text.strip(),
                    'html.parser').text
            except Exception:
                util.loginfo("Skipping one item, error parsing body, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                # <p class="ads__item__breadcrumbs">سيارات » هيونداي</p>
                # e.g. https://olx.sa.com/ad/honda-civic-2004-ID6NhJW.html
                brand = util.find_brand(
                    item.find_all('p', class_="ads__item__breadcrumbs")[0].text.strip().split(' ')[2],
                    brands)
                # brand = util.find_brand(ad_page_link.strip().split('/')[4].split('-')[0], brands)
            except Exception:
                brand = ""
            if brand is None:
                brand = ""
            # TODO create a lookup for this
            found = False  # initialize so the check below can't hit a NameError
            try:
                model, found = util.find_model(
                    ad_key_details.find_all('td', class_="value")[0].text.strip(), models)
            except Exception:
                model = ""
            if found == False:
                model = ""
            try:
                year = int(util.toArabicNumerals(
                    ad_key_details.find_all('td', class_="value")[2].text.strip()))
            except Exception:
                year = 0
            try:
                mileage = ad_key_details.find_all('td', class_="value")[3].text.strip()
            except Exception:
                mileage = 0
            # Attributes olx does not expose; default them.
            # TODO some features are not offered by the source, so we need ML.
            # TODO suggested algo: find similar brand/model/year instances with
            # TODO known values and take the mode. Returning zero for now; if we
            # TODO show this value on the website we must indicate it is a
            # TODO predicted value that does not exist in the original ad.
            body_type = ""
            engine = 0
            # TODO find similar instances; if three are found, fetch power for
            # TODO this car, otherwise return zero
            power = 0
            specs = ""
            fuel = ""
            condition = ""
            color = ""
            try:
                transmission = ad_key_details.find_all('td', class_="value")[1].text.strip()
            except Exception:
                transmission = ""
            try:
                price = item.find('p', class_="ads__item__price price ").text.strip()
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable), price)
                ).strip('.. ').replace(',', ''))
            except Exception:
                price = 0
                price_numeric_value = 0
            try:
                image_link = item.find_all('img', class_="ads__item__photos")[0].attrs['src'].strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing image_link, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                location = item.find('p', class_='ads__item__location').text.strip()
            except Exception:
                location = ""
            try:
                if detect(description) == 'ar':
                    language_override = 'arabic'  # langdetect returns ISO 639-1 codes ('ar'),
                else:
                    language_override = 'english'  # while language_override stores the full name mongo expects
            except Exception:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link " + ad_page_link)
                language_override = 'english'
            brand_name = ""
            # TODO add a tags field to the ads; tags will take priority in search and will contain
            # TODO brand, model, color, engine, etc. -- all the features we want to search and
            # TODO categorize by; possibly make them lookups.
            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  # TODO remove this, not used
                'features': [],  # TODO remove this, not used
                'tags': brand_name,
                'variants': [{'image': image_link, 'price': price_numeric_value}]
            }
            # is_raw_ad=True means the classifier extracts features itself; an
            # already featurized ad has the form ([feat1, feat2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            util.loginfo("Skipping one item, SAL and brand is not found, ad_page_link " + ad_page_link)
                            skipped += 1
                            continue
                        # must have model
                        if ad_to_save_or_update['model'] == "":
                            util.loginfo("Skipping one item, SAL and model is not found, ad_page_link " + ad_page_link)
                            skipped += 1
                            continue
                        # must have year -- currently disabled
                        # if ad_to_save_or_update['year'] == 0:
                        #     util.loginfo("Skipping one item, SAL and year is not found, ad_page_link " + ad_page_link)
                        #     skipped += 1
                        #     continue
                    else:
                        if brand == "":
                            del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_page_link " + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)
            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1
            else:
                util.loginfo("adding " + SOURCE + ad_page_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1
            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + "s before reading next item")
                sleep(wait)
    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
def main(number_of_pages, debug, source, brands, models, classifier):
    log_file_name = settings[settings['ENVIRONMENT']]['LOG_FOLDER_PATH'] + str(date.today())
    util.loginfo("==============================================================")
    util.loginfo(" Reading from " + source + " .... # pages: " + str(number_of_pages) +
                 " debug: " + str(debug))
    util.logtofile(log_file_name,
                   "Reading from " + source + " .... # pages: " + str(number_of_pages) +
                   " debug: " + str(debug))
    util.loginfo("==============================================================")
    # TODO we need to check if reading basic data without description is enough to speed up update
    if source == SOURCE_CARMUDI:
        reader = reader_carmudi
    elif source == SOURCE_HATLA2EE:
        reader = reader_hatla2ee
    elif source == SOURCE_OLX:
        reader = reader_olx
    elif source == SOURCE_HARAJ:
        reader = reader_haraj
    count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads = reader.read(
        number_of_pages, debug, brands, models, classifier)
    util.loginfo("For " + source + " number of ads to add: " + str(count_of_ads_added))
    util.logtofile(log_file_name,
                   "For " + source + " number of ads to add: " + str(count_of_ads_added))
    util.loginfo("For " + source + " number of ads to update: " + str(count_of_ads_updated))
    util.logtofile(log_file_name,
                   "For " + source + " number of ads to update: " + str(count_of_ads_updated))
    util.logtofile(log_file_name,
                   "For " + source + " number of ads skipped: " + str(skipped))
    util.logtofile(log_file_name,
                   "For " + source + " total number of ads: " + str(counter_of_ads))
    util.loginfo("======================== Done ===============================")
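# A hedged refactoring sketch (not wired in): the if/elif chain above can be
# collapsed into a lookup table, which also raises a clear KeyError for an
# unknown source instead of an UnboundLocalError on `reader`:
#
#     READERS = {
#         SOURCE_CARMUDI: reader_carmudi,
#         SOURCE_HATLA2EE: reader_hatla2ee,
#         SOURCE_OLX: reader_olx,
#         SOURCE_HARAJ: reader_haraj,
#     }
#     reader = READERS[source]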
def extract_features(self):
    # read ad samples from the training database
    min_word_length = 2
    min_word_freq = 5
    no_training_rounds = 5
    all_stop_words = self.get_stopwords()
    raw_list_of_ads = self.read_data_from_db()
    # Build one text blob per labeled ad. For haraj.com.sa the last path
    # segment of the ad link carries extra signal, so append it too; the
    # conditional is parenthesized so that only this suffix (not the whole
    # blob) depends on the source.
    raw_list_of_ads = [
        ({
            'ad_text': ad['description'] + " " + ad['title'] + " " +
                       ad['source'].replace('.com', '').replace('.sa', '').replace('.ksa', '') +
                       ((" " + ad['ad_page_link'][:-1].rsplit('/', 1)[-1])
                        if ad['source'] == 'haraj.com.sa' else "")
        }, ad['ad_cat'])
        for ad in raw_list_of_ads
    ]
    for (classifier_name, classifier) in self.classifiers_list:
        best_accuracy = 0
        util.loginfo("----------------" + classifier_name + " training ----------------")
        all_words_set = []
        for i in range(0, len(self.words_sets[classifier_name])):
            all_words_set = self.words_sets[classifier_name].pop(i)
            random.shuffle(raw_list_of_ads)
            raw_train_rows = int(math.floor(0.7 * len(raw_list_of_ads)))
            raw_train_set = raw_list_of_ads[:raw_train_rows]
            raw_test_set = raw_list_of_ads[raw_train_rows:]
            train_list_of_ads = [(self.generate_feature(ad, all_words_set), cat)
                                 for (ad, cat) in raw_train_set]
            test_list_of_ads = [(self.generate_feature(ad, all_words_set), cat)
                                for (ad, cat) in raw_test_set]
            classifier = classifier.train(train_list_of_ads)
            accuracy = nltk.classify.accuracy(classifier, test_list_of_ads)
            if accuracy > best_accuracy:
                util.logdebug("found new best accuracy " + str(accuracy))
                best_accuracy = accuracy
                best_classifier = classifier
                best_words_set = all_words_set
                self.words_sets[classifier_name] = all_words_set
        util.loginfo("best accuracy : " + str(best_accuracy))
        self.save_classifier_model(best_classifier, classifier_name)
        self.trained_classifiers.append((classifier_name, best_classifier))
        self.save_word_set(best_words_set, classifier_name)
        self.words_sets[classifier_name] = best_words_set
        # self.save_stop_words_set(all_stop_words)
        # self.stop_words = all_stop_words
    util.loginfo("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    util.loginfo("voteclassifier accuracy is: " +
                 str(nltk.classify.accuracy(self, test_list_of_ads)))
    util.loginfo("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    util.logdebug("read models")
    self.read_models()
    util.loginfo("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    util.loginfo("voteclassifier accuracy is: " +
                 str(nltk.classify.accuracy(self, test_list_of_ads)))
    util.loginfo("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
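    # Aside on the data format (assumed from how classify() and the nltk calls
    # consume it): each training/test element is a (feature_dict, label) pair,
    # where the dict maps each word in the classifier's word set to True or
    # False depending on whether it occurs in the ad text, e.g.
    #
    #     ({u'كامري': True, 'gmc': False}, 'SAL')
    #
    # nltk.classify.accuracy(classifier, test_list_of_ads) then scores a
    # classifier against a list of such pairs.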
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "ksa.hatla2ee.com"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    for i in range(1, number_of_pages):
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(
            settings[SOURCE]['base_url'] + "/ar/car/page/" + str(i),
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue
        catalog_listing_items = BeautifulSoup(
            page_content, 'html.parser').find_all('div', class_="CarListUnit")
        # TODO add ad title, ad link at its source, update date, make, model, year, location, mileage, body type, seller type, options,
        # TODO add transmission, asked price, cash or installment, license validity, number of prev owners, main image, additional images, description
        # TODO decide which are mandatory and which are optional; when optional, decide on the default value to fill in
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                # TODO language should be configurable
                ad_page_link = item.find_all('a', class_="NewListTitle")[0].attrs['href'].strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue
            ad_page_content = util.download_file(
                settings[SOURCE]['base_url'] + ad_page_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
            if ad_page_content is None:
                skipped += 1
                continue
            try:
                ad_key_details = BeautifulSoup(
                    ad_page_content,
                    'html.parser').find_all('div', class_="nUnitKeyDetailsContent")
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_key_details, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                ad_update_date = datetime.strptime(
                    item.find_all('span', class_="NewListDate")[0].next.strip(),
                    "%Y-%m-%d")
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                ad_id = item.find_all('div', class_="favorit")[0].attrs['data-carid'].strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_id, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                title = BeautifulSoup(
                    item.find_all('a', class_="NewListTitle")[0].text.strip(),
                    'html.parser').text
            except Exception:
                util.loginfo("Skipping one item, error parsing title, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                desc_items = item.find_all('ul', class_="NewListSpecifications")[0].find_all('li')
                description = (desc_items[0].find_all('a')[0].text.strip() + ' '
                               + desc_items[1].find_all('a')[0].text.strip() + ' '
                               + desc_items[2].find_all('span')[0].text.strip() + ' '
                               + desc_items[3].find_all('a')[0].text.strip() + ' '
                               + desc_items[4].find_all('span')[0].text.strip())
                # strip all HTML tags
                description = BeautifulSoup(description, 'html.parser').text
            except Exception:
                util.loginfo("Skipping one item, error parsing body, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            # TODO configure
            try:
                # <a href="/ar/car/hyundai/elantra/1592640">
                brand = util.find_brand(
                    item.find_all('a')[0]['href'].strip().split('/')[3], brands)
            except Exception:
                brand = ""
            if brand is None:
                brand = ""
            found = False  # initialize so the check below can't hit a NameError
            try:
                model, found = util.find_model(
                    item.find_all('a')[0]['href'].strip().split('/')[4].capitalize(),
                    models)
            except Exception:
                model = ""
            if found == False:
                model = ""
            try:
                year = int(util.toArabicNumerals(
                    item.find_all('span', class_="muted")[0].text.strip()))
            except Exception:
                year = 0
            # TODO configure
            try:
                body_type = ad_key_details[0].find_all('strong', class_="nUnitItem")[8].next.strip()
            except Exception:
                body_type = ""
            # TODO configure
            try:
                mileage = ad_key_details[0].find_all('strong', class_="nUnitItem")[0].next.strip()
            except Exception:
                mileage = 0
            # TODO some features are not offered by hatla2ee, so we need ML.
            # TODO suggested algo: find similar brand/model/year instances with
            # TODO known power values and take the mode. Returning zero for now;
            # TODO if we show this value on the website we must indicate it is a
            # TODO predicted value that does not exist in the original ad.
            # TODO zero needs to be shown as -- on shofle_web
            engine = 0  # engine = predicter.query_engin_prediction_model()
            # TODO find similar instances; if three are found, fetch power for
            # TODO this car, otherwise return zero
            power = 0
            specs = ""
            condition = ""
            try:
                transmission = ad_key_details[0].find_all('strong', class_="nUnitItem")[6].next.strip()
            except Exception:
                transmission = ""
            try:
                fuel = ad_key_details[0].find_all('strong', class_="nUnitItem")[7].next.strip()
            except Exception:
                fuel = ""
            try:
                color = ad_key_details[0].find_all('strong', class_="nUnitItem")[4].next.strip()
            except Exception:
                color = ""
            try:
                price = item.find_all('a', class_="NewListPrice")[0].text.strip()
                price_numeric_value = int(''.join(
                    filter(lambda x: x in set(string.printable), price)
                ).strip('.. ').replace(',', ''))
            except Exception:
                price = 0
                price_numeric_value = 0
            try:
                image_link = item.find_all('img', class_="lazy imgfit")[0]['data-original'].strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing image_link, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                location = item.find(
                    'a', href=lambda href: href and "city" in href
                )['href'].strip().split('/')[4].capitalize()
            except Exception:
                location = ""
            try:
                if detect(description) == 'ar':
                    language_override = 'arabic'  # langdetect returns ISO 639-1 codes ('ar'),
                else:
                    language_override = 'english'  # while language_override stores the full name mongo expects
            except Exception:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link " + ad_page_link)
                language_override = 'english'
            brand_name = ""
            # TODO add a tags field to the ads; tags will take priority in search and will contain
            # TODO brand, model, color, engine, etc. -- all the features we want to search and
            # TODO categorize by; possibly make them lookups.
            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': settings[SOURCE]['base_url'] + ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  # TODO remove this, not used
                'features': [],  # TODO remove this, not used
                'tags': brand_name,
                'variants': [{'image': image_link, 'price': price_numeric_value}]
            }
            # is_raw_ad=True means the classifier extracts features itself; an
            # already featurized ad has the form ([feat1, feat2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            if vote != 'INV':
                if confidence >= 0.3:
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            util.loginfo("Skipping one item, SAL and brand is not found, ad_page_link " + ad_page_link)
                            skipped += 1
                            continue
                        # must have model
                        if ad_to_save_or_update['model'] == "":
                            util.loginfo("Skipping one item, SAL and model is not found, ad_page_link " + ad_page_link)
                            skipped += 1
                            continue
                        # must have year -- currently disabled
                        # if ad_to_save_or_update['year'] == 0:
                        #     util.loginfo("Skipping one item, SAL and year is not found, ad_page_link " + ad_page_link)
                        #     skipped += 1
                        #     continue
                        # estimated_price = price_predicter.query_price_prediction_model([ad_to_save_or_update])
                        # print("Predicted Engine value is", str(estimated_price))
                        # ad_to_save_or_update['engine'] = estimated_price[0]
                    else:
                        if brand == "":
                            del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_page_link " + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)
            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1
            else:
                util.loginfo("adding " + SOURCE + ad_page_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1
            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + "s before reading next item")
                sleep(wait)
    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads
def main():
    util.loginfo("==============================================================")
def read(number_of_pages, debug, brands, models, classifier):
    skipped = 0
    SOURCE = "haraj.com.sa"
    count_of_ads_added = 0
    count_of_ads_updated = 0
    counter_of_ads = 0
    for i in range(1, number_of_pages):
        util.loginfo("number of pages is " + str(number_of_pages))
        util.loginfo(">>Page #" + str(i))
        page_content = util.download_file(
            settings[SOURCE]['base_url'] +
            "/jsonGW/getadsx.php?link=tags/%D8%AD%D8%B1%D8%A7%D8%AC%20%D8%A7%D9%84%D8%B3%D9%8A%D8%A7%D8%B1%D8%A7%D8%AA/" +
            str(i) + "&_=1512689531762",
            settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
        if page_content is None:
            continue
        catalog_listing_items = BeautifulSoup(
            page_content, 'html.parser').find_all('div', class_="adx")
        for item in catalog_listing_items:
            counter_of_ads += 1
            util.loginfo(">Ad # " + str(counter_of_ads))
            try:
                ad_page_link = item.find_all('a')[0].attrs['href'].strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_page_link")
                skipped += 1
                continue
            ad_page_content = util.download_file(
                ad_page_link,
                settings[settings['ENVIRONMENT']][SOURCE]['DATA_FOLDER_PATH'])
            if ad_page_content is None:
                skipped += 1
                continue
            try:
                ad_page_content = BeautifulSoup(
                    ad_page_content, 'html.parser').find_all('div', class_="pageContent")[0]
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_key_details, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            # Haraj shows relative Arabic timestamps ("N hours/days ago");
            # map them onto concrete dates at midnight.
            try:
                time_text_ary = ad_page_content.find_all(
                    'div', class_="adxExtraInfoPart")[2].text.strip().split()
                if len(time_text_ary) == 3:
                    ad_update_date = datetime(datetime.today().year,
                                              datetime.today().month,
                                              datetime.today().day)
                elif len(time_text_ary) == 5 or len(time_text_ary) == 6:
                    if (time_text_ary[2].find(u'ساعه') != -1
                            or time_text_ary[1].find(u'ساعه') != -1
                            or time_text_ary[1].find(u'يوم') != -1):
                        ad_update_date = datetime(datetime.today().year,
                                                  datetime.today().month,
                                                  datetime.today().day)
                    elif time_text_ary[2].find(u'يوم') != -1:
                        ad_update_date = datetime.today() - timedelta(days=int(time_text_ary[1]))
                        # Calling .today() here would discard the subtraction;
                        # truncate the computed date itself to midnight.
                        ad_update_date = datetime(ad_update_date.year,
                                                  ad_update_date.month,
                                                  ad_update_date.day)
                    else:
                        ad_update_date = None
                else:
                    ad_update_date = None
                if ad_update_date is None:
                    util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link " + ad_page_link)
                    skipped += 1
                    continue
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_update_date, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                ad_id = ad_page_content.find_all('div', class_="adxExtraInfoPart")[3].text.strip().strip('#')
            except Exception:
                util.loginfo("Skipping one item, error parsing ad_id, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                title = ad_page_content.find_all('h3')[0].text.strip().split()[1]
            except Exception:
                util.loginfo("Skipping one item, error parsing title, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                # strip all HTML tags
                description = BeautifulSoup(
                    ad_page_content.find_all('div', class_="adxBody")[0].text,
                    'html.parser').text.strip()
            except Exception:
                util.loginfo("Skipping one item, error parsing body, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            # Scan the title + description word by word for a known brand/model.
            brand = None
            try:
                title_desc_text_ary = (title + " " + description).split()
                for w in title_desc_text_ary:
                    brand = util.find_brand(w, brands)
                    if brand is not None:
                        break
            except Exception:
                brand = ""
            if brand is None:
                brand = ""
            model = ""
            found = False
            try:
                for w in title_desc_text_ary:
                    model, found = util.find_model(w, models)
                    if found == True:
                        break
            except Exception:
                model = ""
            if found == False:
                model = ""
            try:
                year = int(ad_page_content.find_all('meta')[0].attrs['content'])
            except Exception:
                year = 0
            # Attributes haraj does not expose; default them.
            body_type = ""
            mileage = 0
            engine = 0
            power = 0
            specs = ""
            transmission = ""
            fuel = ""
            color = ""
            price = 0
            price_numeric_value = 0
            try:
                condition = ad_page_content.find_all('meta')[1].attrs['content']
            except Exception:
                condition = ""
            try:
                # $('.adxBody>img')[0]['src']
                image_link = ad_page_content.find('div', class_='adxBody').find_all('img')[0].attrs['src']
            except Exception:
                util.loginfo("Skipping one item, error parsing image_link, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            try:
                location = ad_page_content.find_all('div', class_="adxExtraInfoPart")[0].text.strip()
            except Exception:
                location = ""
            try:
                if detect(description) == 'ar':
                    language_override = 'arabic'  # langdetect returns ISO 639-1 codes ('ar'),
                else:
                    language_override = 'english'  # while language_override stores the full name mongo expects
            except Exception:
                util.loginfo("Couldn't detect language, falling back to English, ad_page_link " + ad_page_link)
                language_override = 'english'
            brand_name = ""
            # TODO add a tags field to the ads; tags will take priority in search and will contain
            # TODO brand, model, color, engine, etc. -- all the features we want to search and
            # TODO categorize by; possibly make them lookups.
            ad_to_save_or_update = {
                'source': SOURCE,
                'ad_id': ad_id,
                'language_override': language_override,
                'ad_page_link': ad_page_link,
                'last_update': ad_update_date,
                'title': title,
                'nameLower': title.lower(),
                'description': description,
                'brand': brand if brand == "" else ObjectId(brand.get('_id')),
                'model': model,
                'year': year,
                'body_type': body_type,
                'mileage': mileage,
                'engine': engine,
                'power': power,
                'specs': specs,
                'transmission': transmission,
                'fuel': fuel,
                'condition': condition,
                'color': color,
                'price': price,
                'image_link': image_link,
                'location': location,
                'active': True,
                'keyFeatures': [],  # TODO remove this, not used
                'features': [],  # TODO remove this, not used
                'tags': brand_name,
                'variants': [{'image': image_link, 'price': price_numeric_value}]
            }
            # is_raw_ad=True means the classifier extracts features itself; an
            # already featurized ad has the form ([feat1, feat2, ...], cat)
            vote, confidence = classifier.classify(ad_to_save_or_update, is_raw_ad=True)
            # In haraj (and all readers) brand is mandatory only for SAL. In
            # haraj, when the model is not matched, the original text value from
            # the ad is accepted, and "" is accepted as a model; all of this is
            # due to the large amount of noise in the data. TODO link brands to
            # models, collect the various ways models and brands are written in
            # both Arabic and English to improve matching, then reject "" as a
            # model. TODO make year a mandatory attribute for SAL in all readers.
            if vote != 'INV':
                if confidence >= 0.6:  # threshold is higher for haraj due to the noise rate
                    if vote == 'SAL':
                        # must have brand
                        if ad_to_save_or_update['brand'] == "":
                            if ad_to_save_or_update['model'] != "":
                                util.logdebug("Brand is empty and model is not; trying to resolve brand using model")
                                brand = find_brand_by_model(ad_to_save_or_update['model'], models)
                                if brand != "":
                                    util.logdebug("------------- >>> Found brand by model")
                                    ad_to_save_or_update['brand'] = brand
                                else:
                                    util.loginfo("Skipping one item, SAL and brand is not found, ad_page_link " + ad_page_link)
                                    skipped += 1
                                    continue
                            else:
                                util.loginfo("Skipping one item, SAL and brand is not found, ad_page_link " + ad_page_link)
                                skipped += 1
                                continue
                        # must have model -- currently disabled
                        # if ad_to_save_or_update['model'] == "":
                        #     util.loginfo("Skipping one item, SAL and model is not found, ad_page_link " + ad_page_link)
                        #     skipped += 1
                        #     continue
                        if ad_to_save_or_update['brand'] != "":
                            if ad_to_save_or_update['model'] != "":
                                if not brand_and_model_match(brand, model):
                                    util.loginfo("Skipping one item, model and brand don't match, ad_page_link " + ad_page_link)
                                    skipped += 1
                                    continue
                            else:
                                util.loginfo("Skipping one item, model and brand don't match, ad_page_link " + ad_page_link)
                                skipped += 1
                                continue
                        # engine = predicter.query_engin_prediction_model([ad_to_save_or_update])
                        # print("Predicted Engine value is", str(engine))
                    else:
                        if brand == "":
                            del ad_to_save_or_update['brand']
                    ad_to_save_or_update['ad_cat'] = vote
                else:
                    util.loginfo("Skipping one item, not confident classification, ad_page_link " + ad_page_link)
                    skipped += 1
                    continue
            else:
                util.loginfo("Skipping one item, classified INVALID, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            # must have year
            if ad_to_save_or_update['year'] == 0:
                util.loginfo("Skipping one item, SAL and year is not found, ad_page_link " + ad_page_link)
                skipped += 1
                continue
            ad_to_save_or_update['tags'] = calculate_tags(brand, model, ad_to_save_or_update, models)
            # TODO rename all products to ads
            if dao.product_exists_in_db({'source': SOURCE, 'ad_id': ad_id}):
                if dao.product_exists_in_db(ad_to_save_or_update):
                    util.loginfo("source " + SOURCE + " ad_id " + str(ad_id) + " is already in the database!")
                else:
                    util.loginfo("updating " + SOURCE + ad_page_link)
                    dao.update([ad_to_save_or_update])
                    count_of_ads_updated += 1
            else:
                util.loginfo("adding " + SOURCE + ad_page_link)
                dao.add_list_to_db([ad_to_save_or_update])
                count_of_ads_added += 1
            if settings['ENVIRONMENT'] == "Production":
                wait = randint(5, 20)
                util.loginfo("waiting for " + str(wait) + "s before reading next item")
                sleep(wait)
    return count_of_ads_added, count_of_ads_updated, skipped, counter_of_ads