def ebay2generic(item, info):
    """Build the generic product document for one eBay feed row and fetch its image.

    Args:
        item: raw feed row; read keys are "IMAGE_URL",
            "OFFER_URL_MIN_CATEGORY_BID", "OFFER_TITLE", "OFFER_DESCRIPTION",
            "MANUFACTURER" and "MERCHANT_NAME". The whole row is also stored
            under "ebay_raw".
        info: reduced item info; read keys are "id", "categories", "status",
            "price" and "gender".

    Returns:
        An ``(image, generic)`` tuple. ``image`` is whatever
        ``Utils.get_cv2_img_array`` returned and ``generic`` is the document
        including its "img_hash". On any failure (missing key, image could
        not be fetched, hashing error) the result is ``(None, None)``.
    """
    try:
        full_img_url = item["IMAGE_URL"]
        generic = {
            "id": [info["id"]],
            "categories": info["categories"],
            "clickUrl": item["OFFER_URL_MIN_CATEGORY_BID"],
            "images": {"XLarge": full_img_url},
            "status": info["status"],
            "shortDescription": item["OFFER_TITLE"],
            "longDescription": item["OFFER_DESCRIPTION"],
            "price": info["price"],
            "Brand": item["MANUFACTURER"],
            "Site": item["MERCHANT_NAME"],
            "download_data": {
                'dl_version': today_date,
                'first_dl': today_date,
                'fp_version': constants.fingerprint_version,
            },
            "fingerprint": None,
            "gender": info["gender"],
            "ebay_raw": item,
        }

        image = Utils.get_cv2_img_array(full_img_url)
        if image is None:
            # Second attempt without the URL scheme. The scheme is only
            # stripped when it really is the prefix, so a URL that merely
            # contains "https://" somewhere else is not chopped at a random
            # offset.
            for scheme in ('https://', 'http://'):
                if full_img_url.startswith(scheme):
                    image = Utils.get_cv2_img_array(
                        full_img_url[len(scheme):])
                    break
        if image is None:
            return None, None

        generic["img_hash"] = get_hash(image)
    except Exception:
        # Best effort: a malformed row must not abort the whole feed. Dump
        # the offending row so it can be inspected, and report failure.
        print(item)
        return None, None
    return image, generic
def ebay_downloader(filename, filesize):
    """Download one gzipped eBay feed file over FTP and sync it into the DB.

    The feed is a tab-separated table; only rows whose "CATEGORY_NAME" is
    "Clothing" are considered. For every relevant row the matching
    ``ebay_<gender>`` collection (and its ``_archive`` twin) is brought up to
    date, and rows that are new are queued for ``generate_mask_and_insert``.
    Finally the per-store record in ``db.ebay_download_info`` is updated.

    Args:
        filename: name of the feed file on the FTP server. ``filename[:-7]``
            is used as the store id in ``db.ebay_download_info``.
        filesize: passed to ``startORstall``; when that returns a falsy value
            the job re-enqueues itself and returns.

    Returns:
        None.
    """
    if not startORstall(filesize):
        # Not allowed to start now: put the job back on the queue and idle.
        q.enqueue(ebay_downloader, args=(filename, filesize), timeout=5400)
        sleep(150)
        return

    ftp = ebay_dl_utils.ftp_connection(ebay_dl_utils.us_params)
    start = time()
    sio = StringIO()
    gc.collect()

    def handle_binary(more_data):
        sio.write(more_data)

    retr_cmd = 'RETR ' + filename
    try:
        try:
            ftp.retrbinary(retr_cmd, callback=handle_binary)
        except Exception:
            # Drop whatever the failed transfer already wrote, otherwise the
            # retry would append to it and produce a corrupt gzip stream.
            sio.seek(0)
            sio.truncate()
            try:
                ftp = ebay_dl_utils.ftp_connection(ebay_dl_utils.us_params)
                ftp.retrbinary(retr_cmd, callback=handle_binary)
            except Exception:
                return
    finally:
        # The connection is not needed once the file is in memory; closing
        # it here avoids holding (or leaking) it during the long DB sync.
        ftp.quit()

    sio.seek(0)
    unzipped = gzip.GzipFile(fileobj=sio).read()

    # Each row becomes a dict keyed by the column names of the header row.
    items = csv.DictReader(unzipped.splitlines(), delimiter='\t')
    itemCount = 0
    new_items = 0
    stall = 0
    item = None
    for item in items:
        # verify right category
        if item["CATEGORY_NAME"] != "Clothing":
            continue

        minimal_info = getImportantInfoOnly(item)
        if not minimal_info["categories"]:
            continue

        collection_name = "ebay_" + minimal_info["gender"]
        # Derived for every item: the hash lookup further down needs it on
        # all paths, not only on the one that consults the archive first.
        archive = collection_name + "_archive"
        if minimal_info["categories"] == "t-shirt":
            # T-shirts are skipped; the former handling that stored them in
            # a separate "ebay_Tees" collection is disabled.
            continue

        itemCount += 1
        print(itemCount)

        collection = db[collection_name]
        archive_collection = db[archive]
        new_status = minimal_info["status"]
        id_query = {'id': {'$in': [minimal_info['id']]}}
        hashed_id_query = {
            'id': {'$in': [minimal_info['id']]},
            "img_hash": {"$exists": 1},
        }
        exists = collection.find_one(id_query)
        existsPlusHash = collection.find_one(hashed_id_query)

        if exists and existsPlusHash and exists["fingerprint"] is not None:
            # Known, fully processed item: refresh it at most once per day.
            if exists['download_data']['dl_version'] != today_date:
                collection.update_one(
                    {'_id': exists['_id']},
                    {"$set": {
                        "download_data.dl_version": today_date,
                        "price": minimal_info["price"],
                    }})
                was_instock = exists["status"]["instock"]
                now_instock = new_status["instock"]
                if was_instock != now_instock:
                    collection.update_one(
                        {'_id': exists['_id']},
                        {"$set": {"status": new_status}})
                elif was_instock is False and now_instock is False:
                    collection.update_one(
                        {'_id': exists['_id']},
                        {"$inc": {"status.days_out": 1}})
            # NOTE(review): nesting was reconstructed from a flattened
            # source; this assumes an up-to-date item is not re-downloaded.
            continue

        if exists and existsPlusHash:
            # Has a hash but no fingerprint: drop it and process it afresh.
            collection.delete_many({'_id': exists['_id']})
        elif exists:
            # Stored without an image hash: try to add the hash in place.
            image = Utils.get_cv2_img_array(item["IMAGE_URL"])
            if image is not None:
                collection.update_one(
                    {'_id': exists['_id']},
                    {"$set": {"img_hash": get_hash(image)}})
                continue
            # NOTE(review): assumes the original fell through to a full
            # re-download after this delete — confirm against history.
            collection.delete_many({'id': exists['id']})
        else:
            # Unknown in the live collection: check the archive for it.
            existsInArchive = archive_collection.find_one(hashed_id_query)
            if existsInArchive and existsInArchive["fingerprint"] is not None:
                existsInArchive["download_data"]["dl_version"] = today_date
                existsInArchive["price"] = minimal_info["price"]
                if new_status["instock"]:
                    # Back in stock: move it from the archive to live.
                    existsInArchive["status"] = new_status
                    archive_collection.delete_one(
                        {'_id': existsInArchive['_id']})
                    collection.insert_one(existsInArchive)
                else:
                    archive_collection.update_one(
                        {"_id": existsInArchive['_id']},
                        {"$set": {"download_data.dl_version": today_date}})
                continue
            elif existsInArchive:
                # Archived without a fingerprint: discard and rebuild.
                archive_collection.delete_one({"_id": existsInArchive['_id']})

        # Back-pressure: wait while the work queue is saturated. The time
        # spent here is subtracted from the reported download duration.
        while q.count > 250000:
            print("Q full - stolling")
            sleep(600)
            stall += 1

        img, generic_dict = ebay2generic(item, minimal_info)
        if generic_dict is None or img is None:
            print(
                'img download failed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
            )
            continue

        # The same image may already be stored under a different offer id.
        hash_query = {'img_hash': generic_dict['img_hash']}
        hashexists = collection.find_one(hash_query)
        hashexistsInArchive = archive_collection.find_one(hash_query)
        if hashexists:
            id_list = hashexists['id'] + generic_dict['id']
            collection.update_one(
                {'_id': hashexists['_id']},
                {'$set': {'id': id_list}})
            print('hash exists')
        elif hashexistsInArchive:
            id_list = hashexistsInArchive['id'] + generic_dict['id']
            archive_collection.update_one(
                {'_id': hashexistsInArchive['_id']},
                {'$set': {'id': id_list}})
            # Keep the local copy in sync so that the merged id list
            # survives when the document is moved back to live below.
            hashexistsInArchive['id'] = id_list
            if new_status["instock"]:
                print('hash exists in archive and is now instock')
                hashexistsInArchive["status"] = new_status
                archive_collection.delete_one(
                    {'_id': hashexistsInArchive['_id']})
                collection.insert_one(hashexistsInArchive)
            else:
                print('hash exists in archive but out of stock')
        else:
            new_items += 1
            print('new item')
            q.enqueue(
                generate_mask_and_insert,
                doc=generic_dict,
                image_url=generic_dict["images"]["XLarge"],
                fp_date=today_date,
                coll=collection_name,
                img=img)
            # NOTE(review): placement (per new item vs. once after the
            # loop) could not be recovered from the flattened source.
            print(' new items = %d' % (new_items))

    stop = time()
    if itemCount < 1 and item is not None:
        print("%s = %s is not relevant!" % (filename, item["MERCHANT_NAME"]))
        return

    # `item` is still None only when the feed contained no rows at all.
    if item is None:
        BW = 'black'
        merchant = 'unknown'
    else:
        BW = 'white'
        merchant = item.get("MERCHANT_NAME")
    try:
        db.ebay_download_info.update_one(
            {'type': 'store', 'id': filename[:-7]},
            {"$set": {
                'dl_duration': stop - start - 600 * stall,
                'items_downloaded': itemCount,
                'B/W': BW,
            }})
    except Exception:
        print('%s not found in store info' % filename)
    else:
        print("%s (%s) potiential items for %s = %s" %
              (str(itemCount), str(new_items), merchant, filename))
    return