import json
import os
import time

import cv2
import pandas as pd
import requests
from aylienapiclient import textapi
from selenium import webdriver
from urllib.request import urlretrieve

# Shared root for scraped images and cached menus.
script_dir = os.path.abspath(os.path.join(__file__, "../.."))


def scrape_doordash_images(doordash_code, foodie_id):
    print("DoordashID:", doordash_code)
    if doordash_code == "":
        return False

    # The embedded store JSON lives in the 9th <script> tag, quoted and
    # unicode-escaped; unescape it before parsing.
    soup = pull_doordash_html(doordash_code)
    clean_soup = soup.find_all('script')[8].get_text().split("\"")[1]
    clean_soup = clean_soup.replace("\\u0022", "\"").replace("\\u002D", "-")
    json_data = json.loads(clean_soup)

    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    try:
        os.makedirs(path)
    except OSError:
        pass

    # Stop words plus the restaurant's own name tokens never count as matches.
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    json_menu = extract_json(foodie_id)
    menu_items = save_locally(json_menu, foodie_id)

    foodie_ids = []
    items = []
    filenames = []
    matches = []
    source_ids = []
    n = 0
    for category in json_data['current_menu']['menu_categories']:
        title = category['title']
        cat_items = category['items']
        for item in cat_items:
            image_name = item['name']
            image_url = item['image_url']
            if image_url is None:
                continue
            matched_items = items_in_sentence(image_name, menu_items, 2,
                                              foodie_id, exceptions)
            if len(matched_items) == 0:
                continue
            optimized_items = optimize_list(matched_items, image_name.lower())
            print(optimized_items)
            for optimized_item in optimized_items:
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(image_url, path + filename)
                foodie_ids.append(foodie_id)
                items.append(optimized_item)
                filenames.append(filename)
                matches.append(image_name)
                source_ids.append(doordash_code)
                n += 1

    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Matches': matches,
        'DoordashID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id + ".xlsx",
                sheet_name='Sheet1',
                encoding="utf8",
                index=False)
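# `pull_doordash_html` is called above but not defined in this file. The
# sketch below is a minimal stand-in, assuming the store page is reachable at
# the public /store/<id> URL; both the URL pattern and the headers are
# assumptions, not confirmed DoorDash behavior.
def pull_doordash_html(doordash_code):
    from bs4 import BeautifulSoup  # local import so the sketch is self-contained
    url = "https://www.doordash.com/store/" + doordash_code
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    return BeautifulSoup(response.text, 'html.parser')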
def match_sentence_item(sentence, menu_items, c):
    # ids / keys are parallel lists of AYLIEN Text API credentials, assumed
    # to be supplied by module config (not shown in this file).
    client = textapi.Client(ids[c], keys[c])
    neutral_words = ['got', 'shared', 'ordered', 'split', 'had']

    # remove sentence if it contains neutral-indicating words
    if remove_sentence(sentence, neutral_words):
        return [], c

    # remove sentence if it does not contain a menu item
    items_in_given_sentence = items_in_sentence(sentence, menu_items, 2)
    if len(items_in_given_sentence) == 0:
        return [], c

    # call API to gather sentiment analysis; rotate to the next credential
    # pair on failure, and back off when every client is rate-limited
    try:
        sentiment = client.Sentiment({'text': sentence})
    except Exception:
        print("changing client from: ", c)
        c = (c + 1) % 3
        print("to: ", c)
        client = textapi.Client(ids[c], keys[c])
        try:
            sentiment = client.Sentiment({'text': sentence})
        except Exception:
            print("Error: Too many subjectivity requests from API. Pausing for 5.")
            time.sleep(5)
            return [], c

    # remove sentence if it is objective or neutral
    if sentiment['subjectivity'] == 'objective':
        return [], c

    # add single item if list of items is one item long
    if len(items_in_given_sentence) == 1:
        return [next(iter(items_in_given_sentence))], c

    # otherwise add the most optimal item(s) out of the matched list
    optimized_items = optimize_list(items_in_given_sentence, sentence.lower())
    return optimized_items, c
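# Minimal sketch of the `remove_sentence` helper used above (its real
# implementation is not in this file): report whether the sentence contains
# any of the given neutral-indicating words, so the caller can discard it.
def remove_sentence(sentence, neutral_words):
    words = sentence.lower().split()
    return any(word in words for word in neutral_words)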
def run_postmates_image_scraper(postmates_code, foodie_id):
    # Opening proper webpage in a headless browser
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    wiki = "https://postmates.com/merchant/" + postmates_code
    driver.get(wiki)

    n = 0
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-postmates/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    items = []
    filenames = []
    matches = []

    elements = driver.find_elements_by_xpath(
        "//div[@class='product-container css-2ko7m4 e1tw3vxs3']")
    for element in elements:
        item_name = element.find_element_by_xpath(
            ".//h3[@class='product-name css-1yjxguc e1tw3vxs4']"
        ).get_attribute("innerText")
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        imgs = element.find_elements_by_xpath(
            ".//img[@class='css-1hyfx7x e1qfcze94']")
        for img in imgs:
            img_src = img.get_attribute("src")
            print(img_src)
            optimized_items = optimize_list(matched_items, item_name.lower())
            print(optimized_items)
            for item in optimized_items:
                # create the output directory lazily, on the first image
                if n == 0:
                    try:
                        os.makedirs(path)
                    except OSError:
                        pass
                filename = foodie_id + "-" + str(n) + ".jpg"
                # strip the webp format parameter so a jpg is served
                webp_finder = img_src.find('format=webp')
                if webp_finder != -1:
                    img_src = img_src[:webp_finder]
                print(img_src)
                save_img_url(img_src, path + filename)
                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)

    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
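# Minimal sketch of the `save_img_url` helper the Postmates scraper calls.
# Assumption: the CDN accepts a plain GET, so the bytes are streamed straight
# to disk. The User-Agent header is a guess, not a documented requirement.
def save_img_url(img_src, filepath):
    response = requests.get(img_src,
                            headers={'User-Agent': 'Mozilla/5.0'},
                            stream=True)
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)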
def analyze_yelp_images(images_data, foodie_id, yelp_id):
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    menu_items = save_locally(json_t, foodie_id)

    path = script_dir + "/csvfiles/images/" + foodie_id[:100] + \
        yelp_id.replace('/', '-') + "-images-yelp/"
    try:
        os.makedirs(path)
    except OSError:
        pass

    # initializing
    foodie_ids = []
    items = []
    filenames = []
    captions = []
    source_ids = []
    # yelp images are numbered starting at 100; the count scraped is
    # returned as n - 100
    n = 100
    for image_data in images_data:
        caption = image_data[0]
        link = image_data[1]

        # remove sentence if it does not contain a menu item
        items_in_given_sentence = items_in_sentence(caption, menu_items, 2,
                                                    foodie_id, exceptions)
        print(items_in_given_sentence)
        if len(items_in_given_sentence) == 0:
            continue

        # choose best item out of all matched items
        optimized_items = optimize_list(items_in_given_sentence,
                                        caption.lower())
        for item in optimized_items:
            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(link, path + filename)
            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            captions.append(caption)
            source_ids.append(yelp_id)
            n += 1
            print(n)

    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Captions': captions,
        'YelpID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id[:100] + yelp_id.replace('/', '-') + ".xlsx",
                sheet_name='Sheet1',
                encoding="utf8",
                index=False)
    return n - 100
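# Sketch of `items_in_sentence`, the matcher shared by every scraper here.
# Assumptions: `menu_items` is an iterable of item-name strings, and an item
# "matches" when it shares at least `threshold` non-excepted words with the
# sentence. The production scoring may differ; word overlap is a guess.
def items_in_sentence(sentence, menu_items, threshold, foodie_id=None,
                      exceptions=()):
    import re
    sentence_words = set(re.findall(r"[a-z']+", sentence.lower()))
    sentence_words -= set(exceptions)
    matched = set()
    for menu_item in menu_items:
        item_words = set(re.findall(r"[a-z']+", menu_item.lower()))
        item_words -= set(exceptions)
        overlap = len(sentence_words & item_words)
        # short item names can never reach a two-word threshold, so require
        # only that every remaining word of the item appears in the sentence
        if item_words and overlap >= min(threshold, len(item_words)):
            matched.add(menu_item)
    return matched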
def run_chowbusimagescraper(chowbus_id, foodie_id):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    wiki = "https://www.chowbus.com" + chowbus_id
    driver.get(wiki)

    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-chowbus/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    print(path)

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    elements = driver.find_elements_by_class_name('jss290')
    print(elements)
    print("Elements length", len(elements))
    for element in elements:
        # the jss290 container holds the dish name as its visible text
        item_name = element.get_attribute("innerText")
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        imgs = element.find_elements_by_class_name("jss326")
        for img in imgs:
            img_src = img.get_attribute("src")
            print(img_src)
            optimized_items = optimize_list(matched_items, item_name.lower())
            print("the length of list is: ", len(optimized_items))
            for item in optimized_items:
                if n == 0:
                    try:
                        os.makedirs(path)
                    except OSError:
                        pass
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(img_src, path + filename)
                print(filename)
                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)

    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added Chowbus Imgs'
    return 'No Chowbus Imgs Scraped'
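# Sketch of `optimize_list`: out of several matched menu items, keep the
# one(s) most similar to the scraped name. The real ranking is not shown in
# this file; difflib similarity is an assumed stand-in.
def optimize_list(matched_items, scraped_name):
    from difflib import SequenceMatcher
    scored = [(SequenceMatcher(None, item.lower(), scraped_name).ratio(), item)
              for item in matched_items]
    best = max(score for score, _ in scored)
    return [item for score, item in scored if score == best]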
def run_caviar_image_scraper(caviar_id, foodie_id):
    # Open browser in incognito
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    wiki = "https://www.trycaviar.com/" + caviar_id
    driver.get(wiki)

    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-caviar/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    # Collect links to every dish page, available or not
    dishes = driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link']")
    dishes = dishes + driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link offer-tile_link--unavailable']"
    )
    dish_links = []
    for dish in dishes:
        dish_link = dish.get_attribute("href")
        dish_links.append(dish_link)

    for dish_link in dish_links:
        driver.get(dish_link)
        item_name = driver.find_element_by_xpath(
            "//h1[@class='item_name']").text
        item_img_srcset = driver.find_elements_by_xpath(
            "//img[@class='item_image']")
        if item_img_srcset == []:
            continue
        print(item_name)
        print(item_img_srcset)
        # after splitting the srcset, the second-to-last token is the URL of
        # its last candidate image
        item_img_srcset = item_img_srcset[0].get_attribute("srcset").split()
        img_src = item_img_srcset[-2]
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        optimized_items = optimize_list(matched_items, item_name.lower())
        for item in optimized_items:
            if n == 0:
                try:
                    os.makedirs(path)
                except OSError:
                    pass
            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(img_src, path + filename)
            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            matches.append(item_name)
            n += 1
            print(n)

    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added Caviar Imgs'
    else:
        return 'No Caviar Imgs Scraped'
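# Sketch of the `extract_json` / `save_locally` pair every scraper relies on.
# The production versions query the menu database; here extract_json is
# assumed to read a local JSON export shaped like {"Items": [{"Item": ...}]},
# and save_locally to return the flat list of item names. Both the path and
# the record shape are assumptions.
def extract_json(foodie_id):
    json_path = script_dir + "/csvfiles/menus/" + foodie_id + ".json"
    try:
        with open(json_path) as f:
            return json.load(f)
    except (OSError, ValueError):
        return {'Items': []}


def save_locally(json_menu, foodie_id):
    return [entry['Item'] for entry in json_menu['Items']]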
def scrape_images(storeId, foodie_id):
    name = "doordash_menu"
    url = "https://api-consumer-client.doordash.com/graphql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Content-Type': 'application/json',
        'Credentials': 'include'
    }
    body = {
        "operationName": "menu",
        "variables": {
            # "storeId": "360",
            # "storeId": "2693"
            "storeId": storeId
            # "menuId": "223199"
        },
        "query": '''
            query menu($storeId: ID!, $menuId: ID) {
              storeInformation(storeId: $storeId) {
                id name description isGoodForGroupOrders offersPickup
                offersDelivery deliveryFee sosDeliveryFee numRatings
                averageRating shouldShowStoreLogo isConsumerSubscriptionEligible
                headerImgUrl coverImgUrl distanceFromConsumer
                providesExternalCourierTracking fulfillsOwnDeliveries
                isDeliverableToConsumerAddress priceRange
                business { id name __typename }
                address { street printableAddress lat lng city state __typename }
                status {
                  asapAvailable scheduledAvailable asapMinutesRange
                  asapPickupMinutesRange __typename
                }
                merchantPromotions {
                  id minimumOrderCartSubtotal newStoreCustomersOnly
                  deliveryFee __typename
                }
                storeDisclaimers {
                  id disclaimerDetailsLink disclaimerLinkSubstring
                  disclaimerText displayTreatment __typename
                }
                __typename
              }
              storeMenus(storeId: $storeId, menuId: $menuId) {
                allMenus { id name subtitle isBusinessEnabled timesOpen __typename }
                currentMenu {
                  id timesOpen hoursToOrderInAdvance isCatering minOrderSize
                  menuCategories {
                    ...StoreMenuCategoryFragment
                    items { ...StoreMenuListItemFragment __typename }
                    __typename
                  }
                  __typename
                }
                __typename
              }
              storeCrossLinks(storeId: $storeId) {
                trendingStores { ...StoreCrossLinkItemFragment __typename }
                trendingCategories { ...StoreCrossLinkItemFragment __typename }
                topCuisinesNearMe { ...StoreCrossLinkItemFragment __typename }
                nearbyCities { ...StoreCrossLinkItemFragment __typename }
                __typename
              }
            }

            fragment StoreMenuCategoryFragment on StoreMenuCategory {
              id subtitle title __typename
            }

            fragment StoreMenuListItemFragment on StoreMenuListItem {
              id description isTempDeactivated price imageUrl name __typename
            }

            fragment StoreCrossLinkItemFragment on StoreCrossLinkItem {
              name url __typename
            }
        '''
    }
    response = requests.post(
        url,
        cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'},
        data=json.dumps(body),
        headers=headers)
    print(response)
    allMenus = response.json()['data']['storeMenus']['allMenus']

    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)
    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0
    for menu in allMenus:
        menu_name = menu['subtitle']
        # re-query the API once per menu to pull that menu's categories
        body['variables']['menuId'] = menu['id']
        response = requests.post(
            url,
            cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'},
            data=json.dumps(body),
            headers=headers)
        for category in response.json()['data']['storeMenus']['currentMenu']['menuCategories']:
            for item in category['items']:
                if item['imageUrl']:
                    item_name = item['name']
                    img_url = item['imageUrl']
                    matched_items = items_in_sentence(item_name, menu_items,
                                                      2, foodie_id, exceptions)
                    if len(matched_items) == 0:
                        continue
                    optimized_items = optimize_list(matched_items,
                                                    item_name.lower())
                    for optimized_item in optimized_items:
                        if n == 0:
                            try:
                                os.makedirs(path)
                            except OSError:
                                pass
                        filename = foodie_id + "-" + str(n) + ".jpg"
                        urlretrieve(img_url, path + filename)
                        # only record images above a minimum resolution
                        img = cv2.imread(path + filename, cv2.IMREAD_UNCHANGED)
                        height, width = img.shape[0], img.shape[1]
                        if height > 300 and width > 450:
                            print(height, width)
                            foodie_ids.append(foodie_id)
                            items.append(optimized_item)
                            filenames.append(filename)
                            matches.append(item_name)
                            n += 1
                            print(n)

    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added Doordash Imgs'
    else:
        return 'Zero Doordash Imgs Scraped'
def pull_images(response_json, foodie_id):
    menu_items = {}
    menu_items['Food'] = []

    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-grubhub/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    print(response_json)
    categories = response_json['restaurant']['menu_category_list']
    for category in categories:
        dishes = category['menu_item_list']
        for dish in dishes:
            item_name = dish['name']
            if 'media_image' in dish:
                # rebuild the CDN URL from its base, public id, and format
                img_url = dish['media_image']['base_url'] + \
                    dish['media_image']['public_id'] + '.' + \
                    dish['media_image']['format']
                print(img_url)
                matched_items = items_in_sentence(item_name, menu_items, 2,
                                                  foodie_id, exceptions)
                if len(matched_items) == 0:
                    continue
                optimized_items = optimize_list(matched_items,
                                                item_name.lower())
                for item in optimized_items:
                    if n == 0:
                        try:
                            os.makedirs(path)
                        except OSError:
                            pass
                    filename = foodie_id + "-" + str(n) + ".jpg"
                    urlretrieve(img_url, path + filename)
                    foodie_ids.append(foodie_id)
                    items.append(item)
                    filenames.append(filename)
                    matches.append(item_name)
                    n += 1
                    print(n)

    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added GrubHub Imgs'
    return 'No GrubHub Imgs Scraped'
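# Usage sketch: each scraper above is a standalone function keyed by a
# platform-specific store id plus the internal FoodieID slug. The storeId
# '360' is taken from the commented-out example in scrape_images; the
# FoodieID below is a placeholder, not a real restaurant.
if __name__ == '__main__':
    print(scrape_images('360', 'example-restaurant'))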