Exemplo n.º 1
0
def scrape_doordash_images(doordash_code, foodie_id):
    """Download DoorDash item images whose names match the stored menu.

    Fetches the DoorDash page for ``doordash_code``, decodes the menu JSON
    embedded in one of its ``<script>`` tags, matches every pictured item
    against the menu stored for ``foodie_id`` and saves each matched image to
    ``<script_dir>/csvfiles/images/<foodie_id>-images-doordash/``. A
    spreadsheet ``<foodie_id>.xlsx`` describing the saved files is written to
    the same directory (even when no image matched).

    Args:
        doordash_code: Store identifier handed to ``pull_doordash_html``. An
            empty string aborts the scrape.
        foodie_id: Restaurant identifier. It is split on ``-`` and the parts
            are appended to the ``exceptions`` word list that is passed to
            ``items_in_sentence``.

    Returns:
        ``False`` when ``doordash_code`` is empty, otherwise ``None``.

    Note:
        ``script_dir`` is not defined inside this function; it is expected to
        exist at module level.
    """
    print("DoordashID:", doordash_code)
    if doordash_code == "":
        return False

    soup = pull_doordash_html(doordash_code)

    # The menu payload is taken from the ninth <script> tag: the text between
    # its first pair of double quotes is JSON whose quotes and hyphens are
    # unicode-escaped. NOTE(review): the fixed index is tied to the page
    # layout and will break if DoorDash reorders its scripts.
    clean_soup = soup.find_all('script')[8].get_text().split("\"")[1]
    clean_soup = clean_soup.replace("\\u0022", "\"").replace("\\u002D", "-")
    json_data = json.loads(clean_soup)

    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    # Only "already exists" is tolerated; other OS errors now surface here
    # instead of failing later inside urlretrieve/to_excel.
    os.makedirs(path, exist_ok=True)

    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    json_menu = extract_json(foodie_id)
    menu_items = save_locally(json_menu, foodie_id)

    # Result columns are local to this call so that rows from a previous
    # restaurant never end up in this restaurant's spreadsheet.
    foodie_ids = []
    items = []
    filenames = []
    matches = []
    source_ids = []
    n = 0

    for category in json_data['current_menu']['menu_categories']:
        for item in category['items']:
            image_name = item['name']
            image_url = item['image_url']
            if not image_url:
                # Item has no picture on DoorDash.
                continue

            matched_items = items_in_sentence(image_name, menu_items, 2,
                                              foodie_id, exceptions)
            if not matched_items:
                continue

            optimized_items = optimize_list(matched_items, image_name.lower())
            print(optimized_items)
            # One file (and one spreadsheet row) per matched menu item, even
            # though they all share the same source image.
            for optimized_item in optimized_items:
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(image_url, path + filename)
                foodie_ids.append(foodie_id)
                items.append(optimized_item)
                filenames.append(filename)
                matches.append(image_name)
                source_ids.append(doordash_code)
                n += 1

    df = pd.DataFrame({
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Matches': matches,
        'DoordashID': source_ids,
    })
    # The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
    df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1', index=False)
Exemplo n.º 2
0
def match_sentence_item(sentence, menu_items, c):
	"""Return the menu items a subjective sentence refers to.

	The sentence is discarded (empty result) when it contains one of the
	neutral verbs below, when it mentions no menu item, when the sentiment
	API cannot be reached with two successive credential pairs, or when the
	API classifies it as objective.

	Args:
		sentence: Review sentence to analyse.
		menu_items: Menu items forwarded to ``items_in_sentence``.
		c: Index into the module-level ``ids`` / ``keys`` credential lists
			used to build the ``textapi`` client.

	Returns:
		A ``(items, c)`` tuple. ``items`` is a list of matched menu items
		(possibly empty) and ``c`` is the credential index to use on the
		next call; it is advanced when the first API call fails.
	"""
	client = textapi.Client(ids[c], keys[c])
	neutral_words = ['got', 'shared', 'ordered', 'split', 'had']

	# Drop the sentence if it contains neutral-indicating words.
	if remove_sentence(sentence, neutral_words):
		return [], c

	# Drop the sentence if it does not mention a menu item.
	items_in_given_sentence = items_in_sentence(sentence, menu_items, 2)
	if len(items_in_given_sentence) == 0:
		return [], c

	# Ask the API for the sentiment; on failure rotate to the next
	# credential pair and retry once.
	try:
		sentiment = client.Sentiment({'text': sentence})
	except Exception:
		print("changing client from: ", c)
		# NOTE(review): assumes exactly three credential pairs in ids/keys.
		c = (c + 1) % 3
		print("to: ", c)
		client = textapi.Client(ids[c], keys[c])
		try:
			# The retry must send the same sentence; the previous code
			# referenced an undefined name here, so the retry always failed.
			sentiment = client.Sentiment({'text': sentence})
		except Exception:
			print("Error: Too many subjectivity requests from API. Pausing for 5.")
			time.sleep(5)
			return [], c

	# Drop the sentence if the API considers it objective.
	if sentiment['subjectivity'] == 'objective':
		return [], c

	# A single match needs no ranking.
	if len(items_in_given_sentence) == 1:
		return [next(iter(items_in_given_sentence))], c

	# Several matches: let optimize_list pick the best ones.
	optimized_items = optimize_list(items_in_given_sentence, sentence.lower())
	return optimized_items, c
Exemplo n.º 3
0
def run_postmates_image_scraper(postmates_code, foodie_id):
    """Scrape product images from a Postmates merchant page.

    Opens ``https://postmates.com/merchant/<postmates_code>`` in headless
    Chrome, matches every product name against the menu stored for
    ``foodie_id`` and saves the images of matched products to
    ``<script_dir>/csvfiles/images/<foodie_id>-images-postmates/``. When at
    least one image was saved, a spreadsheet ``<foodie_id>.xlsx`` listing the
    files is written to the same directory.

    Args:
        postmates_code: Merchant slug appended to the Postmates merchant URL.
        foodie_id: Restaurant identifier. It is split on ``-`` and the parts
            are appended to the ``exceptions`` word list that is passed to
            ``items_in_sentence``.

    Returns:
        None.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-postmates/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    foodie_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    # The browser is released in ``finally`` so a failed lookup or download
    # does not leave a Chrome window behind.
    try:
        driver.get("https://postmates.com/merchant/" + postmates_code)

        menu_json = extract_json(foodie_id)
        menu_items = save_locally(menu_json, foodie_id)

        elements = driver.find_elements_by_xpath(
            "//div[@class='product-container css-2ko7m4 e1tw3vxs3']")
        for element in elements:
            item_name = element.find_element_by_xpath(
                ".//h3[@class='product-name css-1yjxguc e1tw3vxs4']"
            ).get_attribute("innerText")

            matched_items = items_in_sentence(item_name, menu_items, 2,
                                              foodie_id, exceptions)
            if len(matched_items) == 0:
                continue

            imgs = element.find_elements_by_xpath(
                ".//img[@class='css-1hyfx7x e1qfcze94']")
            for img in imgs:
                img_src = img.get_attribute("src")
                print(img_src)
                if not img_src:
                    continue

                # Cut the URL just before the 'format=webp' parameter. This is
                # done once per image and only when the marker is present;
                # slicing with find()'s -1 would silently drop the last
                # character of the URL.
                webp_at = img_src.find('format=webp')
                if webp_at != -1:
                    img_src = img_src[:webp_at]
                    print(img_src)

                optimized_items = optimize_list(matched_items,
                                                item_name.lower())
                print(optimized_items)
                for item in optimized_items:
                    if n == 0:
                        # Create the target directory lazily so that nothing
                        # is left on disk when no product matches.
                        os.makedirs(path, exist_ok=True)

                    filename = foodie_id + "-" + str(n) + ".jpg"
                    save_img_url(img_src, path + filename)

                    foodie_ids.append(foodie_id)
                    items.append(item)
                    filenames.append(filename)
                    matches.append(item_name)
                    n += 1
                    print(n)
    finally:
        driver.close()

    if n > 0:
        df = pd.DataFrame({
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches,
        })
        # The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    index=False)
Exemplo n.º 4
0
def analyze_yelp_images(images_data, foodie_id, yelp_id):
    """Save Yelp photos whose captions mention a stored menu item.

    Every caption is matched against the menu stored for ``foodie_id``. For
    each matched item the photo is downloaded to
    ``<script_dir>/csvfiles/images/<foodie_id[:100]><yelp_id>-images-yelp/``
    (slashes in ``yelp_id`` are replaced by ``-``). A spreadsheet describing
    the saved files is always written to that directory.

    Args:
        images_data: Iterable of indexable entries where element ``0`` is the
            photo caption and element ``1`` is the photo URL.
        foodie_id: Restaurant identifier. It is split on ``-`` and the parts
            are appended to the ``exceptions`` word list that is passed to
            ``items_in_sentence``.
        yelp_id: Yelp identifier recorded in the ``YelpID`` column and used
            in the directory and spreadsheet names.

    Returns:
        The number of image files saved.
    """
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    script_dir = os.path.abspath(os.path.join(__file__, "../.."))

    menu_json = extract_json(foodie_id)
    menu_items = save_locally(menu_json, foodie_id)

    # Shared stem of the directory name and the spreadsheet name.
    stem = foodie_id[:100] + yelp_id.replace('/', '-')
    path = script_dir + "/csvfiles/images/" + stem + "-images-yelp/"
    # Only "already exists" is tolerated; other OS errors surface here.
    os.makedirs(path, exist_ok=True)

    foodie_ids = []
    items = []
    filenames = []
    captions = []
    source_ids = []

    # Yelp file numbering starts at 100 rather than 0.
    first_index = 100
    n = first_index

    for image_data in images_data:
        caption = image_data[0]
        link = image_data[1]

        # Skip captions that do not mention a menu item.
        items_in_given_sentence = items_in_sentence(caption, menu_items, 2,
                                                    foodie_id, exceptions)
        print(items_in_given_sentence)
        if len(items_in_given_sentence) == 0:
            continue

        # Choose the best item(s) out of all matched items; the photo is
        # stored once per chosen item.
        optimized_items = optimize_list(items_in_given_sentence,
                                        caption.lower())
        for item in optimized_items:
            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(link, path + filename)

            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            captions.append(caption)
            source_ids.append(yelp_id)
            n += 1
            print(n)

    df = pd.DataFrame({
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Captions': captions,
        'YelpID': source_ids,
    })
    # The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
    df.to_excel(path + stem + ".xlsx", sheet_name='Sheet1', index=False)

    return n - first_index
Exemplo n.º 5
0
def run_chowbusimagescraper(chowbus_id, foodie_id):
	"""Scrape dish images from a Chowbus restaurant page.

	Opens ``https://www.chowbus.com<chowbus_id>`` in headless Chrome, matches
	the text of every ``jss290`` element against the menu stored for
	``foodie_id`` and saves the ``jss326`` images of matched elements to
	``<script_dir>/csvfiles/images/<foodie_id>-images-chowbus/``. When at
	least one image was saved, a spreadsheet ``<foodie_id>.xlsx`` listing the
	files is written to the same directory.

	Args:
		chowbus_id: Path appended directly to ``https://www.chowbus.com``.
		foodie_id: Restaurant identifier. It is split on ``-`` and the parts
			are appended to the ``exceptions`` word list that is passed to
			``items_in_sentence``.

	Returns:
		``'Could not pull images from database. Potential FoodieID
		mismatch.'`` when the stored menu has no items, otherwise
		``'Added Chowbus Imgs'`` (also when nothing was saved).
	"""
	chrome_options = webdriver.ChromeOptions()
	chrome_options.add_argument("--incognito")
	chrome_options.add_argument('--headless')
	chrome_options.add_argument('--no-sandbox')
	chrome_options.add_argument('--disable-gpu')
	driver = webdriver.Chrome(options=chrome_options)

	script_dir = os.path.abspath(os.path.join(__file__, "../.."))
	path = script_dir + "/csvfiles/images/" + foodie_id + "-images-chowbus/"
	exceptions = ['a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'] + foodie_id.split('-')

	foodie_ids = []
	items = []
	filenames = []
	matches = []
	n = 0

	# The browser is released in ``finally`` so neither the early return
	# below nor a failed download leaves a Chrome window behind.
	try:
		driver.get("https://www.chowbus.com" + chowbus_id)
		print(path)

		menu_json = extract_json(foodie_id)
		if menu_json['Items'] == []:
			return 'Could not pull images from database. Potential FoodieID mismatch.'
		menu_items = save_locally(menu_json, foodie_id)

		elements = driver.find_elements_by_class_name('jss290')
		print(elements)
		print("Elements length", len(elements))
		for element in elements:
			# NOTE(review): the previous code called get_attribute on the
			# list returned by find_elements and passed an undefined
			# getText(). The name is now read from the first nested
			# 'jss290' node, falling back to the element itself - confirm
			# which node really carries the dish name.
			name_nodes = element.find_elements_by_class_name('jss290')
			name_node = name_nodes[0] if name_nodes else element
			item_name = name_node.get_attribute("innerText")
			if not item_name:
				continue

			matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id, exceptions)
			if len(matched_items) == 0:
				continue

			imgs = element.find_elements_by_class_name("jss326")
			for img in imgs:
				img_src = img.get_attribute("src")
				print(img_src)
				if not img_src:
					continue

				optimized_items = optimize_list(matched_items, item_name.lower())
				print("the length of list is: ", len(optimized_items))
				# Saving and bookkeeping belong inside this loop: one file
				# and one spreadsheet row per matched menu item.
				for item in optimized_items:
					if n == 0:
						# Create the directory lazily so nothing is left on
						# disk when no dish matches.
						os.makedirs(path, exist_ok=True)

					filename = foodie_id + "-" + str(n) + ".jpg"
					urlretrieve(img_src, path + filename)
					print(filename)

					foodie_ids.append(foodie_id)
					items.append(item)
					filenames.append(filename)
					matches.append(item_name)
					n += 1
					print(n)
	finally:
		driver.close()

	if n > 0:
		d = {'FoodieID': foodie_ids, 'Item': items, 'Filename': filenames, 'Matches': matches}
		df = pd.DataFrame(d)
		# The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
		df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1', index=False)
	return 'Added Chowbus Imgs'
Exemplo n.º 6
0
def run_caviar_image_scraper(caviar_id, foodie_id):
    """Scrape dish images from a Caviar restaurant page.

    Opens ``https://www.trycaviar.com/<caviar_id>`` in headless Chrome,
    collects the links of all offer tiles (regular and ``--unavailable``
    ones), visits each dish page and, when the dish name matches the menu
    stored for ``foodie_id``, saves its image to
    ``<script_dir>/csvfiles/images/<foodie_id>-images-caviar/``. When at
    least one image was saved, a spreadsheet ``<foodie_id>.xlsx`` listing the
    files is written to the same directory.

    Args:
        caviar_id: Path appended to ``https://www.trycaviar.com/``.
        foodie_id: Restaurant identifier. It is split on ``-`` and the parts
            are appended to the ``exceptions`` word list that is passed to
            ``items_in_sentence``.

    Returns:
        ``'Could not pull images from database. Potential FoodieID
        mismatch.'`` when the stored menu has no items,
        ``'Added Caviar Imgs'`` when at least one image was saved, otherwise
        ``'No Caviar Imgs Scraped'``.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-caviar/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    foodie_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    # The browser is released in ``finally`` so neither the early return
    # below nor a failed download leaves a Chrome window behind.
    try:
        driver.get("https://www.trycaviar.com/" + caviar_id)

        menu_json = extract_json(foodie_id)
        if menu_json['Items'] == []:
            return 'Could not pull images from database. Potential FoodieID mismatch.'
        menu_items = save_locally(menu_json, foodie_id)

        dishes = driver.find_elements_by_xpath(
            "//a[@class='js-offer-link offer-tile_link']")
        dishes = dishes + driver.find_elements_by_xpath(
            "//a[@class='js-offer-link offer-tile_link offer-tile_link--unavailable']"
        )
        # Collect the hrefs first: navigating away invalidates the tile
        # elements found on the restaurant page.
        dish_links = [dish.get_attribute("href") for dish in dishes]

        for dish_link in dish_links:
            driver.get(dish_link)
            item_name = driver.find_element_by_xpath(
                "//h1[@class='item_name']").text
            item_imgs = driver.find_elements_by_xpath(
                "//img[@class='item_image']")
            if not item_imgs:
                continue

            print(item_name)
            print(item_imgs)

            srcset_parts = (item_imgs[0].get_attribute("srcset") or "").split()
            if not srcset_parts:
                # Image without a usable srcset attribute.
                continue
            # The second-to-last token is the URL of the last srcset
            # candidate; a lone token is taken as the URL itself.
            if len(srcset_parts) > 1:
                img_src = srcset_parts[-2]
            else:
                img_src = srcset_parts[-1]

            matched_items = items_in_sentence(item_name, menu_items, 2,
                                              foodie_id, exceptions)
            if len(matched_items) == 0:
                continue

            optimized_items = optimize_list(matched_items, item_name.lower())
            for item in optimized_items:
                if n == 0:
                    # Create the directory lazily so nothing is left on disk
                    # when no dish matches.
                    os.makedirs(path, exist_ok=True)

                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(img_src, path + filename)

                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)
    finally:
        driver.close()

    if n > 0:
        df = pd.DataFrame({
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches,
        })
        # The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    index=False)
        return 'Added Caviar Imgs'
    return 'No Caviar Imgs Scraped'
Exemplo n.º 7
0
def scrape_images(storeId, foodie_id):
	"""Download DoorDash item images through the consumer GraphQL API.

	Requests the list of menus for ``storeId``, then requests every menu and
	walks its categories. For each item that has an image and whose name
	matches the menu stored for ``foodie_id``, the image is downloaded to
	``<script_dir>/csvfiles/images/<foodie_id>-images-doordash/``. Only
	images taller than 300 px and wider than 450 px are recorded; when at
	least one was recorded, a spreadsheet ``<foodie_id>.xlsx`` listing them
	is written to the same directory.

	Args:
		storeId: DoorDash store id sent as the ``storeId`` query variable.
		foodie_id: Restaurant identifier. It is split on ``-`` and the parts
			are appended to the ``exceptions`` word list that is passed to
			``items_in_sentence``.

	Returns:
		``'Could not pull images from database. Potential FoodieID
		mismatch.'`` when the stored menu has no items,
		``'Added Doordash Imgs'`` when at least one image was recorded,
		otherwise ``'Zero Doordash Imgs Scraped'``.
	"""
	url = "https://api-consumer-client.doordash.com/graphql"

	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
		'Content-Type': 'application/json',
		'Credentials': 'include'
	}
	# NOTE(review): hard-coded CSRF token committed in source; consider
	# loading it from configuration instead.
	cookies = {'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'}
	body = {
		"operationName": "menu",
		"variables": {
			"storeId": storeId
		},
		"query":'''
	          query menu($storeId: ID!, $menuId: ID) {
	            storeInformation(storeId: $storeId) {
	              id
	              name
	              description
	              isGoodForGroupOrders
	              offersPickup
	              offersDelivery
	              deliveryFee
	              sosDeliveryFee
	              numRatings
	              averageRating
	              shouldShowStoreLogo
	              isConsumerSubscriptionEligible
	              headerImgUrl
	              coverImgUrl
	              distanceFromConsumer
	              providesExternalCourierTracking
	              fulfillsOwnDeliveries
	              isDeliverableToConsumerAddress
	              priceRange
	              business {
	                id
	                name
	                __typename
	              }
	              address {
	                street
	                printableAddress
	                lat
	                lng
	                city
	                state
	                __typename
	              }
	              status {
	                asapAvailable
	                scheduledAvailable
	                asapMinutesRange
	                asapPickupMinutesRange
	                __typename
	              }
	              merchantPromotions {
	                id
	                minimumOrderCartSubtotal
	                newStoreCustomersOnly
	                deliveryFee
	                __typename
	              }
	              storeDisclaimers {
	                id
	                disclaimerDetailsLink
	                disclaimerLinkSubstring
	                disclaimerText
	                displayTreatment
	                __typename
	              }
	              __typename
	            }
	            storeMenus(storeId: $storeId, menuId: $menuId) {
	              allMenus {
	                id
	                name
	                subtitle
	                isBusinessEnabled
	                timesOpen
	                __typename
	              }
	              currentMenu {
	                id
	                timesOpen
	                hoursToOrderInAdvance
	                isCatering
	                minOrderSize
	                menuCategories {
	                  ...StoreMenuCategoryFragment
	                  items {
	                    ...StoreMenuListItemFragment
	                    __typename
	                  }
	                  __typename
	                }
	                __typename
	              }
	              __typename
	            }
	            storeCrossLinks(storeId: $storeId) {
	              trendingStores {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              trendingCategories {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              topCuisinesNearMe {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              nearbyCities {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              __typename
	            }
	          }

	        fragment StoreMenuCategoryFragment on StoreMenuCategory {
	          id
	          subtitle
	          title
	          __typename
	        }

	        fragment StoreMenuListItemFragment on StoreMenuListItem {
	          id
	          description
	          isTempDeactivated
	          price
	          imageUrl
	          name
	          __typename
	        }

	        fragment StoreCrossLinkItemFragment on StoreCrossLinkItem {
	          name
	          url
	          __typename
	        }

	    '''
	}
	# First request (no menuId) only serves to list the store's menus.
	response = requests.post(url, cookies=cookies, data=json.dumps(body), headers=headers)
	print(response)
	allMenus = response.json()['data']['storeMenus']['allMenus']

	script_dir = os.path.abspath(os.path.join(__file__, "../.."))
	path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
	exceptions = ['a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'] + foodie_id.split('-')

	json_t = extract_json(foodie_id)
	if json_t['Items'] == []:
		return 'Could not pull images from database. Potential FoodieID mismatch.'
	menu_items = save_locally(json_t, foodie_id)

	foodie_ids = []
	items = []
	filenames = []
	matches = []
	n = 0

	for menu in allMenus:
		# Re-issue the query for this specific menu.
		body['variables']['menuId'] = menu['id']
		response = requests.post(url, cookies=cookies, data=json.dumps(body), headers=headers)
		categories = response.json()['data']['storeMenus']['currentMenu']['menuCategories']
		for category in categories:
			for item in category['items']:
				img_url = item['imageUrl']
				if not img_url:
					continue
				item_name = item['name']

				matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id, exceptions)
				if len(matched_items) == 0:
					continue

				optimized_items = optimize_list(matched_items, item_name.lower())
				for matched_item in optimized_items:
					if n == 0:
						os.makedirs(path, exist_ok=True)

					# n only advances for accepted images, so a rejected
					# download is overwritten by the next one.
					filename = foodie_id + "-" + str(n) + ".jpg"
					urlretrieve(img_url, path + filename)

					img = cv2.imread(path + filename, cv2.IMREAD_UNCHANGED)
					if img is None:
						# imread signals an unreadable file by returning None.
						continue

					# shape is (height, width[, channels]).
					height = img.shape[0]
					width = img.shape[1]
					if height > 300 and width > 450:
						print(height, width)
						foodie_ids.append(foodie_id)
						items.append(matched_item)
						filenames.append(filename)
						matches.append(item_name)
						n += 1
						print(n)

	if n > 0:
		d = {'FoodieID': foodie_ids, 'Item': items, 'Filename': filenames, 'Matches': matches}
		df = pd.DataFrame(d)
		# The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
		df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1', index=False)
		return 'Added Doordash Imgs'
	return 'Zero Doordash Imgs Scraped'
Exemplo n.º 8
0
def pull_images(response_json, foodie_id):
    """Download GrubHub dish images that match the stored menu.

    Walks ``response_json['restaurant']['menu_category_list']`` and, for each
    dish that carries a ``media_image`` entry and whose name matches the menu
    stored for ``foodie_id``, downloads the image to
    ``<script_dir>/csvfiles/images/<foodie_id>-images-grubhub/``. When at
    least one image was saved, a spreadsheet ``<foodie_id>.xlsx`` listing the
    files is written to the same directory.

    Args:
        response_json: Decoded GrubHub response. Each category must provide
            ``menu_item_list``; each dish must provide ``name`` and may
            provide ``media_image`` with ``base_url``, ``public_id`` and
            ``format``.
        foodie_id: Restaurant identifier. It is split on ``-`` and the parts
            are appended to the ``exceptions`` word list that is passed to
            ``items_in_sentence``.

    Returns:
        ``'Could not pull images from database. Potential FoodieID
        mismatch.'`` when the stored menu has no items, otherwise
        ``'Added GrubHub Imgs'`` (also when nothing was saved).
    """
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-grubhub/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    print(response_json)
    for category in response_json['restaurant']['menu_category_list']:
        for dish in category['menu_item_list']:
            item_name = dish['name']
            if 'media_image' not in dish:
                continue

            media = dish['media_image']
            img_url = media['base_url'] + media['public_id'] + '.' + media['format']
            print(img_url)

            matched_items = items_in_sentence(item_name, menu_items, 2,
                                              foodie_id, exceptions)
            if len(matched_items) == 0:
                continue

            optimized_items = optimize_list(matched_items, item_name.lower())
            for item in optimized_items:
                if n == 0:
                    # Create the directory lazily so nothing is left on disk
                    # when no dish matches.
                    os.makedirs(path, exist_ok=True)

                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(img_url, path + filename)

                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)

    if n > 0:
        df = pd.DataFrame({
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches,
        })
        # The ``encoding`` keyword was dropped: pandas >= 2.0 rejects it.
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    index=False)

    return 'Added GrubHub Imgs'