def _crawl_michiganwholesaleequipment_category(category_link):
    """Collect item links from a michiganwholesaleequipment.com category,
    following the "Next page" pager until it disappears.

    Args:
        category_link: absolute URL of the category listing page.

    Returns:
        A set of item link hrefs (relative URLs as found in the markup).
    """
    response_text = request_("GET", category_link).text
    soup = BeautifulSoup(response_text, "html.parser")
    soup_main = soup.find('div', class_="item-list")
    item_links = [
        el.find('a').get('href')
        for el in soup_main.find_all("div", class_="views-field-title")
    ]
    while True:
        next_page_link_el = soup.find("ul", class_="js-pager__items")
        # Fix: the pager block (or its anchor / hidden label) is absent on the
        # last page; the original called .find()/.text on None and raised
        # AttributeError instead of terminating the loop.
        if next_page_link_el is None:
            break
        anchor = next_page_link_el.find("a")
        label_el = next_page_link_el.find("span", class_="visually-hidden")
        if anchor is None or label_el is None:
            break
        sub_url = anchor.get("href")
        sub_text = label_el.text
        if sub_url is not None and sub_text == "Next page":
            url = f"https://www.michiganwholesaleequipment.com{sub_url}"
            response_text = request_("GET", url).text
            soup = BeautifulSoup(response_text, "html.parser")
            soup_main = soup.find('div', class_="item-list")
            item_links.extend([
                el.find('a').get('href')
                for el in soup_main.find_all("div", class_="views-field-title")
            ])
        else:
            break
    return set(item_links)
def _process_added_items(items):
    """Scrape the spec table of each added item page and POST one record
    per item to the API endpoint.

    Fixes: ``na`` is now initialised (the original raised NameError when no
    ``crumb_2`` breadcrumb matched), and positional spec-cell access defaults
    to "" instead of raising IndexError on sparse tables, matching the
    defensive style of the sibling scrapers.
    """
    for item in items:
        source = request_("GET", item).text
        soup = BeautifulSoup(source, "html.parser")
        na = ""  # breadcrumb category; "" when no crumb_2 element exists
        for category in soup.find_all('li', class_="crumb_2"):
            na = re.sub(r"[\n\t]*", "", category.text)
        codes_added = []
        for description in soup.find_all('table', id='product-attribute-specs-table'):
            for data in description.find_all('td', class_='data'):
                codes_added.append(re.sub(r"[\n\t\s]*", "", data.text))

        def cell(index):
            # Spec values are positional; missing cells become "".
            try:
                return codes_added[index]
            except IndexError:
                return ""

        # NOTE(review): '[,s]' strips commas and the letter "s" (presumably
        # the trailing "s" of "lbs") — confirm this is intended, not r'[,\s]'.
        capacity = re.sub('[,s]', '', cell(4)).upper()
        marque = cell(2).upper()
        model = cell(3)
        mat = cell(6)
        annee = cell(1)
        post_name = na + " " + capacity + " " + marque + " " + annee
        data = {
            'post_name': post_name,
            'capacity': capacity,
            'marque': marque,
            'model': model,
            'mat': mat,
            'annee': annee,
            'url': item
        }
        API_ENDPOINT = ""
        request_("POST", API_ENDPOINT, data=data)
def _process_added_items(items):
    """Scrape the labelled spec table of every added item page and POST one
    record per item to the API endpoint. Missing fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")

        def cell(label, strip=True):
            # The value sits in the <td> that follows the labelled <td>.
            try:
                value = soup.find("td", text=label).find_next("td").text
            except Exception:
                return ""
            return value.strip() if strip else value

        try:
            category = soup.find("td", text="Category").text.strip()
        except Exception:
            category = ""
        try:
            digits = re.findall(
                r"\d+",
                soup.find("td", text="Capacity").find_next("td").text)
            capacity = "".join(digits) + "LB"
        except Exception:
            capacity = ""
        marque = cell("Make")
        model = cell("Model")
        year = cell("Year")
        mat = cell("Mast")
        engine = cell("Engine")
        forks = cell("Forks")
        attachment = cell("Attachment", strip=False)
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{category} {capacity} {marque} {model} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "year": year,
                "mat": mat,
                "engine": engine,
                "forks": forks,
                "attachment": attachment,
                "url": item,
            })
def _process_added_items(items):
    """Parse the short-description bullet list of each added product page
    and POST the extracted fields. Pages without bullets are skipped;
    missing fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        details = soup.select(
            ".woocommerce-product-details__short-description ul li")
        if not details:
            continue

        def part(index, strip=False):
            # Bullets look like "Label: value"; keep the value side.
            try:
                value = details[index].text.split(":")[1]
            except Exception:
                return ""
            return value.strip() if strip else value

        try:
            name = soup.find("h1", class_="product_title").text
        except Exception:
            name = ""
        capacity = part(4)
        marque = part(1)
        model = part(0)
        # Both mast bullets must be present, otherwise mat stays "".
        try:
            mat = (details[7].text.split(":")[1] + "-" +
                   details[8].text.split(":")[1])
        except Exception:
            mat = ""
        year = part(2)
        fuel = part(6, strip=True)
        types = part(5, strip=True)
        truck_types = part(1, strip=True)
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name} {capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "annee": year,
                "fuel": fuel,
                "type": types,
                "truck_types": truck_types,
                "url": item,
            })
def _crawl_ceqinc():
    """Walk every paginated used-equipment inventory page on ceqinc.ca and
    return the list of absolute item URLs."""
    base = "https://www.ceqinc.ca/inventaire?p={page}&s=1&condition=usage"
    page = 1
    soup = BeautifulSoup(request_("GET", base.format(page=page)).text,
                         "html.parser")

    def links_on(page_soup):
        # Each listing card exposes its detail link in the first <a>.
        return [card.a.get("href")
                for card in page_soup.find_all("div", class_="car-content")]

    items = links_on(soup)
    # The "»" anchor exists only while there is a following page.
    while soup.find("ul", class_="pagination").find("a", text="»") is not None:
        page += 1
        soup = BeautifulSoup(request_("GET", base.format(page=page)).text,
                             "html.parser")
        items.extend(links_on(soup))
    return ["https://www.ceqinc.ca" + path for path in items]
def _process_added_items(items):
    """Scrape each added item page's labelled spec table and POST the record
    to the API endpoint. Missing fields default to "".

    Fix: the original looked up "Mast Type:" three times and joined the three
    identical results as "x/x/x" (apparent copy-paste from another scraper);
    a single lookup yields the intended value.
    """
    for item in items:
        source = request_("GET", item).text
        soup = BeautifulSoup(source, "html.parser")
        try:
            name = soup.find("div", class_="section post-header").text.strip()
        except Exception:
            name = ""
        try:
            capacity = soup.find(
                "td", text="Base Capacity (lbs.)").find_next("td").text + "LB"
        except Exception:
            capacity = ""

        def cell(label):
            # The value lives in the <td> following the labelled <td>.
            try:
                return soup.find("td", text=label).find_next("td").text
            except Exception:
                return ""

        marque = cell("Make:")
        model = cell("Model:")
        year = cell("Year:")
        mat = cell("Mast Type:")
        type_s = cell("Machine Type:")
        tire = cell("Tires:")
        hours = cell("Hours:")
        data = {
            "post_name": f"{name}",
            "capacity": capacity,
            "marque": marque,
            "model": model,
            "year": year,
            "mat": mat,
            "type": type_s,
            "tire": tire,
            "hours": hours,
            "url": item,
        }
        request_("POST", API_ENDPOINT, data=data)
def _process_added_items(items):
    """Scrape the detail block for each (url, item) pair and POST the parsed
    record. Missing fields default to ""."""
    for url, item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        # The item detail lives in a <div> whose id equals the item key.
        soup_main = soup.find('div', {"id": item})
        try:
            name = soup_main.find("div", class_="lbHeader").text.strip()
        except Exception:
            name = ""

        def cell(label):
            try:
                return soup.find("td", text=label).find_next("td").text
            except Exception:
                return ""

        try:
            capacity = soup.find("td", text="Capacity:").find_next("td").text + "LB"
        except Exception:
            capacity = ""
        marque = cell("Mfr:")
        model = cell("Model #:")
        year = cell("Year:")
        mat = cell("Mast:")
        type_s = cell("Fuel Type:")
        types = cell("Type:")
        description = cell("Description:")
        # NOTE(review): the endpoint is an empty string here — confirm intended.
        API_ENDPOINT = ""
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "year": year,
                "mat": mat,
                "type": type_s,
                "types": types,
                "description": description,
                "url": item,
            })
def _process_added_items(items):
    """Parse each added item's "product_details" list (French labels) and
    POST the record; missing fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        try:
            name = soup.find("h1", class_="product_title").text
        except Exception:
            name = ""
        product_details = soup.find_all('ul', class_="product_details")

        def li_part(index):
            # Entries look like "Label : value"; keep the trimmed value side.
            try:
                return (product_details[0].find_all('li')[index]
                        .text.split(':')[1].strip())
            except Exception:
                return ""

        capacity = li_part(5)
        marque = li_part(0)
        model = li_part(1)
        # Both mast entries must parse, otherwise mat stays "".
        try:
            entries = product_details[0].find_all('li')
            mat = (entries[6].text.split(':')[1].strip() + " " +
                   entries[7].text.split(':')[1].strip())
        except Exception:
            mat = ""
        try:
            type_moteur = soup.find(
                "td", text="Type moteur :").find_next("td").text.strip()
        except Exception:
            type_moteur = ""
        style_pneus = li_part(4)
        fourches = li_part(9)
        nourriture = li_part(3)
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name} {capacity} {marque}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "type_moteur": type_moteur,
                "style_pneus": style_pneus,
                "nourriture": nourriture,
                "fourches": fourches,
                "url": item,
            })
def _process_added_items(storage_client, items):
    """For each added manuvic link: scrape the spec values, archive the item
    photos to the bucket, and POST the parsed record to forklift.news.

    Fix: ``na`` is now initialised, so pages without an <h1 class="page-title">
    no longer raise NameError when building ``post_name``.

    Args:
        storage_client: Cloud Storage client used for photo archiving.
        items: iterable of added item URLs.
    """
    print(f"[manuvic] Got {len(items)} added links")
    for item in items:
        print(f"[manuvic] Processing added link {item}")
        link_data = []
        source = request_("GET", item).text
        soup = BeautifulSoup(source, "html.parser")
        na = ""  # page title; stays "" when the heading is absent
        for page_title in soup.find_all("h1", class_="page-title"):
            na = re.sub(r"[\n\t]*", "", page_title.text)
        for description in soup.find_all("div", id="product.info.descriptionmod"):
            for data in description.find_all("span", class_="infoValue"):
                data_new = re.sub(r"[\n\t\s]*", "", data.text)
                link_data.append(data_new)
        photo_links = _get_images(soup)
        print(f"[manuvic] Saving {len(photo_links)} photos to the bucket for {item}")
        item_path = _format_item_link(item)
        blob_path = f"manuvic/photos/{item_path}"
        save_photos_to_bucket(storage_client, blob_path, photo_links, BUCKET_NAME)
        # Spec values are positional; absent positions default to "".
        try:
            capacity = link_data[2]
        except IndexError:
            capacity = ""
        try:
            marque = link_data[3]
        except IndexError:
            marque = ""
        try:
            model = link_data[4]
        except IndexError:
            model = ""
        try:
            mat = link_data[8]
        except IndexError:
            mat = ""
        try:
            year = link_data[13]
        except IndexError:
            year = ""
        print(f"[manuvic] Posting data about the {item} to forklift.news website")
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{na} {capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "annee": year,
                "url": item,
            })
def _process_added_items(items):
    """Scrape each added item page (French-labelled spec table) and POST the
    record; missing fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")

        def cell(label):
            # Value is in the <td> after the labelled <td>, whitespace-trimmed.
            try:
                return soup.find("td", text=label).find_next("td").text.strip()
            except Exception:
                return ""

        try:
            capacity = soup.find(
                "td", text="Capacité :").find_next("td").text.strip() + "LB"
        except Exception:
            capacity = ""
        try:
            marque = soup.find("span", class_="marque").text.strip()
        except Exception:
            marque = ""
        try:
            model = soup.find("span", class_="modele").text.strip()
        except Exception:
            model = ""
        mat = cell("Mât :")
        year = cell("Année :")
        heures = cell("Heures :")
        type_moteur = cell("Type moteur :")
        style_pneus = cell("Styles pneus :")
        fourches = cell("Fourches :")
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "annee": year,
                "heures": heures,
                "type_moteur": type_moteur,
                "style_pneus": style_pneus,
                "fourches": fourches,
                "url": item,
            })
def _process_added_items(items):
    """Scrape each added item's spec table and POST the record; missing
    fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        try:
            name = soup.find("div", class_="su-service-title").text.strip()
        except Exception:
            name = ""
        try:
            capacity = soup.find("td", text="Capacity:").find_next("td").text + "LB"
        except Exception:
            capacity = ""

        def cell(label):
            # Value sits in the <td> following the labelled <td>.
            try:
                return soup.find("td", text=label).find_next("td").text
            except Exception:
                return ""

        marque = cell("Manufacturer:")
        model = cell("Model #:")
        year = cell("Year:")
        mat = cell("Mast:")
        type_s = cell("Type:")
        tire = cell("Tire:")
        condition = cell("Condition:")
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "year": year,
                "mat": mat,
                "type": type_s,
                "tire": tire,
                "condition": condition,
                "url": item,
            })
def _process_added_items(items):
    """Scrape each michiganwholesaleequipment item page (links are relative)
    and POST the record; missing fields default to ""."""
    for item in items:
        url_link = f"https://www.michiganwholesaleequipment.com{item}"
        soup = BeautifulSoup(request_("GET", url_link).text, "html.parser")
        try:
            name = soup.find("section", {
                "id": "block-zurb-foundation-page-title"
            }).text.strip()
        except Exception:
            name = ""

        def cell(label):
            # Value sits in the <div> following the labelled <div>.
            try:
                return soup.find("div", text=label).find_next("div").text
            except Exception:
                return ""

        capacity = cell("Capacity")
        model = cell("Equipment Model")
        hours = cell("Hours")
        mat = cell("Mast")
        type_s = cell("Fuel Type")
        tire = cell("Equipment Type")
        year = cell("Year")
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name}",
                "capacity": capacity,
                "hours": hours,
                "model": model,
                "mat": mat,
                "type": type_s,
                "tire": tire,
                "year": year,
                "url": url_link,
            })
def _process_added_items(items):
    """Scrape each added almachinery item (French span-labelled specs) and
    POST the record to forklift.news; missing fields default to ""."""
    print(f"[almachinery] Got {len(items)} added links")
    for item in items:
        print(f"[almachinery] Processing added link {item}")
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")

        def spec(label):
            # Value is in the <span> that follows the labelled <span>.
            try:
                return soup.find("span", text=label).find_next("span").text
            except Exception:
                return ""

        try:
            name = soup.find("p", class_="category-title").text
        except Exception:
            name = ""
        try:
            capacity = soup.find(
                "span", text="Capacité").find_next("span").text + "LB"
        except Exception:
            capacity = ""
        try:
            # The brand is rendered as a logo; read its alt text.
            marque = soup.find(
                "span", text="Marque").find_next("span").find("img").get("alt")
        except Exception:
            marque = ""
        model = spec("No de série")
        # Both mast fields must resolve, otherwise mat stays "".
        try:
            mast_type = soup.find("span", text="Type de mât").find_next("span").text
            mast_height = soup.find("span", text="Hauteur du mât").find_next("span").text
            mat = f"{mast_type},{mast_height}"
        except Exception:
            mat = ""
        year = spec("Année")
        print(
            f"[almachinery] Posting data about the {item} to forklift.news website"
        )
        request_("POST",
                 "URL",
                 data={
                     "post_name": f"{name} {capacity} {marque} {year}",
                     "capacity": capacity,
                     "marque": marque,
                     "model": model,
                     "mat": mat,
                     "annee": year,
                     "url": item,
                 })
def _crawl_komatsuforklift_category(category_link):
    """Collect (page-url, data-reveal-id) pairs for every listing in a
    category, following the "next »" pager. Returns a set of pairs."""
    soup = BeautifulSoup(request_("GET", category_link).text, "html.parser")
    item_links = [(category_link, anchor.get('data-reveal-id'))
                  for anchor in soup.find_all("a", class_="img")]
    while True:
        next_el = soup.find("a", text="next »")
        if next_el is None:
            break
        page_url = next_el["href"]
        soup = BeautifulSoup(request_("GET", page_url).text, "html.parser")
        item_links.extend([(page_url, anchor.get("data-reveal-id"))
                           for anchor in soup.find_all("a", class_="img")])
    return set(item_links)
def _crawl_nfelifts_category(category_link):
    """Return the set of "more" links found on an nfelifts category page."""
    page = request_("GET", category_link).text
    soup = BeautifulSoup(page, "html.parser")
    return {
        anchor.get('href')
        for anchor in soup.find_all("a", class_="more-link")
    }
def crawl_manuvic(request):
    """HTTP entry point: crawl the manuvic forklift listing, diff it against
    the stored snapshot, process newly added items and e-mail the results.

    Args:
        request: incoming HTTP request object; only POST is accepted.

    Returns:
        A status / result string describing the outcome.
    """
    if request.method == "POST":
        print("[manuvic] Started crawling website")
        response_text = request_(
            "GET",
            "https://www.manuvic.com/produits/chariots-elevateurs.html?cat=116&product_list_limit=100"
        ).text
        soup = BeautifulSoup(response_text, "html.parser")
        # Each product card's photo anchor carries the item detail URL.
        item_links = [
            el.get("href")
            for el in soup.find_all("a", class_="product photo product-item-photo")
        ]
        db = firestore.Client()
        storage_client = storage.Client()
        if not item_links:
            # An empty listing most likely means the site markup changed; warn.
            send_warning_email(SENDGRID_API_KEY, SENDER_EMAIL, RECEIVER_EMAILS,
                               "manuvic")
            return "No links were found on manuvic website"
        # Persist the current link set and diff it against the stored one.
        comparison_result = add_and_compare_new_items(db, "manuvic", item_links)
        # NOTE(review): deleted_items is unpacked but never used below —
        # confirm whether deletions should also be reported.
        added_items, deleted_items = comparison_result["added"], comparison_result["deleted"]
        email_text = ""
        if added_items:
            _process_added_items(storage_client, added_items)
            email_text += format_links_modified("Added", added_items)
        if email_text != "":
            send_email(SENDGRID_API_KEY, SENDER_EMAIL, RECEIVER_EMAILS,
                       "Comparison results for manuvic", email_text)
            return email_text
        else:
            return "No new added or new deleted items found"
    else:
        return "This method is not supported"
def _crawl_achatusag_category(category_link):
    """Return the set of item links listed on an achatusag category page."""
    page = request_("GET", category_link).text
    soup = BeautifulSoup(page, "html.parser")
    return {
        block.find('a').get('href')
        for block in soup.find_all("div", class_="flex_display")
    }
def _crawl_multichariots_category(category_link):
    """Return the set of product links on a multichariots category page."""
    page = request_("GET", category_link).text
    soup = BeautifulSoup(page, "html.parser")
    return {
        title.find('a').get('href')
        for title in soup.find_all("h3", class_="product-title")
    }
def _crawl_machinerieplante_category_2(category_link):
    """Collect "Menugj" links from a category page and from each of its
    linked sub-pages (one level deep). Returns a set of links.

    Fix: the original extended ``item_links`` while iterating over it, so the
    loop also visited the links appended from sub-pages (fetching far more
    pages than intended and risking a runaway crawl). Iterate over a snapshot
    of the first-level links only.
    """
    response_text = request_("GET", category_link).text
    soup = BeautifulSoup(response_text, "html.parser")
    item_links = [
        el.get('href') for el in soup.find_all("a", class_="Menugj")
    ]
    for ct in list(item_links):
        url_link = f"http://www.machinerieplante.com/fr/equipement/{ct}"
        response_text = request_("GET", url_link).text
        soup = BeautifulSoup(response_text, "html.parser")
        item_links.extend(
            el.get('href') for el in soup.find_all("a", class_="Menugj"))
    return set(item_links)
def _crawl_chariotelevateurhardy_category(category_link):
    """Return the set of entry links on a chariotelevateurhardy category page."""
    page = request_("GET", category_link).text
    soup = BeautifulSoup(page, "html.parser")
    return {
        heading.find('a').get('href')
        for heading in soup.find_all("h2", class_="entry-title")
    }
def _crawl_equipementse3_category(category_link):
    """Return the set of term links on an equipementse3 category page."""
    page = request_("GET", category_link).text
    soup = BeautifulSoup(page, "html.parser")
    return {
        heading.find('a').get('href')
        for heading in soup.find_all("h2", class_="term-title")
    }
def _process_added_items(items):
    """Scrape each added ceqinc item (French span/strong spec list) and POST
    the record to forklift.news; missing fields default to ""."""
    print(f"[ceqinc] Got {len(items)} added links")
    for item in items:
        print(f"[ceqinc] Processing added link {item}")
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")

        def spec(label):
            # Value is in the <strong> that follows the labelled <span>.
            try:
                return soup.find("span", text=label).find_next("strong").text
            except Exception:
                return ""

        name = spec("Type d'équipement")
        try:
            capacity = "".join(
                re.findall(
                    r"\d+",
                    soup.find("span",
                              text="Capacité").find_next("strong").text)) + "LB"
        except Exception:
            capacity = ""
        marque = spec("Marque")
        model = spec("Modèle")
        # Both mast heights must resolve, otherwise mat stays "".
        try:
            lowered = soup.find(
                "span", text="Hauteur du mât (abaissé)").find_next("strong").text
            raised = soup.find(
                "span", text="Hauteur du mât (élévation)").find_next("strong").text
            mat = f"abaissé: {lowered}, élévation: {raised}"
        except Exception:
            mat = ""
        year = spec("Année")
        print(f"[ceqinc] Posting data about the {item} to forklift.news website")
        request_(
            "POST",
            "",
            data={
                "post_name": f"{name} {capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "annee": year,
                "url": item,
            })
def _process_added_items(items):
    """Scrape each added item's div-labelled spec list and POST the record;
    missing fields default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        try:
            capacity = soup.find("div", text="Capacity:").find_next("div").text + "LB"
        except Exception:
            capacity = ""

        def cell(label):
            # Value sits in the <div> following the labelled <div>.
            try:
                return soup.find("div", text=label).find_next("div").text
            except Exception:
                return ""

        marque = cell("Make:")
        model = cell("Model:")
        year = cell("Year:")
        type_s = cell("Type:")
        tire = cell("Upright:")
        # NOTE(review): the doubled label "Hours:Hours:" looks like a typo but
        # may match the site's markup verbatim — confirm before changing.
        hours = cell("Hours:Hours:")
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{type_s} {capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "year": year,
                "type": type_s,
                "tire": tire,
                "hours": hours,
                "url": item,
            })
def _crawl_machinerieplante_category(category_link):
    """Return the set of "Lfooter" links on a machinerieplante category page
    (category_link is the path fragment under /fr/equipement/)."""
    url_link = f"http://www.machinerieplante.com/fr/equipement/{category_link}"
    soup = BeautifulSoup(request_("GET", url_link).text, "html.parser")
    return {
        anchor.get('href')
        for anchor in soup.find_all("a", class_="Lfooter")
    }
def _crawl_almachinery():
    """Page through the a1machinery inventory API and return the list of
    absolute item URLs."""
    api_link = ("https://www.a1machinery.com/fr/inventaire/api?capacity_from=0"
                "&capacity_to=55000&p={page}&referer=/fr/Produits?capacity_from=0"
                "&capacity_to=55000&p={page}")
    first = request_("GET", api_link.format(page=1)).json()
    # The first response reports how many pages exist in total.
    pages = first.get("pages")
    links = [entry["url"] for entry in first.get("items")]
    for page in range(2, pages + 1):
        payload = request_("GET", api_link.format(page=page)).json()
        links.extend(entry["url"] for entry in payload.get("items"))
    return [f"https://www.a1machinery.com{link}" for link in links]
def _crawl_ldlqc_category(category_link):
    """Collect item links from an ldlqc category page, following the "next"
    pager to the end. Returns a set of links."""
    soup = BeautifulSoup(request_("GET", category_link).text, "html.parser")

    def title_links(page_soup):
        # Each title block wraps its item link in an <a>.
        return [block.find('a').get("href")
                for block in page_soup.find_all("div", class_="title")]

    item_links = title_links(soup)
    while True:
        next_anchor = soup.find("a", class_="next")
        if next_anchor is None:
            break
        soup = BeautifulSoup(request_("GET", next_anchor["href"]).text,
                             "html.parser")
        item_links.extend(title_links(soup))
    return set(item_links)
def _crawl_liftnorthamerica_category(category_link):
    """Collect product links from a liftnorthamerica category, following the
    paginated "next" links. Returns a set of links."""
    soup = BeautifulSoup(request_("GET", category_link).text, "html.parser")

    def product_links(page_soup):
        return [title.get('href')
                for title in page_soup.find_all("h4", class_="product-title")]

    item_links = product_links(soup)
    while True:
        next_anchor = soup.find("a", class_="next page-numbers")
        if next_anchor is None:
            break
        soup = BeautifulSoup(request_("GET", next_anchor["href"]).text,
                             "html.parser")
        item_links.extend(product_links(soup))
    return set(item_links)
def _process_added_items(items):
    """Parse the ".specification" bullet list of each added item and POST
    the record; missing positions default to ""."""
    for item in items:
        soup = BeautifulSoup(request_("GET", item).text, "html.parser")
        try:
            name = soup.find("h1", class_="post-title").text
        except Exception:
            name = ""
        specs = soup.select(".specification li")

        def part(index):
            # Bullets look like "Label: value"; keep the value side.
            try:
                return specs[index].text.split(":")[1]
            except Exception:
                return ""

        capacity = part(5)
        marque = part(1)
        model = part(2)
        mat = part(9)
        year = part(3)
        request_(
            "POST",
            API_ENDPOINT,
            data={
                "post_name": f"{name} {capacity} {marque} {year}",
                "capacity": capacity,
                "marque": marque,
                "model": model,
                "mat": mat,
                "annee": year,
                "url": item,
            })
def _process_added_items(items):
    """POST the parsed record for every added southeastforklifts link;
    field extraction is delegated to _parse_item."""
    print(f"[southeastforklifts] Got {len(items)} added links")
    for item in items:
        print(
            f"[southeastforklifts] Posting data about the {item} to forklift.news website"
        )
        parsed = _parse_item(item)
        payload = {
            "post_name":
            f"{parsed['name']} {parsed['capacity']} {parsed['marque']} {parsed['year']}",
            "capacity": parsed['capacity'],
            "marque": parsed['marque'],
            "model": parsed['model'],
            "mat": parsed['mat'],
            "annee": parsed['year'],
            "url": item,
        }
        request_("POST", API_ENDPOINT, data=payload)
def _crawl_canadacrown_category(category_link):
    """Return the set of absolute item links on a canadacrown category page.

    Fix: the original appended into a module-level ``item_links_all`` list,
    so repeated calls accumulated links from every previously crawled
    category (and shared mutable state). Collect into a local list instead;
    the returned set now contains only this category's links.
    """
    response_text = request_("GET", category_link).text
    soup = BeautifulSoup(response_text, "html.parser")
    item_links = []
    for el in soup.find_all("div", class_="image"):
        try:
            url = el.find('a').get('href')
            item_links.append(f"{main_link}{url}")
        except Exception:
            # Some image blocks carry no anchor; skip them (best effort).
            pass
    return set(item_links)