def check_top_achat(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        # number of results shown on the listing page
        nb_resultats = tree.xpath('//*[@id="content"]/nav[1]/ul/li[4]/text()')[0]
        nb = util.make_num(nb_resultats)
        results = []
        liste_prix_ = tree.xpath(
            "//section[@class = 'produits list']//div[@itemprop= 'price']/text()")
        liste_titres = tree.xpath(
            "//section[@class = 'produits list']//div[@class = 'libelle']/a/h3/text()")
        liste_dispos = tree.xpath(
            "//section[@class = 'produits list']//section[last()]/@class")
        for i in range(int(nb)):
            prix_ = liste_prix_[i][:-4]
            prix = util.make_num(prix_)
            if int(prix) >= 850:
                continue
            titre = liste_titres[i]
            # promotional suffixes appended to some product titles
            geforce_ad = " + 1 an d'abonnement GeForce Now offert ! ".lower()
            call_of_ad = "+ Call of Duty: Black Ops Cold War offert ! ".lower()
            # skip water-cooled models, strip promotional suffixes
            if 'water' in titre.lower() or 'hydro' in titre.lower():
                continue
            elif geforce_ad in titre.lower():
                titre = titre[:len(titre) - len(geforce_ad)]
            elif call_of_ad in titre.lower():
                titre = titre[:len(titre) - len(call_of_ad)]
            # availability is encoded in the CSS class of the last <section>
            raw_dispo = liste_dispos[i]
            if raw_dispo == 'en-rupture':
                dispo = 'Rupture'
            elif raw_dispo == 'dispo-sous-7-jours':
                dispo = 'sous 7 jours'
            elif raw_dispo == 'dispo-entre-7-15-jours':
                dispo = 'entre 7-15 jours'
            elif raw_dispo == 'dispo-plus-15-jours':
                dispo = '+ de 15 jours'
            else:
                dispo = raw_dispo
            results.append(('topachat.com ' + util.clean_string(titre),
                            dispo,
                            util.clean_string(prix)))
        out_results += results
    return out_results
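# The scrapers in this file rely on a small `util` module (util.get_tree,
# util.make_num, util.clean_string) that is not shown here. Below is a minimal
# sketch of what a hypothetical util.py might contain, assuming requests is
# used for fetching and lxml for parsing; the original project's helpers may
# well differ in details such as digit extraction and whitespace handling.

# --- hypothetical util.py (sketch, not the original implementation) ---
import re

import requests
from lxml import html


def get_tree(url):
    # fetch the page and parse it into an lxml tree; return None on failure
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return html.fromstring(response.content)


def make_num(text):
    # keep only the digit characters, e.g. "849 €" -> "849"
    return ''.join(c for c in text if c.isdigit())


def clean_string(text):
    # collapse runs of whitespace and strip leading/trailing spaces
    return re.sub(r'\s+', ' ', str(text)).strip()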
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove references and related content sections
    bad_section_finders = [
        "//div[@class='relatedItem']",          # http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class='citedBySection']",       # 10.3171/jns.1966.25.4.0458
        "//div[@class='references']",           # https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
        "//div[contains(@class, 'ref-list')]",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id='supplementary-material']",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[contains(@class, 'cta-guide-authors')]",   # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//div[contains(@class, 'footer-publication')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//d-appendix",                         # https://distill.pub/2017/aia/
        "//dt-appendix",                        # https://distill.pub/2016/handwriting/
        "//div[starts-with(@id, 'dt-cite')]",   # https://distill.pub/2017/momentum/
    ]
    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")
    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]
        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = [l for l in link]
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib["src"]:
                        link.anchor = u"image: {}".format(link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links
def ldlc_targeted(url):
    tree = util.get_tree(url)
    name = tree.xpath("/html/body/div[3]/div[2]/div[1]/h1/text()")[0]
    dispo = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[4]/div[1]/div[2]/div/span/text()")[0]
    # the price text ends with the currency symbol; drop the last character
    prix_ = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[1]/div/text()")[0][:-1]
    prix = util.make_num(prix_)
    return (util.clean_string(name), util.clean_string(dispo), util.clean_string(prix))
def check_pc_componentes(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        titres = tree.xpath("//div[@class = 'c-product-card__content']/header/h3/a/text()")
        prixs = tree.xpath("//div[@class = 'c-product-card__content']/div[2]/div/span/text()")
        dispos = tree.xpath("//div[@class = 'c-product-card__content']/div[3]/text()")
        results = []
        for titre, prix, dispo in zip(titres, prixs, dispos):
            # prices with decimals end in ",xx€"; drop the decimals and currency
            if ',' in prix:
                prix = util.make_num(prix[:-4])
            else:
                prix = util.make_num(prix)
            if int(prix) >= 850:
                continue
            if 'rtx' not in titre.lower():
                continue
            # skip refurbished, water-cooled and otherwise unwanted listings
            # (all lowercase, since the title is lowercased before comparison)
            avoid_words = [
                'reacondicionado', 'recondicionado', 'water', 'hydro', 'ekwb',
                'intel', 'ryzen', '2080', '2070', 'i7', 'i5', 'vector'
            ]
            if any(a in util.clean_string(titre.lower()) for a in avoid_words):
                continue
            if util.clean_string(dispo).lower() == "sin fecha de entrada":
                dispo = "Rupture"
            else:
                dispo = "Check dispo"
            results.append(('pccomponentes.com ' + util.clean_string(titre),
                            dispo,
                            util.clean_string(prix)))
        out_results += results
    return out_results
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove references and related content sections
    bad_section_finders = [
        "//div[@class='relatedItem']",     # http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class='citedBySection']",  # 10.3171/jns.1966.25.4.0458
        "//div[@class='references']"       # https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
    ]
    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")
    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]
        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = [l for l in link]
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib["src"]:
                        link.anchor = u"image: {}".format(link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links
def check_ldlc(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)
        nb_resultats = tree.xpath(
            '/html/body/div[3]/div/div[3]/div[1]/div/div[2]/div[1]/div[1]/text()')[0]
        nb = util.make_num(nb_resultats)
        # 48 is the maximum number of items on a page
        if int(nb) > 48:
            nb = 48
        results = []
        for i in range(1, int(nb) + 1):
            prix_ = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[4]/div[1]/div/text()")[0]
            prix = util.make_num(prix_)
            if int(prix) >= 850:
                continue
            titre = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[1]/div[1]/h3/a/text()")[0]
            if 'water' in titre.lower() or 'hydro' in titre.lower():
                continue
            dispo = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/text()")[0]
            # some availability labels have a second part inside an <em> tag
            dispo_p2 = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/em/text()")
            if len(dispo_p2) >= 1:
                dispo = dispo + ' ' + dispo_p2[0]
            results.append(('LDLC.com ' + util.clean_string(titre),
                            util.clean_string(dispo),
                            util.clean_string(prix)))
        out_results += results
    return out_results
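# A hedged usage sketch showing how the shop checkers above might be driven
# together. The listing URLs are placeholders, not the ones used by the
# original script, and the reporting step (a simple print) is an assumption.
if __name__ == "__main__":
    checkers = [
        (check_top_achat, ["https://www.topachat.com/..."]),        # placeholder listing URL
        (check_ldlc, ["https://www.ldlc.com/..."]),                 # placeholder listing URL
        (check_pc_componentes, ["https://www.pccomponentes.com/..."]),
    ]
    all_results = []
    for checker, urls in checkers:
        all_results += checker(urls)
    # ldlc_targeted(url) could likewise be called on a single product page and
    # its (name, dispo, prix) tuple appended to all_results.
    for titre, dispo, prix in all_results:
        # each result is a (title, availability, price) tuple
        print(f"{titre} | {dispo} | {prix} EUR")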
def get_pdf_in_meta(page):
    if "citation_pdf_url" in page:
        if DEBUG_SCRAPING:
            logger.info(u"citation_pdf_url in page")

        tree = get_tree(page)
        if tree is not None:
            metas = tree.xpath("//meta")
            for meta in metas:
                if "name" in meta.attrib:
                    if meta.attrib["name"] == "citation_pdf_url":
                        if "content" in meta.attrib:
                            link = DuckLink(href=meta.attrib["content"],
                                            anchor="<meta citation_pdf_url>")
                            return link
        else:
            # backup if tree parsing fails
            regex = r'<meta name="citation_pdf_url" content="(.*?)">'
            matches = re.findall(regex, page)
            if matches:
                link = DuckLink(href=matches[0], anchor="<meta citation_pdf_url>")
                return link
    return None
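# get_pdf_in_meta returns a DuckLink, which is not defined in this section.
# A minimal sketch of what it is assumed to be: a plain container exposing the
# same two attributes (href, anchor) that get_useful_links sets on real <a>
# elements, so both kinds of result can be handled uniformly downstream.
class DuckLink(object):
    def __init__(self, href, anchor):
        self.href = href
        self.anchor = anchor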
def test2():
    url = "http://getbootstrap.com/"
    tree = get_tree(url)
    e = tree.xpath('//div[@class="bs-docs-featurette"]/div[@class="container"]')[0]
    short_text = get_element_short_text(e)
    return short_text
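# get_element_short_text is not defined in this section; presumably it returns
# a short plain-text excerpt of the element. A minimal sketch under that
# assumption follows; the max_length parameter is hypothetical and the real
# helper may summarize the element differently.
def get_element_short_text(element, max_length=100):
    # collapse the element's text content into one line and truncate it
    text = ' '.join(element.text_content().split())
    return text[:max_length]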
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove related content sections
    bad_section_finders = [
        # references and related content sections
        "//div[@class='relatedItem']",          # http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class='citedBySection']",       # 10.3171/jns.1966.25.4.0458
        "//div[@class='references']",           # https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
        "//div[contains(@class, 'ref-list')]",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id='supplementary-material']",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[contains(@class, 'cta-guide-authors')]",   # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//div[contains(@class, 'footer-publication')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//d-appendix",                         # https://distill.pub/2017/aia/
        "//dt-appendix",                        # https://distill.pub/2016/handwriting/
        "//div[starts-with(@id, 'dt-cite')]",   # https://distill.pub/2017/momentum/
        "//ol[contains(@class, 'ref-item')]",   # http://www.cjcrcn.org/article/html_9778.html
        "//div[contains(@class, 'NLM_back')]",      # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[contains(@class, 'NLM_citation')]",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[@id='relatedcontent']",          # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//ul[@id='book-metrics']",             # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//section[@id='article_references']",  # https://www.nejm.org/doi/10.1056/NEJMms1702111
        "//div[@id='attach_additional_files']", # https://digitalcommons.georgiasouthern.edu/ij-sotl/vol5/iss2/14/
        "//span[contains(@class, 'fa-lock')]",  # https://www.dora.lib4ri.ch/eawag/islandora/object/eawag%3A15303

        # can't tell what chapter/section goes with what doi
        "//div[@id='booktoc']",                 # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//div[@id='tocWrapper']",              # https://www.elgaronline.com/view/edcoll/9781786431417/9781786431417.xml
    ]
    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")
    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]
        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = [l for l in link]
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib["src"]:
                        link.anchor = u"image: {}".format(link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links
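# A hedged example of how the two page-parsing helpers might be combined when
# looking for a fulltext PDF: prefer the citation_pdf_url <meta> tag, then fall
# back to scanning anchors whose text or target suggests a PDF. The function
# name find_pdf_link and the "pdf" heuristics below are assumptions for
# illustration, not the original project's selection logic; page_html is the
# raw HTML of an already-fetched landing page.
def find_pdf_link(page_html):
    meta_link = get_pdf_in_meta(page_html)
    if meta_link is not None:
        return meta_link.href
    for link in get_useful_links(page_html):
        # link.anchor is already lowercased (or "image: <src>" for image links)
        if "pdf" in link.anchor or link.href.lower().endswith(".pdf"):
            return link.href
    return None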