Example #1
def check_top_achat(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        # total result count shown in the page's navigation bar
        nb_resultats = tree.xpath(
            '//*[@id="content"]/nav[1]/ul/li[4]/text()')[0]
        nb = util.make_num(nb_resultats)
        results = []

        # price, title and availability of every product listed on the page
        liste_prix_ = tree.xpath(
            "//section[@class = 'produits list']//div[@itemprop= 'price']/text()"
        )
        liste_titres = tree.xpath(
            "//section[@class = 'produits list']//div[@class = 'libelle']/a/h3/text()"
        )
        liste_dispos = tree.xpath(
            "//section[@class = 'produits list']//section[last()]/@class")

        # the site-wide count can exceed what one page lists; clamp it
        nb = min(int(nb), len(liste_prix_))
        for i in range(nb):
            # drop the currency/decimal suffix (last four characters)
            prix_ = liste_prix_[i][0:-4]
            prix = util.make_num(prix_)
            if int(prix) >= 850:
                # ignore cards above the price cap
                continue

            titre = liste_titres[i]
            # promotional suffixes appended to some product titles
            geforce_ad = " + 1 an d'abonnement GeForce Now offert ! ".lower()
            call_of_ad = "+ Call of Duty: Black Ops Cold War offert ! ".lower()
            if 'water' in titre.lower() or 'hydro' in titre.lower():
                # skip water-cooled models
                continue
            elif geforce_ad in titre.lower():
                titre = titre[0:len(titre) - len(geforce_ad)]
            elif call_of_ad in titre.lower():
                titre = titre[0:len(titre) - len(call_of_ad)]

            # map the availability CSS class to a human-readable label
            raw_dispo = liste_dispos[i]
            if raw_dispo == 'en-rupture':
                dispo = 'Rupture'
            elif raw_dispo == 'dispo-sous-7-jours':
                dispo = 'sous 7 jours'
            elif raw_dispo == 'dispo-entre-7-15-jours':
                dispo = 'entre 7-15 jours'
            elif raw_dispo == 'dispo-plus-15-jours':
                dispo = '+ de 15 jours'
            else:
                dispo = raw_dispo

            results.append(
                ('topachat.com         ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))
        out_results += results

    return out_results
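
The shop checkers in these examples call into a small `util` module that is not shown. A minimal sketch of what it plausibly contains, assuming `get_tree` fetches and parses a page with `requests` and `lxml`, `make_num` keeps only digits, and `clean_string` collapses whitespace; all three bodies are reconstructions, not the original module:

import re

import requests
from lxml import html


def get_tree(url):
    # assumed behaviour: fetch the page and parse it into an lxml tree
    response = requests.get(url, timeout=10)
    return html.fromstring(response.content)


def make_num(text):
    # assumed behaviour: keep only the digits, e.g. "42 resultats" -> "42";
    # callers slice decimals/currency off the price text before calling this
    return "".join(c for c in text if c.isdigit())


def clean_string(text):
    # assumed behaviour: collapse whitespace runs and trim the ends
    return re.sub(r"\s+", " ", text).strip()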
Example #2
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove references and related-content sections before collecting links
    bad_section_finders = [
        "//div[@class=\'relatedItem\']",  #http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class=\'citedBySection\']",  #10.3171/jns.1966.25.4.0458
        "//div[@class=\'references\']",  #https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
        "//div[contains(@class, 'ref-list')]", #https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id=\'supplementary-material\']", #https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[contains(@class, 'cta-guide-authors')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//div[contains(@class, 'footer-publication')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//d-appendix",  # https://distill.pub/2017/aia/
        "//dt-appendix",  # https://distill.pub/2016/handwriting/
        "//div[starts-with(@id, 'dt-cite')]",  # https://distill.pub/2017/momentum/
    ]
    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")

    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]

        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = list(link)
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib["src"]:
                        link.anchor = u"image: {}".format(link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links
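
A usage sketch for `get_useful_links`, assuming `page` holds the raw HTML of an article landing page; the fetch and the PDF filter below are illustrative, not part of the original code:

import requests

# placeholder URL; any article landing page would do
page = requests.get("https://example.org/article").text
for link in get_useful_links(page):
    # every returned element carries the .anchor/.href set inside the function
    if "pdf" in link.anchor or link.href.lower().endswith(".pdf"):
        print(link.anchor, "->", link.href)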
Example #3
def ldlc_targeted(url):
    tree = util.get_tree(url)

    # absolute XPaths, tied to the current layout of an LDLC product page
    name = tree.xpath("/html/body/div[3]/div[2]/div[1]/h1/text()")[0]
    dispo = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[4]/div[1]/div[2]/div/span/text()"
    )[0]
    # [0:-1] drops the trailing currency symbol from the price text
    prix_ = tree.xpath(
        "/html/body/div[3]/div[2]/div[2]/div[3]/aside/div[1]/div/text()"
    )[0][0:-1]

    prix = util.make_num(prix_)
    return (util.clean_string(name), util.clean_string(dispo),
            util.clean_string(prix))
Example #4
def check_pc_componentes(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        titres = tree.xpath(
            "//div[@class = 'c-product-card__content']/header/h3/a/text()")
        prixs = tree.xpath(
            "//div[@class = 'c-product-card__content']/div[2]/div/span/text()"
        )
        dispos = tree.xpath(
            "//div[@class = 'c-product-card__content']/div[3]/text()")

        results = []
        for titre, prix, dispo in zip(titres, prixs, dispos):
            # prices with decimals carry a 4-character suffix to drop first
            if ',' in prix:
                prix = util.make_num(prix[0:-4])
            else:
                prix = util.make_num(prix)

            if int(prix) >= 850:
                # ignore cards above the price cap
                continue

            if 'rtx' not in titre.lower():
                continue

            # matched against the lowercased title, so keep every keyword
            # lowercase as well
            avoid_bool = False
            avoid_words = [
                'reacondicionado', 'recondicionado', 'water', 'hydro', 'ekwb',
                'intel', 'ryzen', '2080', '2070', 'i7', 'i5', 'vector'
            ]
            for a in avoid_words:
                if a in util.clean_string(titre.lower()):
                    avoid_bool = True
                    break

            if avoid_bool:
                continue

            # "sin fecha de entrada" means no restock date is announced
            if util.clean_string(dispo).lower() == "sin fecha de entrada":
                dispo = "Rupture"
            else:
                dispo = "Check dispo"

            results.append(
                ('pccomponentes.com    ' + util.clean_string(titre), dispo,
                 util.clean_string(prix)))

        out_results += results
    return out_results
Example #5
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove references and related-content sections before collecting links
    bad_section_finders = [
        "//div[@class=\'relatedItem\']",  #http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class=\'citedBySection\']",  #10.3171/jns.1966.25.4.0458
        "//div[@class=\'references\']"  #https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
    ]
    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")

    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]

        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = list(link)
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib[
                            "src"]:
                        link.anchor = u"image: {}".format(
                            link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links
Example #6
def check_ldlc(urls):
    out_results = []
    for url in urls:
        tree = util.get_tree(url)

        nb_resultats = tree.xpath(
            '/html/body/div[3]/div/div[3]/div[1]/div/div[2]/div[1]/div[1]/text()'
        )[0]
        nb = util.make_num(nb_resultats)

        # 48 is the maximum number of items on a page
        if int(nb) > 48:
            nb = 48

        results = []

        for i in range(1, int(nb) + 1):
            prix_ = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[4]/div[1]/div/text()"
            )[0]
            prix = util.make_num(prix_)
            if int(prix) >= 850:
                # ignore cards above the price cap
                continue

            titre = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[1]/div[1]/h3/a/text()"
            )[0]
            if 'water' in titre.lower() or 'hydro' in titre.lower():
                # skip water-cooled models
                continue

            dispo = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/text()"
            )[0]

            # some availability labels have a second part inside an <em> child
            dispo_p2 = tree.xpath(
                f"//*[@id='listing']//*[@data-position='{i}']/div[2]/div[3]/div/div[2]/div/span/em/text()"
            )
            if len(dispo_p2) >= 1:
                dispo = dispo + ' ' + dispo_p2[0]

            results.append(('LDLC.com             ' + util.clean_string(titre),
                            util.clean_string(dispo), util.clean_string(prix)))

        out_results += results

    return out_results
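
The shop checkers share one calling convention: a list of search URLs in, a list of `(titre, dispo, prix)` tuples out. A hypothetical driver (the search URL is a placeholder, not from the original code):

if __name__ == "__main__":
    # placeholder search URL; any LDLC listing page would do
    stock = check_ldlc(["https://www.ldlc.com/recherche/rtx+3080/"])
    for titre, dispo, prix in stock:
        print(f"{titre} | {dispo} | {prix}")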
Example #7
def get_pdf_in_meta(page):
    if "citation_pdf_url" in page:
        if DEBUG_SCRAPING:
            logger.info(u"citation_pdf_url in page")

        tree = get_tree(page)
        if tree is not None:
            metas = tree.xpath("//meta")
            for meta in metas:
                if "name" in meta.attrib:
                    if meta.attrib["name"]=="citation_pdf_url":
                        if "content" in meta.attrib:
                            link = DuckLink(href=meta.attrib["content"], anchor="<meta citation_pdf_url>")
                            return link
        else:
            # backup if tree fails
            regex = r'<meta name="citation_pdf_url" content="(.*?)">'
            matches = re.findall(regex, page)
            if matches:
                link = DuckLink(href=matches[0], anchor="<meta citation_pdf_url>")
                return link
    return None
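
`DuckLink` is not defined in these snippets. Judging from how it is constructed here and consumed alongside the lxml `<a>` elements in `get_useful_links` (an object exposing `.href` and `.anchor`), a plausible stand-in:

class DuckLink(object):
    # assumed shape: a bare container that duck-types the .href/.anchor
    # attributes set on real lxml link elements elsewhere
    def __init__(self, href, anchor):
        self.href = href
        self.anchor = anchor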
Example #8
def test2():
    url = "http://getbootstrap.com/"
    tree = get_tree(url)
    # first featurette container on the Bootstrap homepage
    e = tree.xpath('//div[@class="bs-docs-featurette"]/div[@class="container"]')[0]
    short_text = get_element_short_text(e)
    return short_text
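
`get_element_short_text` is also not shown. A minimal sketch consistent with the test, assuming it returns the first chunk of the element's visible text; the 200-character cap is an illustrative choice, not from the original:

def get_element_short_text(element, max_length=200):
    # hypothetical helper: flatten the element's text and truncate it;
    # max_length is an assumed parameter, not from the original code
    text = " ".join(element.text_content().split())
    return text[:max_length]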
Example #9
def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove related content sections

    bad_section_finders = [
        # references and related content sections
        "//div[@class=\'relatedItem\']",  #http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class=\'citedBySection\']",  #10.3171/jns.1966.25.4.0458
        "//div[@class=\'references\']",  #https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
        "//div[contains(@class, 'ref-list')]",  #https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id=\'supplementary-material\']",  #https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[contains(@class, 'cta-guide-authors')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//div[contains(@class, 'footer-publication')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//d-appendix",  # https://distill.pub/2017/aia/
        "//dt-appendix",  # https://distill.pub/2016/handwriting/
        "//div[starts-with(@id, 'dt-cite')]",  # https://distill.pub/2017/momentum/
        "//ol[contains(@class, 'ref-item')]",  # http://www.cjcrcn.org/article/html_9778.html
        "//div[contains(@class, 'NLM_back')]",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[contains(@class, 'NLM_citation')]",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[@id=\'relatedcontent\']",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//ul[@id=\'book-metrics\']",  # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//section[@id=\'article_references\']",  # https://www.nejm.org/doi/10.1056/NEJMms1702111
        "//div[@id=\'attach_additional_files\']",  # https://digitalcommons.georgiasouthern.edu/ij-sotl/vol5/iss2/14/
        "//span[contains(@class, 'fa-lock')]",  # https://www.dora.lib4ri.ch/eawag/islandora/object/eawag%3A15303

        # can't tell what chapter/section goes with what doi
        "//div[@id=\'booktoc\']",  # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//div[@id=\'tocWrapper\']",  # https://www.elgaronline.com/view/edcoll/9781786431417/9781786431417.xml
    ]

    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")

    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]

        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = list(link)
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib[
                            "src"]:
                        link.anchor = u"image: {}".format(
                            link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links