Example #1
    def test_readtext(self):
        # smoke test: fetch a real article and make sure readtext runs without raising
        url = 'http://globoesporte.globo.com/futebol/brasileirao-serie-a/noticia/2012/11/gremio-reage-empata-com-lusa-e-confirma-rebaixamento-do-palmeiras.html'
        soup = crawler.loadbs(url)
        crawler.readtext(soup)
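
For context, loadbs and readtext are helpers from the project's crawler module, not shown here. A minimal sketch of what a loadbs-style fetch-and-parse helper could look like, assuming it simply downloads the page and hands it to BeautifulSoup (an illustration, not the project's actual code):

import urllib.request

import bs4

def loadbs(url):
    # fetch the page and parse it; "html.parser" avoids an external dependency
    with urllib.request.urlopen(url) as resp:
        html = resp.read().decode("utf-8", "ignore")
    return bs4.BeautifulSoup(html, "html.parser")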
Example #2
import io
import re

import bs4

# db, crawler, urlhelper and get_text are project-local helpers not shown in this snippet.

def process_html(url, request, response, data, **kwarg):
    data = str(data, "utf-8", "ignore")
    try:
        # name the parser explicitly; bs4's default varies by environment and emits a warning
        soup = bs4.BeautifulSoup(data, "html.parser")
    except Exception:
        return

    # drop <script> tags; they only pollute the extracted text
    for script in soup.find_all("script"):
        script.decompose()

    link_url = kwarg["link_url"]
    this_domain_id = kwarg["domain_id"]
    this_url_id = kwarg["url_id"]

    # save and register references for every image and link on this page
    imgs = soup.find_all("img", src=True)
    links = soup.find_all("a", href=True)
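    # db.scope() presumably opens one session/transaction covering all of the writes below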
    with db.scope():
        for img in imgs:
            img_url = urlhelper.Url.parse(img.get("src"), url)
            img_title = img.get("title")

            domain_id = db.save_domain(img_url.domain)
            # the last argument is a project-specific URL-type flag; 2 appears to mark image URLs
            url_id = db.save_url(domain_id, img_url.path, None, 2)
            db.associate(this_url_id, url_id, img_title)

        for link in links:
            # skip links whose scheme is anything but http/https (mailto:, javascript:, ...)
            m = re.match(r"\s*(\w+):", link.get("href"))
            if m and m.group(1) not in ("http", "https"):
                continue

            link_text = get_text(link).strip()
            # note that this rebinds link_url, shadowing the value read from kwarg above
            link_url = urlhelper.Url.parse(link.get("href"), url)

            domain_id = db.save_domain(link_url.domain)
            url_id = db.save_url(domain_id, link_url.path, None, None)
            db.associate(this_url_id, url_id, link_text)

        hs = soup.find_all("h1")
        hs += soup.find_all("h2")
        hs += soup.find_all("h3")
        hs += soup.find_all("h4")
        hs += soup.find_all("h5")
        hs += soup.find_all("h6")

        for hx in hs:
            # keep a header only when it is not mostly link text (anchor text < 30% of its characters)
            if not hx.a or (len(hx.get_text()) > 0 and len(hx.a.get_text()) / len(hx.get_text()) < 0.3):
                header_text = get_text(hx).strip()
                db.save_header(this_url_id, header_text)

        # output collects the plain text of the page; output_html keeps the matching markup
        output = io.StringIO()
        output_html = io.StringIO()
        # crawler.readtext is a project helper; judging by the loop below, it yields the
        # page's text-bearing nodes (plain strings and block elements)
        text_elements = crawler.readtext(soup)

        for el in text_elements:
            if isinstance(el, bs4.NavigableString):
                output_html.write(str(el) + "\n")
                output.write(el)
            # same link-density filter as for headers: skip elements that are mostly anchor text
            elif not el.a or (len(el.get_text()) > 0 and len(el.a.get_text()) / len(el.get_text()) < 0.3):
                output_html.write(str(el) + "\n")
                output.write(get_text(el))

        # page title: prefer og:title, then twitter:title, then <meta name="title">, then <title>
        for attrs in ({"property": "og:title"}, {"name": "twitter:title"}, {"name": "title"}):
            meta = soup.find("meta", attrs=attrs)
            if meta:
                title = meta.get("content")
                break
        else:
            # no matching meta tag at all: fall back to the document <title>
            title = get_text(soup.title)

        # description: same fallback chain, without a <title> fallback
        description = None
        for attrs in ({"property": "og:description"}, {"name": "twitter:description"}, {"name": "description"}):
            meta = soup.find("meta", attrs=attrs)
            if meta:
                description = meta.get("content")
                break

        try:
            print("HTML:", this_url_id, url)
        except Exception:
            # printing can fail when the console encoding cannot represent the URL; not fatal
            pass

        # collapse runs of spaces in the plain-text body before persisting
        db.save_document(this_url_id, title, description, re.sub(" +", " ", output.getvalue()), output_html.getvalue())
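
A hedged usage sketch: process_html expects the raw response bytes plus link_url, domain_id and url_id keyword arguments, so a caller might look roughly like this (the fetch code and the ID values are illustrative assumptions, not the project's real call site):

import urllib.request

url = "http://example.com/"
with urllib.request.urlopen(url) as response:
    body = response.read()

process_html(url, None, response, body, link_url=url, domain_id=1, url_id=1)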