def get_top_headers(domain):
    with db.scope():
        result = db.get_top_headers(domain)
        if result:
            return {'success': result}
    return {}
def get_domain_links(domain):
    with db.scope():
        links = db.get_links(domain)
        if links:
            return {'success': links}
    return {}
def get_text(urlId):
    with db.scope():
        document = db.get_url_document(urlId)
        if document:
            return {'success': document}
    return {}
def getwebsiteinfo(domain):
    with db.scope():
        domain = db.correct_name(domain)
        if domain:
            linksTo = db.get_link_to(domain)
            linkedBy = db.get_linked_by(domain)

            info = {
                'domain': domain,
                'linksTo': {},
                'linkedBy': {},
                'details': {
                    'links': db.get_count_local_links(domain),
                    'foreignLinks': db.get_count_foreign_links(domain),
                    'images': db.get_count_images(domain),
                    'mainCategories': ['Geral']
                }
            }

            # count the links of each related domain, in both directions
            for link in linksTo:
                info['linksTo'][link] = db.get_count_local_links(link)
            for link in linkedBy:
                info['linkedBy'][link] = db.get_count_foreign_links(link)

            return info
def process(url, request, response, data):
    request = CaseInsensitiveDict.fromdict(json.loads(request))
    response = CaseInsensitiveDict.fromdict(json.loads(response))

    # check whether the response has an allowed content-type
    m = re.match(r"^\s*([\w-]+/[\w-]+)", response.get("Content-Type", "text/html"))
    if not m or m.group(1) not in allowed_mimes:
        return

    # figure out the kind of data in the response and which
    # function will handle it
    if m.group(1) == "text/html":
        func = process_html
        link_type = 1
    elif m.group(1).startswith("image/"):
        func = process_img
        link_type = 2
    elif m.group(1) == "text/plain":
        func = process_plain
        link_type = 3

    # save the link without its contents, so the necessary references can be created
    link_url = urlhelper.Url.parse(url)
    with db.scope():
        this_domain_id = db.save_domain(link_url.domain)
        this_url_id = db.save_url(this_domain_id, link_url.path, m.group(1), link_type)

    kwarg = {"link_url": link_url, "domain_id": this_domain_id, "url_id": this_url_id}

    # call the handler specific to this url
    func(url, request, response, data, **kwarg)
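# Usage sketch (an assumption about the caller, which is not shown here): process()
# expects the request and response headers serialized as JSON strings and the raw
# body as bytes, and then dispatches to process_html/process_img/process_plain
# based on the Content-Type header. A hypothetical call could look like:
#
#   process(
#       "http://example.com/index.html",
#       json.dumps({"User-Agent": "crawler/0.1"}),
#       json.dumps({"Content-Type": "text/html; charset=utf-8"}),
#       b"<html><body><h1>Example</h1></body></html>",
#   )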
def get_image(urlId):
    response.content_type = 'image/jpeg'
    with db.scope():
        blob = db.get_image(urlId)
        if blob:
            return blob
    abort(404, 'Image not found.')
def process_img(url, request, response, data, **kwarg):
    this_url_id = kwarg["url_id"]

    # logging may fail on terminals that cannot encode the url, so ignore errors
    try:
        print("IMAGE:", this_url_id, url)
    except Exception:
        pass

    # generate a 140x140 thumbnail and store it if the resize succeeded
    blob = images.resize(140, 140, data)
    if blob:
        with db.scope():
            db.save_image(this_url_id, blob)
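# Note: the images module is not included here. A minimal sketch of a resize helper
# with the same (width, height, data) -> bytes-or-None contract, assuming Pillow is
# available, might look like the following (hypothetical, not the project's code):
#
#   from io import BytesIO
#   from PIL import Image
#
#   def resize(width, height, data):
#       try:
#           img = Image.open(BytesIO(data))
#           img.thumbnail((width, height))
#           out = BytesIO()
#           img.convert("RGB").save(out, "JPEG")
#           return out.getvalue()
#       except OSError:
#           return None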
def get_domain_images(name):
    with db.scope():
        images = db.get_domain_images(name)
        return {'success': images}
def get_url_images(urlId):
    with db.scope():
        images = db.get_images(urlId)
        return {'success': images}
def typeaheaddomain(query):
    with db.scope():
        result = db.querydomain(query)
        return {'options': result}
def test_add_url(self):
    with db.scope():
        domain_id = db.save_domain('google.com')
        url_id = db.save_url(domain_id, '/google+/index.aspx?user=main#2')
        self.assertIsInstance(url_id, int)
def test_add_domain(self):
    with db.scope():
        self.assertIsInstance(db.save_domain('google.com'), int)
def process_html(url, request, response, data, **kwarg):
    data = str(data, "utf-8", "ignore")

    try:
        soup = bs4.BeautifulSoup(data)
    except Exception:
        return

    # drop the scripts, they only get in the way
    for script in soup.find_all("script"):
        script.decompose()

    link_url = kwarg["link_url"]
    this_domain_id = kwarg["domain_id"]
    this_url_id = kwarg["url_id"]

    # save and create a reference for every link on this page
    imgs = soup.find_all("img", src=True)
    links = soup.find_all("a", href=True)

    with db.scope():
        for img in imgs:
            img_url = urlhelper.Url.parse(img.get("src"), url)
            img_title = img.get("title")

            domain_id = db.save_domain(img_url.domain)
            url_id = db.save_url(domain_id, img_url.path, None, 2)
            db.associate(this_url_id, url_id, img_title)

        for link in links:
            # skip schemes other than http/https (mailto:, javascript:, ...)
            m = re.match(r"\s*(\w+):", link.get("href"))
            if m and m.group(1) not in ("http", "https"):
                continue

            link_text = get_text(link).strip()
            link_url = urlhelper.Url.parse(link.get("href"), url)

            domain_id = db.save_domain(link_url.domain)
            url_id = db.save_url(domain_id, link_url.path, None, None)
            db.associate(this_url_id, url_id, link_text)

        # save the page headings, ignoring those that are mostly anchor text
        hs = soup.find_all("h1")
        hs += soup.find_all("h2")
        hs += soup.find_all("h3")
        hs += soup.find_all("h4")
        hs += soup.find_all("h5")
        hs += soup.find_all("h6")

        for hx in hs:
            if not hx.a or (len(hx.get_text()) > 0 and len(hx.a.get_text()) / len(hx.get_text()) < 0.3):
                header_text = get_text(hx).strip()
                db.save_header(this_url_id, header_text)

        # collect the readable text of the page, both as plain text and as html
        output = io.StringIO()
        outputHtml = io.StringIO()

        text_elements = crawler.readtext(soup)
        for el in text_elements:
            if isinstance(el, bs4.NavigableString):
                outputHtml.write(str(el) + "\n")
                output.write(el)
            elif not el.a or (len(el.get_text()) > 0 and len(el.a.get_text()) / len(el.get_text()) < 0.3):
                outputHtml.write(str(el) + "\n")
                output.write(get_text(el))

        # pick the best available title: og:title, then twitter:title,
        # then the title meta tag, and finally the <title> element
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title:
            title = og_title.get("content")
        else:
            twitter_title = soup.find("meta", attrs={"name": "twitter:title"})
            if twitter_title:
                title = twitter_title.get("content")
            else:
                main_title = soup.find("meta", attrs={"name": "title"})
                if main_title:
                    title = main_title.get("content")
                else:
                    title = get_text(soup.title)

        # same for the description: og:description, then twitter:description,
        # then the description meta tag
        og_description = soup.find("meta", attrs={"property": "og:description"})
        if og_description:
            description = og_description.get("content")
        else:
            twitter_description = soup.find("meta", attrs={"name": "twitter:description"})
            if twitter_description:
                description = twitter_description.get("content")
            else:
                main_description = soup.find("meta", attrs={"name": "description"})
                if main_description:
                    description = main_description.get("content")
                else:
                    description = None

        # logging may fail on terminals that cannot encode the url, so ignore errors
        try:
            print("HTML:", this_url_id, url)
        except Exception:
            pass

        db.save_document(this_url_id, title, description,
                         re.sub(" +", " ", output.getvalue()), outputHtml.getvalue())