def start(urls: list) -> None:
    """Fetch and print the page text for each URL in *urls*.

    Args:
        urls: Non-empty list of URL strings.

    Raises:
        ValueError: If *urls* is None or empty.
    """
    # Explicit validation instead of `assert`, which is silently stripped
    # when Python runs with optimizations (-O).
    if not urls:
        raise ValueError("urls must be a non-empty list")
    DBG(f'start\nurls: {urls}')
    for url in urls:
        text = pagetext(url)
        print(url)
        print(text)
def resolve_tag_info(self, info, url):
    """Fetch *url* and return a TagInfo describing its <meta> tag attributes."""
    DBG("Resolving meta tag info.")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    # One DataFrame row per <meta> tag; absent attributes become ''.
    attr_frame = pd.DataFrame(
        [tag.attrs for tag in soup.find_all('meta')]
    ).fillna('')
    return TagInfo(
        url=url,
        tag_cols=attr_frame.columns,
        tag_data=attr_frame.values,
    )
def resolve_nlp_info(self, info, url):
    """Run named-entity recognition on the page text of *url*; return an NLPInfo."""
    text = pagetext(url)
    global nlp
    # Load the spaCy model lazily on first use — loading is expensive,
    # so the module-level `nlp` caches it for later calls.
    if not nlp:
        logger.info("Loading NLP model.")
        nlp = spacy.load("en_core_web_lg")
        DBG("Done loading NLP model.")
    doc = nlp(text)
    entity_labels = {ent.text.strip(): ent.label_ for ent in doc.ents}
    markup = displacy.render(doc, style="ent")
    return NLPInfo(
        url=url,
        text=escape(text),
        objects=entity_labels.items(),
        displacy_markup=markup,
    )
def resolve_html(self, info):
    """Return the raw HTML of this object's URL."""
    DBG("Resolving HTML.")
    page = pagehtml(self.url)
    DBG(f'HTML length: {len(page)}')
    return page
def resolve_request_info(self, info):
    """Return request metadata for this object's URL as a list of (key, value) pairs."""
    DBG("Resolving request info.")
    response = requests.get(self.url)
    return list(get_info(response).items())
def resolve_summary_subjectivity(self, info):
    """Return the TextBlob subjectivity score of a 3-sentence page summary."""
    DBG("resolve_summary_subjectivity")
    page = pagehtml(self.url)
    sentiment = TextBlob(get_sumy(3, page, self.url)).sentiment
    return sentiment.subjectivity
def resolve_summary(self, info):
    """Return a 3-sentence extractive summary of this object's page."""
    DBG("Resolving summary.")
    page = pagehtml(self.url)
    return get_sumy(3, page, self.url)
def resolve_subjectivity(self, info):
    """Return the TextBlob sentiment subjectivity of self.text."""
    DBG("Resolving subjectivity.")
    return TextBlob(self.text).sentiment.subjectivity
def resolve_polarity(self, info):
    """Return the TextBlob sentiment polarity of self.text."""
    DBG("Resolving polarity.")
    return TextBlob(self.text).sentiment.polarity
def resolve_links(self, info):
    """Yield every link discovered in self.text, logging each one as found."""
    # A plain loop (not `yield from`) so each link is logged before it is yielded.
    for found in urlutils.find_all_links(self.text):
        DBG(f'Found link: {found}')
        yield found