示例#1
0
def start(urls: list) -> None:
    """Print every URL in *urls*, each followed by its ``pagetext`` result.

    Args:
        urls: Non-empty collection of URLs. ``None`` or an empty
            collection trips the assertion below.

    Raises:
        AssertionError: If *urls* is ``None`` or has length zero.
    """
    assert urls is not None and len(urls)
    DBG(f'start\nurls: {urls}')
    for page_url in urls:
        # Fetch first so nothing is printed for a URL whose fetch fails.
        page_text = pagetext(page_url)
        print(page_url, page_text, sep='\n')
示例#2
0
 def resolve_tag_info(self, info, url):
     """Fetch *url* and return the attributes of its ``<meta>`` tags.

     The response body is parsed with BeautifulSoup (``lxml`` parser); the
     attribute dicts of all ``<meta>`` elements are loaded into a pandas
     DataFrame, with missing attributes filled by empty strings.

     Args:
         info: Not used by this method.
         url: Address of the page to download.

     Returns:
         A ``TagInfo`` built from *url*, the DataFrame's columns
         (``tag_cols``) and its values array (``tag_data``).

     Raises:
         requests.exceptions.RequestException: If the request fails,
             including when the server does not answer within the timeout.
     """
     DBG("Resolving meta tag info.")
     # Without a timeout, requests waits forever on an unresponsive host,
     # which would block the caller indefinitely.
     response = requests.get(url, timeout=30)
     soup = BeautifulSoup(response.content, 'lxml')
     meta_tags = soup.find_all('meta')
     # One row per <meta> tag; columns are the union of attribute names.
     df = pd.DataFrame([tag.attrs for tag in meta_tags]).fillna('')
     return TagInfo(
         url=url,
         tag_cols=df.columns,
         tag_data=df.values,
     )
示例#3
0
 def resolve_nlp_info(self, info, url):
     """Run the NLP pipeline over the text of *url* and package the result.

     The module-level ``nlp`` object is loaded on demand: when it is falsy,
     ``spacy.load("en_core_web_lg")`` is called and the result is stored
     back into the global so later calls reuse it.

     Args:
         info: Not used by this method.
         url: Address whose text is obtained through ``pagetext``.

     Returns:
         An ``NLPInfo`` carrying the URL, the escaped text, the
         ``(entity text, label)`` pairs and the displaCy ``"ent"`` markup.
     """
     global nlp
     text = pagetext(url)
     if not nlp:
         logger.info("Loading NLP model.")
         nlp = spacy.load("en_core_web_lg")
         DBG("Done loading NLP model.")
     doc = nlp(text)
     # Keyed by stripped entity text, so a repeated entity keeps only the
     # label seen last.
     objects = {}
     for ent in doc.ents:
         objects[ent.text.strip()] = ent.label_
     markup = displacy.render(doc, style="ent")
     return NLPInfo(
         url=url,
         text=escape(text),
         objects=objects.items(),
         displacy_markup=markup,
     )
示例#4
0
 def resolve_html(self, info):
     """Return what ``pagehtml`` produces for ``self.url``.

     ``info`` is not used. The length of the result is reported through
     ``DBG`` before it is handed back.
     """
     DBG("Resolving HTML.")
     page_source = pagehtml(self.url)
     DBG(f'HTML length: {len(page_source)}')
     return page_source
示例#5
0
 def resolve_request_info(self, info):
     """Request ``self.url`` and return ``get_info``'s items for the response.

     Args:
         info: Not used by this method.

     Returns:
         A list of the ``(key, value)`` pairs from the mapping that
         ``get_info`` returns for the response.

     Raises:
         requests.exceptions.RequestException: If the request fails,
             including when the server does not answer within the timeout.
     """
     DBG("Resolving request info.")
     # Without a timeout, requests waits forever on an unresponsive host,
     # which would block the caller indefinitely.
     response = requests.get(self.url, timeout=30)
     return list(get_info(response).items())
示例#6
0
 def resolve_summary_subjectivity(self, info):
     """Return the TextBlob subjectivity of the page summary.

     The summary is produced by ``get_sumy(3, html, self.url)`` from the
     HTML that ``pagehtml`` returns for ``self.url`` (the ``3`` is
     presumably a sentence count; confirm against ``get_sumy``).
     ``info`` is not used.
     """
     DBG("resolve_summary_subjectivity")
     page_source = pagehtml(self.url)
     summary_text = get_sumy(3, page_source, self.url)
     return TextBlob(summary_text).sentiment.subjectivity
示例#7
0
 def resolve_summary(self, info):
     """Return the summary ``get_sumy`` builds for the page at ``self.url``.

     The HTML comes from ``pagehtml(self.url)`` and is passed on as
     ``get_sumy(3, html, self.url)`` (the ``3`` is presumably a sentence
     count; confirm against ``get_sumy``). ``info`` is not used.
     """
     DBG("Resolving summary.")
     page_source = pagehtml(self.url)
     return get_sumy(3, page_source, self.url)
示例#8
0
 def resolve_subjectivity(self, info):
     """Return the TextBlob sentiment subjectivity of ``self.text``.

     ``info`` is not used.
     """
     DBG("Resolving subjectivity.")
     sentiment = TextBlob(self.text).sentiment
     return sentiment.subjectivity
示例#9
0
 def resolve_polarity(self, info):
     """Return the TextBlob sentiment polarity of ``self.text``.

     ``info`` is not used.
     """
     DBG("Resolving polarity.")
     sentiment = TextBlob(self.text).sentiment
     return sentiment.polarity
示例#10
0
 def resolve_links(self, info):
     """Yield each link that ``urlutils.find_all_links`` finds in ``self.text``.

     This is a generator: nothing runs until it is iterated, and every link
     is reported through ``DBG`` just before being yielded. ``info`` is not
     used.
     """
     found_links = urlutils.find_all_links(self.text)
     for href in found_links:
         DBG(f'Found link: {href}')
         yield href