def get_scores(text):
    """Compute a battery of readability metrics for *text*.

    Returns a dict mapping short metric keys ('ari', 'fkgl', 'cli',
    'fre', 'gfi', 'lix', 'rix', 'smog') to the corresponding scores
    produced by the Readability analyser.
    """
    analysis = Readability(text)
    return {
        'ari': analysis.ARI(),
        'fkgl': analysis.FleschKincaidGradeLevel(),
        'cli': analysis.ColemanLiauIndex(),
        'fre': analysis.FleschReadingEase(),
        'gfi': analysis.GunningFogIndex(),
        'lix': analysis.LIX(),
        'rix': analysis.RIX(),
        'smog': analysis.SMOGIndex(),
    }
def setup(self):
    """Reset per-document state and precompute readability scores.

    Clears every word bucket and flag used during analysis, then runs
    the Readability analyser over ``self.text`` and stores four index
    scores as attributes.
    """
    self.words = []
    self.nouns = {}
    self.verbs = {}
    self.similarity = -1
    # Word buckets are filled elsewhere during analysis; start empty.
    for bucket in ('active_words', 'passive_words', 'direct_words',
                   'indirect_words', 'positive_words', 'negative_words'):
        setattr(self, bucket, [])
    self.line_break = False
    scorer = Readability(self.text)
    self.FleschReadingEase = scorer.FleschReadingEase()
    self.FleschKincaidGradeLevel = scorer.FleschKincaidGradeLevel()
    self.GunningFogIndex = scorer.GunningFogIndex()
    self.SMOGIndex = scorer.SMOGIndex()
def getReadabilities(string):
    """Return ``(FleschReadingEase, FleschKincaidGradeLevel)`` for *string*."""
    scorer = Readability(string)
    ease = scorer.FleschReadingEase()
    grade = scorer.FleschKincaidGradeLevel()
    return ease, grade
class Article(ArticleText):
    """Scrape a web article and expose metadata, link/media counts,
    weekday flags, LDA topics and readability statistics.

    Parameters
    ----------
    url : str
        Page URL; fetched with ``requests`` unless *raw* is supplied.
    raw : bytes, optional
        Pre-fetched HTML body; when falsy the page is downloaded.

    Raises
    ------
    Exception
        If no content container can be located in the page.
    """

    def __init__(self, url, raw=None):
        if not raw:
            raw = requests.get(url).content
        soup = BeautifulSoup(raw, 'lxml')
        self.url = url
        self.metadata = self.getMeta(soup)
        if self.metadata['content']:
            # Prefer the first <section> inside the content container,
            # falling back to the container itself.
            content = self.metadata['content'].find("section")
            if not content:
                content = self.metadata['content']
            content = " ".join(content.stripped_strings)
        else:
            raise Exception(
                "No content found for", url,
                "\nPlease add custom constraints [if any] in attributes_list.py"
            )
        self.rd = Readability(content)
        super().__init__(self.metadata['title'], content)

    def iterTillHit(self, soup, arglist, target=None):
        """Try each ``soup.find`` argument tuple in *arglist* until one hits.

        Returns the matched tag when *target* is None; otherwise the tag's
        text if non-empty, else its *target* attribute.  Returns None when
        nothing in *arglist* matches.
        """
        for arg in arglist:
            cont = soup.find(*arg)
            if cont:
                if not target:
                    return cont
                # NOTE(review): text wins over the attribute when both are
                # present; for <meta> tags text is normally empty, so the
                # attribute value is what gets returned.
                if cont.text:
                    return cont.text
                return cont[target]
        return None

    def getMeta(self, soup):
        """Extract title, keywords, description, author, publish date and
        the main content container from *soup* as a dict."""
        attr_d = {}
        attr_d['title'] = self.iterTillHit(soup, TITLE_L, 'content')
        attr_d['keyword'] = self.iterTillHit(soup, KEYWORD_L, 'content')
        attr_d['desc'] = self.iterTillHit(soup, DESC_L, 'content')
        attr_d['author'] = self.iterTillHit(soup, AUTHOR_L, 'content')
        attr_d['published'] = self.iterTillHit(soup, PUBLISHED_L, 'content')
        attr_d['content'] = self.iterTillHit(soup, CONTENT_L)
        return attr_d

    def num_hrefs(self):
        """Total number of anchors with an ``href`` in the article body."""
        return len(self.metadata['content'].findAll("a", href=True))

    def num_self_hrefs(self):
        """Number of links pointing back to the article's own host."""
        site = urlparse(self.url)[1]  # netloc component
        return sum(
            1 for href in self.metadata['content'].findAll("a", href=True)
            if site in href['href']
        )

    def num_imgs(self):
        """Number of ``<img>`` tags in the article body."""
        return len(self.metadata['content'].findAll("img"))

    def num_videos(self):
        """Number of embedded ``<iframe>`` tags (proxy for videos)."""
        return len(self.metadata['content'].findAll("iframe"))

    def num_keywords(self):
        """Number of comma-separated entries in the keyword meta tag."""
        keyword = self.metadata['keyword']
        return len(keyword.split(",")) if keyword else 0

    def daystuff(self):
        """Return one-hot weekday flags plus an ``is_weekend`` flag.

        Every flag stays 0 when the published date is missing or cannot
        be parsed (``dateparser.parse(None)`` raises TypeError).
        """
        weekday_dict = [["weekday_is_monday", 0], ["weekday_is_tuesday", 0],
                        ["weekday_is_wednesday", 0], ["weekday_is_thursday", 0],
                        ["weekday_is_friday", 0], ["weekday_is_saturday", 0],
                        ["weekday_is_sunday", 0], ["is_weekend", 0]]
        try:
            weekday = dateparser.parse(self.metadata['published']).weekday()
            weekday_dict[weekday][1] = 1
            weekday_dict[-1][1] = 1 if weekday > 4 else 0
        except TypeError:
            # Missing/unparseable date: leave all flags at 0.
            pass
        # BUG FIX: the original returned from a `finally:` block, which
        # silently swallowed every exception other than TypeError.
        return dict(weekday_dict)

    def lda(self):
        """Map the title's LDA topic weights to ``LDA_xx`` keys."""
        # getLDA(...)[0] is presumably a sequence of (index, weight)
        # pairs — verify against the getLDA implementation.
        lda_pairs = getLDA(self.metadata['title'])[0]
        return {"LDA_%.2d" % index: val for index, val in lda_pairs}

    def readability(self):
        """Eight readability scores computed over the article content."""
        return {
            'ARI': self.rd.ARI(),
            'FleschReadingEase': self.rd.FleschReadingEase(),
            'FleschKincaidGradeLevel': self.rd.FleschKincaidGradeLevel(),
            'GunningFogIndex': self.rd.GunningFogIndex(),
            'SMOGIndex': self.rd.SMOGIndex(),
            'ColemanLiauIndex': self.rd.ColemanLiauIndex(),
            'LIX': self.rd.LIX(),
            'RIX': self.rd.RIX(),
        }

    def stats(self):
        """Aggregate parent stats with link/media counts, weekday flags,
        LDA topics and readability scores into one dict."""
        attributes = [
            'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
            'num_keywords'
        ]
        meta_dict = super().stats()
        meta_dict.update({func: getattr(self, func)() for func in attributes})
        meta_dict.update(self.daystuff())
        meta_dict.update(self.lda())
        meta_dict.update(self.readability())
        return meta_dict