示例#1
0
def get_scores(text):
    analysis = Readability(text)
    results = {}
    results['ari'] = analysis.ARI()
    results['fkgl'] = analysis.FleschKincaidGradeLevel()
    results['cli'] = analysis.ColemanLiauIndex()
    results['fre'] = analysis.FleschReadingEase()
    results['gfi'] = analysis.GunningFogIndex()
    results['lix'] = analysis.LIX()
    results['rix'] = analysis.RIX()
    results['smog'] = analysis.SMOGIndex()
    return results
示例#2
0
class Article(ArticleText):
    def __init__(self, url, raw=None):
        if not raw:
            raw = requests.get(url).content
        soup = BeautifulSoup(raw, 'lxml')
        self.url = url
        self.metadata = self.getMeta(soup)
        if self.metadata['content']:
            content = self.metadata['content'].find("section")
            if not content:
                content = self.metadata['content']
            content = " ".join(list(content.stripped_strings))
        else:
            raise Exception(
                "No content found for", url,
                "\nPlease add custom constraints [if any] in attributes_list.py"
            )
        self.rd = Readability(content)
        super().__init__(self.metadata['title'], content)

    def iterTillHit(self, soup, arglist, target=None):
        for arg in arglist:
            cont = soup.find(*arg)
            if cont:
                if not target:
                    return cont
                elif cont.text:
                    return cont.text
                else:
                    return cont[target]
        else:
            return None

    def getMeta(self, soup):
        # Title, Keywords, Description, Author, Published
        attr_d = {}
        attr_d['title'] = self.iterTillHit(soup, TITLE_L, 'content')
        attr_d['keyword'] = self.iterTillHit(soup, KEYWORD_L, 'content')
        attr_d['desc'] = self.iterTillHit(soup, DESC_L, 'content')
        attr_d['author'] = self.iterTillHit(soup, AUTHOR_L, 'content')
        attr_d['published'] = self.iterTillHit(soup, PUBLISHED_L, 'content')
        attr_d['content'] = self.iterTillHit(soup, CONTENT_L)

        return attr_d

    def num_hrefs(self):
        return len(self.metadata['content'].findAll("a", href=True))

    def num_self_hrefs(self):
        site = urlparse(self.url)[1]
        return sum([
            1 for href in self.metadata['content'].findAll("a", href=True)
            if site in href['href']
        ])

    def num_imgs(self):
        return len(self.metadata['content'].findAll("img"))

    def num_videos(self):
        return len(self.metadata['content'].findAll("iframe"))

    def num_keywords(self):
        return len(self.metadata['keyword'].split(
            ",")) if self.metadata['keyword'] else 0

    def daystuff(self):
        weekday_dict = [["weekday_is_monday", 0], ["weekday_is_tuesday", 0],
                        ["weekday_is_wednesday", 0],
                        ["weekday_is_thursday", 0], ["weekday_is_friday", 0],
                        ["weekday_is_saturday", 0], ["weekday_is_sunday", 0],
                        ["is_weekend", 0]]

        try:
            weekday = dateparser.parse(self.metadata['published']).weekday()
            weekday_dict[weekday][1] = 1
            weekday_dict[-1][1] = 1 if weekday > 4 else 0
        except TypeError:
            pass
        finally:
            return dict(weekday_dict)

    def lda(self):
        lda_dict = getLDA(self.metadata['title'])[0]
        lda_dict = {"LDA_%.2d" % index: val for index, val in lda_dict}
        return lda_dict

    def readability(self):
        readability_dict = {
            'ARI': self.rd.ARI(),
            'FleschReadingEase': self.rd.FleschReadingEase(),
            'FleschKincaidGradeLevel': self.rd.FleschKincaidGradeLevel(),
            'GunningFogIndex': self.rd.GunningFogIndex(),
            'SMOGIndex': self.rd.SMOGIndex(),
            'ColemanLiauIndex': self.rd.ColemanLiauIndex(),
            'LIX': self.rd.LIX(),
            'RIX': self.rd.RIX()
        }
        return readability_dict

    def stats(self):
        attributes = [
            'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
            'num_keywords'
        ]
        meta_dict = super().stats()
        meta_dict.update({func: getattr(self, func)() for func in attributes})
        meta_dict.update(self.daystuff())
        meta_dict.update(self.lda())
        meta_dict.update(self.readability())
        return meta_dict