Example #1
    def __init__(
        self,
        url,
        consent_elem: selenium.webdriver.remote.webelement.WebElement,
    ):
        self.elem = consent_elem  # Root WebElement of the consent dialog.
        self.html = self.elem.get_attribute("outerHTML")
        self.text = self.elem.text
        self.scrn = None
        # Check whether the dialog offers deny-all / accept-all buttons.
        self.hasDenyAll = self._find_deny_all()
        self.hasAcceptAll = self._find_appr_all()
        # Score the readability of the dialog text by ARI and Flesch.
        r = Readability(self.text)
        self.readabilityARI = r.ari().__dict__
        self.readabilityFLESH = r.flesch().__dict__
        # Count the checkboxes, and how many are pre-checked.
        total_checkboxes = self.elem.find_elements_by_css_selector(
            "input[type='checkbox']")
        checked_checkboxes = self.elem.find_elements_by_css_selector(
            "input[type='checkbox']:checked")
        self.totalCheckboxes = len(total_checkboxes)
        self.checkedCheckboxes = len(checked_checkboxes)
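Note: unlike the later examples, this snippet appears to use the py-readability-metrics package, whose Readability object exposes lowercase ari() and flesch() methods returning result objects. A minimal sketch of what readabilityARI and readabilityFLESH end up holding, assuming that package (the sample text is arbitrary):

from readability import Readability  # py-readability-metrics

# The package requires at least 100 words of input.
sample = " ".join(["The quick brown fox jumps over the lazy dog."] * 15)
r = Readability(sample)
print(r.ari().__dict__)     # e.g. {'score': ..., 'grade_levels': [...], 'ages': [...]}
print(r.flesch().__dict__)  # e.g. {'score': ..., 'ease': ..., 'grade_levels': [...]}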
Example #2
def get_scores(text):
    analysis = Readability(text)
    results = {}
    results['ari'] = analysis.ARI()
    results['fkgl'] = analysis.FleschKincaidGradeLevel()
    results['cli'] = analysis.ColemanLiauIndex()
    results['fre'] = analysis.FleschReadingEase()
    results['gfi'] = analysis.GunningFogIndex()
    results['lix'] = analysis.LIX()
    results['rix'] = analysis.RIX()
    results['smog'] = analysis.SMOGIndex()
    return results
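A usage sketch for get_scores, assuming the mmautner/readability module is importable (the sample text is arbitrary):

from readability import Readability

sample = ("Readability formulas estimate how hard a passage is to read. "
          "Most combine sentence length with word or syllable counts.")
for name, value in sorted(get_scores(sample).items()):
    print(name, value)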
Example #3
    def __init__(self, url, raw=None):
        if not raw:
            raw = requests.get(url).content
        soup = BeautifulSoup(raw, 'lxml')
        self.url = url
        self.metadata = self.getMeta(soup)
        if self.metadata['content']:
            content = self.metadata['content'].find("section")
            if not content:
                content = self.metadata['content']
            content = " ".join(list(content.stripped_strings))
        else:
            raise Exception(
                "No content found for", url,
                "\nPlease add custom constraints [if any] in attributes_list.py"
            )
        self.rd = Readability(content)
        super().__init__(self.metadata['title'], content)
Example #4
from collections import defaultdict
import json


def extract_user_reading_levels(
        input_file_name=DEFAULT_RAW_REVIEWS_FILE_NAME,
        output_file_name=DEFAULT_READING_LEVELS_FILE_NAME,
        reviews_to_analyze_per_user=float('inf')):
    """
    Given a Yelp dataset reviews file, builds a file:
        user_1_ID user_1_reading_level
            .
            .
            .
        user_N_ID user_N_reading_level

    WARNING: This function is computationally expensive. The amount of
    computation can be limited by setting reviews_to_analyze_per_user, the
    maximum number of reviews to analyze per user. On a 2011 MacBook Air,
    1000 reviews take 2-3 seconds to analyze.
    """
    # Maps each user ID -> [running sum of review reading levels, running number of reviews]
    total_reading_level_and_review_count_for_user = defaultdict(lambda: [0, 0])

    # Compute the above mapping
    with open(raw_data_absolute_path(input_file_name)) as reviews_file:

        for review_JSON in reviews_file:
            review = json.loads(review_JSON)

            # Skip reviews from users already analyzed up to the desired maximum
            if total_reading_level_and_review_count_for_user[
                    review['user_id']][1] >= reviews_to_analyze_per_user:
                continue

            # TODO: Try other reading level metrics
            try:
                total_reading_level_and_review_count_for_user[
                    review['user_id']][0] += Readability(
                        review['text']).SMOGIndex()
                total_reading_level_and_review_count_for_user[
                    review['user_id']][1] += 1
            except UnicodeEncodeError:
                pass

    # Compute each user's average reading level
    # Note: minimum SMOG index is 3.0, but users without reviews are assigned 0
    average_reading_level_for_user = {
        user_ID: safe_divide(total_reading_level, review_count)
        for user_ID, [total_reading_level, review_count] in
        total_reading_level_and_review_count_for_user.items()
    }

    write_single_user_attribute(average_reading_level_for_user,
                                output_file_name)
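safe_divide is not shown in this example; given the note above that users without reviews are assigned 0, it presumably guards the zero-count case. A hypothetical sketch:

def safe_divide(numerator, denominator):
    # Users with no analyzed reviews get 0 instead of a ZeroDivisionError.
    return numerator / denominator if denominator else 0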
Example #5
    def setup(self):
        self.words = []
        self.nouns = {}
        self.verbs = {}
        self.similarity = -1

        # Empty word-list buckets, filled in later during analysis.
        self.active_words = []
        self.passive_words = []

        self.direct_words = []
        self.indirect_words = []

        self.positive_words = []
        self.negative_words = []

        self.line_break = False

        # Readability scores for the full text.
        rd = Readability(self.text)
        self.FleschReadingEase = rd.FleschReadingEase()
        self.FleschKincaidGradeLevel = rd.FleschKincaidGradeLevel()
        self.GunningFogIndex = rd.GunningFogIndex()
        self.SMOGIndex = rd.SMOGIndex()
Example #6
def getReadabilities(string):
    read = Readability(string)
    return read.FleschReadingEase(), read.FleschKincaidGradeLevel()
Example #7
def assess_readability(text):
    '''Assess the readability of text with the Flesch-Kincaid Grade Level test,
    as implemented in Python here: https://github.com/mmautner/readability'''

    # Assess grade level
    return Readability(text).FleschKincaidGradeLevel()
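A quick check of the wrapper above, again assuming the mmautner/readability package; Flesch-Kincaid maps text to an approximate US school grade:

from readability import Readability

print(assess_readability("The cat sat on the mat. The dog ran to the park."))
# Short, simple sentences come out at a low grade level.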
Example #8
# Imports this class needs; ArticleText, getLDA and the *_L selector lists
# are assumed to come from the surrounding project (attributes_list.py).
import requests
import dateparser
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from readability import Readability


class Article(ArticleText):
    def __init__(self, url, raw=None):
        if not raw:
            raw = requests.get(url).content
        soup = BeautifulSoup(raw, 'lxml')
        self.url = url
        self.metadata = self.getMeta(soup)
        if self.metadata['content']:
            content = self.metadata['content'].find("section")
            if not content:
                content = self.metadata['content']
            content = " ".join(list(content.stripped_strings))
        else:
            raise Exception(
                "No content found for", url,
                "\nPlease add custom constraints [if any] in attributes_list.py"
            )
        self.rd = Readability(content)
        super().__init__(self.metadata['title'], content)

    def iterTillHit(self, soup, arglist, target=None):
        # Try each selector in turn and return the first hit: the tag itself
        # when no target attribute is requested, otherwise its text (falling
        # back to the target attribute when the text is empty).
        for arg in arglist:
            cont = soup.find(*arg)
            if cont:
                if not target:
                    return cont
                elif cont.text:
                    return cont.text
                else:
                    return cont[target]
        return None

    def getMeta(self, soup):
        # Title, Keywords, Description, Author, Published
        attr_d = {}
        attr_d['title'] = self.iterTillHit(soup, TITLE_L, 'content')
        attr_d['keyword'] = self.iterTillHit(soup, KEYWORD_L, 'content')
        attr_d['desc'] = self.iterTillHit(soup, DESC_L, 'content')
        attr_d['author'] = self.iterTillHit(soup, AUTHOR_L, 'content')
        attr_d['published'] = self.iterTillHit(soup, PUBLISHED_L, 'content')
        attr_d['content'] = self.iterTillHit(soup, CONTENT_L)

        return attr_d

    def num_hrefs(self):
        return len(self.metadata['content'].findAll("a", href=True))

    def num_self_hrefs(self):
        site = urlparse(self.url)[1]
        return sum([
            1 for href in self.metadata['content'].findAll("a", href=True)
            if site in href['href']
        ])

    def num_imgs(self):
        return len(self.metadata['content'].findAll("img"))

    def num_videos(self):
        return len(self.metadata['content'].findAll("iframe"))

    def num_keywords(self):
        return len(self.metadata['keyword'].split(
            ",")) if self.metadata['keyword'] else 0

    def daystuff(self):
        weekday_dict = [["weekday_is_monday", 0], ["weekday_is_tuesday", 0],
                        ["weekday_is_wednesday", 0],
                        ["weekday_is_thursday", 0], ["weekday_is_friday", 0],
                        ["weekday_is_saturday", 0], ["weekday_is_sunday", 0],
                        ["is_weekend", 0]]

        try:
            weekday = dateparser.parse(self.metadata['published']).weekday()
            weekday_dict[weekday][1] = 1
            weekday_dict[-1][1] = 1 if weekday > 4 else 0
        except (TypeError, AttributeError):
            # Missing or unparseable publication date: leave all flags at 0.
            pass
        return dict(weekday_dict)

    def lda(self):
        lda_dict = getLDA(self.metadata['title'])[0]
        lda_dict = {"LDA_%.2d" % index: val for index, val in lda_dict}
        return lda_dict

    def readability(self):
        readability_dict = {
            'ARI': self.rd.ARI(),
            'FleschReadingEase': self.rd.FleschReadingEase(),
            'FleschKincaidGradeLevel': self.rd.FleschKincaidGradeLevel(),
            'GunningFogIndex': self.rd.GunningFogIndex(),
            'SMOGIndex': self.rd.SMOGIndex(),
            'ColemanLiauIndex': self.rd.ColemanLiauIndex(),
            'LIX': self.rd.LIX(),
            'RIX': self.rd.RIX()
        }
        return readability_dict

    def stats(self):
        attributes = [
            'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
            'num_keywords'
        ]
        meta_dict = super().stats()
        meta_dict.update({func: getattr(self, func)() for func in attributes})
        meta_dict.update(self.daystuff())
        meta_dict.update(self.lda())
        meta_dict.update(self.readability())
        return meta_dict
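A usage sketch for the Article class; ArticleText, getLDA, and the selector lists (TITLE_L, KEYWORD_L, ...) are assumed importable from the surrounding project, and the URL is a placeholder:

article = Article("https://example.com/some-article")  # hypothetical URL
features = article.stats()  # structural, weekday, LDA and readability features
print(features['num_hrefs'], features['FleschKincaidGradeLevel'])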