Example #1
 def test_custom_wpm(self):
     text = 'some test content ' * 100
     result = readtime.of_text(text)
     self.assertEquals(result.wpm, DEFAULT_WPM)
     self.assertEquals(result.seconds, 68)
     self.assertEquals(result.text, u('2 min'))
     wpm = 50
     result = readtime.of_text(text, wpm=wpm)
     self.assertEquals(result.wpm, wpm)
     self.assertEquals(result.seconds, 360)
     self.assertEquals(type(result.seconds), int)
     self.assertEquals(result.text, u('6 min'))
     self.assertEquals(u(result), u('6 min read'))
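A minimal sketch of the surface these tests exercise (attribute and method names are taken directly from the examples in this listing):

import readtime

result = readtime.of_text('some test content ' * 100, wpm=50)
result.seconds   # integer estimate in seconds
result.minutes   # estimate rounded up to whole minutes
result.text      # human-readable duration, e.g. '6 min'
str(result)      # formatted string, e.g. '6 min read'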
Example #2
 def test_plain_text(self):
     inp = open('tests/samples/plain_text.txt').read()
     result = readtime.of_text(inp)
     self.assertEquals(result.seconds, 154)
     self.assertEquals(type(result.seconds), int)
     self.assertEquals(result.text, u('3 min'))
     self.assertEquals(u(result), u('3 min read'))
Example #3
def get_article_readtime(page, settings):

    article = get_article(page, settings)
    if article.text:
        result = readtime.of_text(article.text)
        return str(result.text), {'read_time': str(result.text)}

    return 'No article found', {}
Example #4
    def test_transitions(self):
        word = 'word '
        for x in range(10):

            # test the maximum num words for x read time
            text = word * 265 * x
            result = readtime.of_text(text)
            self.assertEquals(result.seconds, x * 60 if x > 0 else 1)
            self.assertEquals(result.text,
                              u('{0} min'.format(x if x > 0 else 1)))
            self.assertEquals(u(result),
                              u('{0} min read'.format(x if x > 0 else 1)))

            # test the maximum + 1 num words, and make sure read time is x + 1
            text += 'word'
            result = readtime.of_text(text)
            self.assertEquals(result.seconds, x * 60 + 1)
            self.assertEquals(result.text, u('{0} min'.format(x + 1)))
            self.assertEquals(u(result), u('{0} min read'.format(x + 1)))
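These boundaries follow from the library's default speed of 265 words per minute, which the test encodes directly: 265 * x words read in exactly x minutes, and a single extra word pushes the estimate to x * 60 + 1 seconds, which rounds up to x + 1 minutes.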
Example #5
def virtual_read_wait(text: str) -> float:
    """Simulate reading the given text by sleeping for its estimated read time.

    :param text: text to read
    :return: estimated time (in seconds) to read the provided text
    """
    time_to_read = readtime.of_text(text).seconds
    wait(time_to_read * 1.3)

    return time_to_read
Example #6
    def save(self, *args, **kwargs):
        self.slug = slugify(self.title, allow_unicode=True)
        self.read_time = readtime.of_text(self.content)
        super().save(*args, **kwargs)

        img = Image.open(self.thumbnail.path)
        if img.width > 640 or img.height > 640:
            output_size = (640, 640)
            img.thumbnail(output_size, Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
            img.save(self.thumbnail.path, optimize=True)
Example #7
 def save(self, *args, **kwargs):
     self.readtime = str(readtime.of_text(self.title + self.content))
     if self.slug is None:
         mslug = slugify(self.title)
         exists = Post.objects.filter(slug=mslug).exists()
         count = 1
         while exists:
             count += 1
             mslug = slugify(self.title) + "-" + str(count)
             exists = Post.objects.filter(slug=mslug).exists()
         self.slug = mslug
     super().save(*args, **kwargs)
Example #8
    def test_can_add(self):
        inp = open('tests/samples/plain_text.txt').read()
        result1 = readtime.of_text(inp)
        self.assertEquals(result1.seconds, 154)

        inp = open('tests/samples/markdown.md').read()
        result2 = readtime.of_markdown(inp)
        self.assertEquals(result2.seconds, 236)

        result = (result1 + result2)
        self.assertEquals(result.seconds, 154 + 236)
        self.assertEquals(type(result.seconds), int)
        self.assertEquals(result.text, u('7 min'))
        self.assertEquals(u(result), u('7 min read'))
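Note that Result objects support addition: the combined total is 154 + 236 = 390 seconds, i.e. 6.5 minutes, which rounds up to the '7 min' asserted above.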
Example #9
    async def get_reading_time(self, url):
        try:
            html = requests.get(url).content
            paragraphs = justext.justext(html, justext.get_stoplist("English"))
            full_text = "\n\n".join(
                [p.text for p in paragraphs if not p.is_boilerplate]
            )
            result = readtime.of_text(full_text)

            if result.seconds <= RESPONSE_LIMIT_SECS:
                log.info("Article reading time under limit: {} secs, url=`{}`".format(
                    result.seconds,
                    url
                ))
                return 0, None

            return result.seconds, result.text
        except Exception:
            # the original bare `except: pass` made the coroutine return None
            # implicitly; log and keep the (seconds, text) return contract instead
            log.exception("Failed to compute reading time for url=`%s`", url)
            return 0, None
Example #10
    def get_reading_time(self):
        result = readtime.of_text(str(self.body))
        reading_time = result.minutes
        unit = " minutes"

        return str(reading_time) + unit
Example #11
 def get_readtime(self):
     result = readtime.of_text(self.description)
     return result.text
Example #12
    def get(self, request, interest_name):
        if interest_name == "yourfeed":
            user = User.objects.get(username='******')
            user_profile = profile.objects.get(user=user)
            data = []
            for i in user_profile.interest.all():
                qs = Blog.objects.filter(interests=i)
                print(i, qs)
                for j in qs:
                    date = str(j.post_date).split()
                    date_str = date[0]
                    date_val = date_str.split('-')
                    x = datetime.datetime(int(date_val[0]), int(date_val[1]),
                                          int(date_val[2]))
                    result = readtime.of_text(j.content)
                    minutes = result.minutes
                    val = x.strftime('%Y %b %d')
                    link = str(j.cover_photo)
                    new_one = False
                    p = link.find('/fit/t')
                    if p != -1:
                        new_one = True
                        f = link.split('/')
                        for index, kl in enumerate(f):
                            if kl == 't':
                                f[index + 1] = 1110
                                f[index + 2] = 732
                        cov = f[0]

                        for po in f[1:]:
                            cov = cov + '/' + str(po)
                    link = str(j.cover_photo)
                    p = link.find('/freeze/focal')
                    if p != -1:
                        new_one = True
                        f = link.split('/')
                        for index, kl in enumerate(f):
                            if kl == 'focal':
                                f[index + 1] = 1110
                                f[index + 2] = 732
                        cov = f[0]
                        for po in f[1:]:
                            cov = cov + '/' + str(po)
                    link = str(j.cover_photo)
                    p = link.find('/freeze/max')
                    if p != -1:
                        new_one = True
                        f = link.split('/')
                        for index, kl in enumerate(f):
                            if kl == 'max':
                                f[index + 1] = 1050
                        cov = f[0]
                        for po in f[1:]:
                            cov = cov + '/' + str(po)
                    if not new_one:
                        cov = j.cover_photo
                    # print(cov,end = ' ')
                    # print(i[5])
                    var = {
                        'id': j.id,
                        'author': j.author,
                        'heading': j.heading,
                        'content': j.content,
                        'post_date': val,
                        'interests': j.interests,
                        'cover_photo': cov,
                        'readtime': minutes
                    }
                    data.append(var)
            shuffle(data)
            return Response(data)

        cur = conn.cursor()
        # parameterized queries (the original concatenated interest_name into
        # the SQL string, an injection risk); %s placeholders assume a
        # psycopg2-style driver
        cur.execute(
            "SELECT * FROM blog_interest WHERE interest_name = %s;",
            (str(interest_name),))
        p = cur.fetchone()
        print(p)
        cur.execute(
            "SELECT author_id, heading, content, post_date, interests_id, cover_photo, id "
            "FROM blog_blog WHERE interests_id = %s ORDER BY RANDOM();",
            (str(p[0]),))
        rows = cur.fetchall()
        data = []
        for i in rows:
            cur.execute("SELECT username FROM auth_user where id = " +
                        str(i[0]))
            name = cur.fetchone()
            date = str(i[3]).split()
            date_str = date[0]
            date_val = date_str.split('-')
            x = datetime.datetime(int(date_val[0]), int(date_val[1]),
                                  int(date_val[2]))
            result = readtime.of_text(i[2])
            minutes = result.minutes
            val = x.strftime('%Y %b %d')
            link = str(i[5])
            new_one = False
            p = link.find('/fit/t')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 't':
                        f[index + 1] = 1110
                        f[index + 2] = 732
                cov = f[0]

                for po in f[1:]:
                    cov = cov + '/' + str(po)
            link = str(i[5])
            p = link.find('/freeze/focal')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 'focal':
                        f[index + 1] = 1110
                        f[index + 2] = 732
                cov = f[0]
                for po in f[1:]:
                    cov = cov + '/' + str(po)
            link = str(i[5])
            p = link.find('/freeze/max')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 'max':
                        f[index + 1] = 1050
                cov = f[0]
                for po in f[1:]:
                    cov = cov + '/' + str(po)
            if not new_one:
                cov = i[5]
            # print(cov,end = ' ')
            # print(i[5])
            var = {
                'id': i[6],
                'author': name[0],
                'heading': i[1],
                'content': i[2],
                'post_date': val,
                'interests': i[4],
                'cover_photo': cov,
                'readtime': minutes
            }
            data.append(var)
        return Response(data)
Example #13
    def get(self, request, blog_id):
        blog = Blog.objects.get(id=blog_id)
        cur = conn.cursor()
        # parameterized queries (the original concatenated blog_id and row[0]
        # into the SQL strings); %s placeholders assume a psycopg2-style driver
        cur.execute(
            "SELECT author_id, heading, content, post_date, interests_id, cover_photo "
            "FROM blog_blog WHERE id = %s",
            (str(blog_id),))
        row = cur.fetchone()
        print(row)
        cur.execute("SELECT username FROM auth_user WHERE id = %s", (row[0],))
        name = cur.fetchone()
        date = str(row[3]).split()
        date_str = date[0]
        date_val = date_str.split('-')
        x = datetime.datetime(int(date_val[0]), int(date_val[1]),
                              int(date_val[2]))
        result = readtime.of_text(row[2])
        minutes = result.minutes
        val = x.strftime('%Y %b %d')
        author_obj = User.objects.get(username=name[0])

        if author_obj in blog.upvotes.all():
            upvote = True
        else:
            upvote = False
        link = str(row[5])
        new_one = False
        p = link.find('/fit/t')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 't':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]

            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(row[5])
        p = link.find('/freeze/focal')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'focal':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(row[5])
        p = link.find('/freeze/max')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'max':
                    f[index + 1] = 1050
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        if not new_one:
            cov = row[5]

        total_upvotes = len(blog.upvotes.all())
        booah = User.objects.get(username=name[0])
        follow = Follower.objects.get(follower=booah)
        user = User.objects.get(username='******')
        is_follow = False
        if user in follow.following.all():
            is_follow = True
        var = {
            'id': blog_id,
            'author': name[0],
            'heading': row[1],
            'content': row[2],
            'post_date': val,
            'interests': row[4],
            'cover_photo': cov,
            'readtime': minutes,
            'upvote': upvote,
            'total_upvote': total_upvotes,
            'is_follow': is_follow
        }

        return Response(var)
Example #14
File: main.py Project: Lulzx/sumitup
def process(update, context):
    if update.message:
        text = update.message.text
    else:
        return
    links = find(text)
    # handling for groups, when message has no links
    if not links:  # and update.message.chat.type == "super_group":
        return
    link = links[0]
    # try:
    #     link = links[0]
    # except:
    #     update.message.reply_text("Oh! Send a valid link.")
    article = Article(link)
    article.download()
    article.parse()
    try:
        # article.authors is a list of names
        author = "✍ *Author:* " + ", ".join(article.authors) + "\n"
    except Exception:
        author = ""
    date = "📅 *Publication Date:* "
    try:
        date += article.publish_date.strftime('%Y-%m-%d')
    except AttributeError:
        if article.publish_date is None:
            date = ""
        else:
            date += str(article.publish_date)
    value = article.html
    tree = fromstring(value)
    title = str(tree.findtext('.//title'))
    lang = translator.detect(title).lang
    if lang != 'en':
        text = translate(link)
        if text == 'null':
            return
        update.message.reply_text(text)
        link = find(text)[0]
        article = Article(link)
        article.download()
        article.parse()
    text = article.text
    soup = bs(value, 'lxml')
    outline = ""
    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        heading_text = heading.text.strip()
        if heading.name in ["h1", "h2"]:
            heading_text = f"*{heading_text}*"
        outline += int(heading.name[1:]) * ' ' + '- ' + heading_text + '\n'
    article.nlp()
    keywords = article.keywords
    tags = ""
    for keyword in keywords:
        tags += " #" + keyword
    summary = article.summary
    summary_points = ""
    for x in summary.splitlines():
        summary_points += "↦️ " + x + "\n"
    summary = summary_points
    read = readtime.of_text(text)
    msg = f"""🔗 *Link:* {link}\n{author}{date}\n🚩 *Title: {title}*\n\n🗨 *Summary:*\n _{summary}_\n"""
    msg += f"""🎋 *Outline: * \n{outline}\n"""
    msg += f"""🤔 *Reading Time:* {read}\n""".replace("min", "mins")
    msg += f"""📑 *Tags:* {tags}\n """
    query = urllib.parse.quote(msg.replace('*', '**').replace('_', '__'))
    share_url = 'tg://msg_url?url=' + query
    button_list = [
        InlineKeyboardButton('Add to reading list', callback_data=1),
        InlineKeyboardButton("📬 Share", url=share_url)
    ]
    reply_markup = InlineKeyboardMarkup(build_menu(button_list, n_cols=2))
    update.message.reply_text(
        msg, parse_mode=telegram.ParseMode.MARKDOWN, reply_markup=reply_markup)

    if update.message.chat_id != ADMIN:
        context.bot.send_message(chat_id="{}".format(ADMIN),
                                 text='{}'.format(
                                     update.message.from_user.first_name + " *sent:*\n" + msg),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
Example #15
 def test_plain_text_unicode(self):
     result = readtime.of_text('Some simple text')
     self.assertEquals(unicode(result), u('1 min read'))
Example #16
def time_to_read(text):
    result = readtime.of_text(text)

    return f"{result.text} read"
Example #17
    def parse(self, response):
        body = json.loads(response.body)
        googleClient = language.LanguageServiceClient(credentials=credentials)


        for value in body['articles']:
            
            if value['content'] is None: 
                content = ''
            else: 
                content = value['content']
                
            description = value['description']
            
            document = types.Document(
                content=content,
                type=enums.Document.Type.PLAIN_TEXT)
            sentiment = googleClient.analyze_sentiment(document=document).document_sentiment
            response = paralleldots.keywords(description)
            
            tags_dict = [{
                "keyword": "elon musk",
                "tag": "elon"
            },
            {
                "keyword": "model 3 model Y model X model S car electric vehicles ev's evs vehicle auto industry",
                "tag": "auto"
            },
            {
                "keyword": "home battery battery batteries solar panel solar panels home energy",
                "tag": "solar"
            }]

            tags = []
    
            try:
                for keyword in response['keywords']:  
                    for tag in tags_dict:
                        if (keyword['keyword'].lower() in tag['keyword'].lower()):
                            tags.append(tag['tag'])
            except KeyError:
                print ("No key found")


            print(response)

            # pdb.set_trace()

            readTime = readtime.of_text(content)

            # leftover scratch code in the original source; unused by the spider
            class Person:
                "This is a person class"
                age = 10

                def greet(self):
                    print('Hello')

            # utc 
            # local_time = pytz.timezone("America/New_York")
            # naive_datetime = datetime.strptime (value['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
            # local_datetime = local_time.localize(naive_datetime, is_dst=None)
            # utc_datetime = local_datetime.astimezone(pytz.utc)
            # utc_timestamp = datetime.replace(tzinfo=timezone.utc).timestamp()
                        
            # Getting the current date  
            # and time 
            dt = datetime.strptime(value['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
  
            
            # pdb.set_trace()

            # utc_time = dt.replace(tzinfo = timezone.utc) 
            # utc_timestamp = utc_time.timestamp() 
            
            # print(utc_timestamp)
            

            newsItem = NewsApiItem()
            
            year = int(value['publishedAt'].split('-')[0])
            month = int(value['publishedAt'].split('-')[1])
            day = int(value['publishedAt'].split('-')[2].split('T')[0])
            hours = int(value['publishedAt'].split('-')[2].split('T')[1].split(':')[0])
            # these two fields are the minutes and seconds of the timestamp,
            # not seconds and milliseconds as the original variable names claimed
            minutes = int(value['publishedAt'].split('-')[2].split(':')[1])
            seconds = int(value['publishedAt'].split('-')[2].split(':')[2].split('Z')[0])

            dt = datetime(year, month, day, hours, minutes, seconds).timestamp()
            # pdb.set_trace()

            newsItem['publishDate'] = value['publishedAt']
            newsItem['publisher'] = value['source']['name']
            newsItem['author'] = value['author']
            newsItem['description'] = value['description']
            newsItem['articleLink'] = value['url']
            newsItem['sentiment'] = sentiment.score
            # newsItem['magnitude'] = sentiment.magnitude
            newsItem['title'] = value['title']
            newsItem['tags'] = tags
            newsItem['topic'] = 'tesla'
            newsItem['readTime'] = readTime.seconds
            newsItem['utc'] = dt
            print( dt )
        
            # newsItem['author_sentiment'] = updateAuthorSentiment
            # newsItem['publisher_sentiment'] = updatePublisherSentiment

            # get the news story
            # run the sentiment analysis on that story 
            # attribute sentiment to the author and store that data independently 
            # attribute sentiment to the publisher and store that data independently 
            # attribute sentiment to the news story as well and finish the news agg process and store data

            # print('news item', newsItem)
            yield newsItem
Example #18
 def save(self, *args, **kwargs):
     if not self.is_category:
         self.read_time = readtime.of_text(self.text).minutes
     return super().save(*args, **kwargs)
Example #19
File: models.py Project: kayesokua/kepo
 def get_read_time(self):
     result = readtime.of_text(self.content)
     return result.text
Example #20
    def extract_article_contents(self, article_number_code):
        # call the article doc
        file = docx.Document('D:/CS/web/Verily/Verily/articles/article' +
                             str(article_number_code) + '.docx')

        # extract the article into paragraphs and substitute special characters
        for para in file.paragraphs:
            parag = re.sub(r'“', '"', para.text)
            parag = re.sub(r'”', '"', parag)
            parag = re.sub(r"’", "'", parag)
            parag = re.sub(r"‘", "'", parag)
            parag = re.sub(u"\u00a0", " ", parag)  # non-breaking space (garbled to a plain space in the original)
            parag = re.sub(r"–", "-", parag)
            parag = re.sub(r"—", "-", parag)
            parag = re.sub(r"ü", "u", parag)
            parag = re.sub(r"é", "&#233;", parag)  # the original was missing the entity's trailing semicolon
            self.fulltext.append(parag)
            self.text_string = self.text_string + " " + parag

        # extract the title and the articlepage.html name from the article
        self.article_title_insert = self.fulltext[0].strip()
        self.article_title_insert = re.sub('"', "'", self.article_title_insert)
        self.title_word_count = self.article_title_insert.lower().split(' ')
        self.article_title_code = "article"
        if len(self.title_word_count) >= 3:
            self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
                                      self.title_word_count[0] + '-' + self.title_word_count[1] + '-' + \
                                      self.title_word_count[2]
        elif len(self.title_word_count) == 2:
            self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
                                      self.title_word_count[0] + '-' + self.title_word_count[1]
        elif len(self.title_word_count) == 1:
            self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
                                      self.title_word_count[0]
        self.article_title_code = self.article_title_code.strip()

        # extract the author name(s) from the articles
        self.author_names_insert = self.fulltext[2].strip()
        self.author_names = self.author_names_insert.split(',')
        self.writerpage_codes = self.author_names_insert.lower().split(',')
        self.writerpage_codes = [
            re.sub(r' ', '', author) for author in self.writerpage_codes
        ]
        self.writerpage_codes = [
            'w-' + author for author in self.writerpage_codes
        ]

        # initialize default article information
        self.department = "Verity Today"
        self.article_date = "2020"

        # manually infer department from article number
        self.department = 'Tech' if article_number_code in [
            1, 2, 3, 42, 43
        ] else self.department
        self.department = 'Sports' if article_number_code in [
            4, 5, 6, 7, 45, 46
        ] else self.department
        self.department = 'Social Change' if article_number_code in [
            8, 9, 10, 11, 12
        ] else self.department
        self.department = 'Mental Health' if article_number_code in [
            13, 14, 15, 16
        ] else self.department
        self.department = 'Global' if article_number_code in [
            17, 18, 19, 20, 44
        ] else self.department
        self.department = 'Entertainment' if article_number_code in [
            21, 22, 23, 24, 25, 26, 27
        ] else self.department
        self.department = 'Education' if article_number_code in [
            28, 29, 30, 31, 32
        ] else self.department
        self.department = 'Creativity' if article_number_code in [
            33, 34, 35, 36, 37, 38
        ] else self.department
        self.department = 'Biz & Eco' if article_number_code in [
            39, 40, 41
        ] else self.department

        with open('article_info.csv', mode='r+', newline="\n") as article_info:
            reader = csv.reader(article_info, delimiter=',')
            for row in reader:
                if str(row[0]) == str(article_number_code):
                    self.department = 'Tech' if str(
                        row[3][0]) == 'T' else self.department
                    self.department = 'Sports' if str(
                        row[3][0]) == 'S' else self.department
                    self.department = 'Social Change' if str(
                        row[3][1]) == 'o' else self.department
                    self.department = 'Mental Health' if str(
                        row[3][0]) == 'M' else self.department
                    self.department = 'Global' if str(
                        row[3][0]) == 'G' else self.department
                    self.department = 'Entertainment' if str(
                        row[3][0]) == 'E' else self.department
                    self.department = 'Education' if str(
                        row[3][1]) == 'd' else self.department
                    self.department = 'Creativity' if str(
                        row[3][0]) == 'C' else self.department
                    self.department = 'Biz & Eco' if str(
                        row[3][0]) == 'B' else self.department
                    self.article_date = str(row[4])
                    break

        self.department_code = (self.department.split(' '))[0].lower()

        self.articlepage_code = 'a-' + self.department_code + '-' + self.article_title_code

        # assign the date of the article
        if 1 <= article_number_code <= 46:
            self.article_date = "July 2020"
        elif 47 <= article_number_code <= 47:
            self.article_date = "August 2020"

        # calculate the time to read the article
        self.readtime = str(readtime.of_text(self.text_string))

        print("Article details and content extracted, 0")
        return "Article details and content extracted, 0"
Example #21
 def test_plain_text_empty(self):
     result = readtime.of_text('')
     self.assertEquals(result.seconds, 1)
     self.assertEquals(result.text, u('1 min'))
     self.assertEquals(u(result), u('1 min read'))
Example #22
 def save(self, *args, **kwargs):
     self.readtime = readtime.of_text(self.body).minutes
     super(Post, self).save(*args, **kwargs)  # Call the real save() method
Example #23
    def get(self, request):
        data = []
        user = User.objects.get(username="******")
        blogs = Blog.objects.filter(author=user)
        for j in blogs:
            date = str(j.post_date).split()
            date_str = date[0]
            date_val = date_str.split('-')
            x = datetime.datetime(int(date_val[0]), int(date_val[1]), int(date_val[2]))
            result = readtime.of_text(j.content)
            minutes = result.minutes
            val = x.strftime('%Y %b %d')
            link = str(j.cover_photo)
            new_one = False
            p = link.find('/fit/t')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 't':
                        f[index + 1] = 1110
                        f[index + 2] = 732
                cov = f[0]

                for po in f[1:]:
                    cov = cov + '/' + str(po)
            link = str(j.cover_photo)
            p = link.find('/freeze/focal')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 'focal':
                        f[index + 1] = 1110
                        f[index + 2] = 732
                cov = f[0]
                for po in f[1:]:
                    cov = cov + '/' + str(po)
            link = str(j.cover_photo)
            p = link.find('/freeze/max')
            if p != -1:
                new_one = True
                f = link.split('/')
                for index, kl in enumerate(f):
                    if kl == 'max':
                        f[index + 1] = 1050
                cov = f[0]
                for po in f[1:]:
                    cov = cov + '/' + str(po)
            if not new_one:
                cov = j.cover_photo
            # print(cov,end = ' ')
            # print(i[5])
            j_user = User.objects.get(username=j.author)
            var = {
                'id': j.id,
                'author': j.author.username,
                'heading': j.heading,
                'content': j.content,
                'post_date': val,
                'interests': j.interests.id,
                'cover_photo': cov,
                'readtime': minutes
            }
            data.append(var)
        return Response(data)
Example #24
 def countWords(self):
     # despite the name, this returns a readtime Result (e.g. '1 min'), not a word count
     return readtime.of_text(self.body)
Example #25
 def test_plain_text_null(self):
     result = readtime.of_text(None)
     self.assertEquals(result.seconds, 0)
     self.assertEquals(result.text, u('1 min'))
     self.assertEquals(u(result), u('1 min read'))
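Together with Example #21, this pins down the edge cases: empty text is floored at 1 second and None yields 0 seconds, but both still format as '1 min' / '1 min read'.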
Example #26
 def get_readtime(self):
   result = readtime.of_text(self.body)
   return result.text 
Example #27
File: xml.py Project: macbre/wbc
def generate():
    """
    Execute the script with provided arguments
    """
    args = docopt(__doc__, version='WBC v1.0')
    logger = logging.getLogger('generate_xml')

    chapter_break = '__CHAPTER__'

    publication_ids = args['ID']
    logger.info('Generating XML for publication(s): {}'.format(publication_ids))

    xml = SphinxXML()

    # schema

    # fields are full-text searchable
    xml.add_field('title')
    xml.add_field('chapter')
    xml.add_field('content')

    # attributes are accessible via SELECT queries
    xml.add_attr('title', 'string')
    xml.add_attr('chapter', 'string')
    xml.add_attr('content', 'string')
    xml.add_attr('read_time', 'int')
    xml.add_attr('published_year', 'int')
    xml.add_attr('publication_id', 'int')
    xml.add_attr('document_id', 'int')

    xml.start()

    for publication_id in publication_ids.split(','):
        # read index.json for the publication
        index_path = 'publications/{}/index.json'.format(publication_id)
        with open(index_path) as fp:
            publication_data = json.load(fp)

        logger.info("Got {} issues for '{}'".format(
            publication_data['count'], publication_data['name'].encode('utf-8')))

        # add documents
        for issue in publication_data['issues']:
            published_year = issue['year'].split('_')[-1]  # 1951_1956

            try:
                content = get_content_stream(publication_id, issue['year'], issue['id'], chapter_break=chapter_break)
            except IOError:
                logger.error('Failed opening an issue file', exc_info=True)
                continue

            # split by chapters and index them separately
            chapters = content.getvalue().split(chapter_break)

            for chapter in chapters:
                chapter = chapter.strip()

                xml.add_document(
                    document_id=str(issue['id']),
                    title=issue['name'].encode('utf-8'),
                    chapter=chapter.split("\n")[0].strip(),
                    content=chapter,
                    read_time=str(readtime.of_text(chapter).seconds),
                    published_year=published_year,
                    publication_id=publication_id
                )

            content.close()

    xml.end()
Example #29
import codecs
import re

import nltk
import readtime
from bs4 import BeautifulSoup
from html2text import html2text
import lxml
from lxml.html.clean import Cleaner
import os.path
from os import path

companies = ["apple", "microsoft", "google", "facebook", "twitter", "amazon"]
years = range(2014, 2020)
for c in companies:
    for y in years:
        filename = c + "_" + str(y) + ".html"
        if path.exists(filename):
            file = codecs.open(filename, 'r', 'utf-8')
            document = BeautifulSoup(file.read()).get_text()
            readtime_result = readtime.of_text(document)
            # clean_html is defined below; in a runnable script the function
            # definitions would need to precede this loop
            cleaned_html_file = clean_html(document)
            corpus_arr = word_count(cleaned_html_file)  # word_count() tokenizer assumed defined elsewhere
            total_words = len(corpus_arr)  # renamed so it no longer shadows word_count()
            print(readtime_result)
            print("The total word count is: ", total_words)


def clean_html(html):

    # remove javascript
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    cleaned = re.sub(r"<script[\d\D]*?>[\d\D]*?</script>", "", cleaned)

    # remove html comments
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)