def test_custom_wpm(self):
    text = 'some test content ' * 100

    result = readtime.of_text(text)
    self.assertEquals(result.wpm, DEFAULT_WPM)
    self.assertEquals(result.seconds, 68)
    self.assertEquals(result.text, u('2 min'))

    wpm = 50
    result = readtime.of_text(text, wpm=wpm)
    self.assertEquals(result.wpm, wpm)
    self.assertEquals(result.seconds, 360)
    self.assertEquals(type(result.seconds), int)
    self.assertEquals(result.text, u('6 min'))
    self.assertEquals(u(result), u('6 min read'))
def test_plain_text(self):
    inp = open('tests/samples/plain_text.txt').read()
    result = readtime.of_text(inp)
    self.assertEquals(result.seconds, 154)
    self.assertEquals(type(result.seconds), int)
    self.assertEquals(result.text, u('3 min'))
    self.assertEquals(u(result), u('3 min read'))
def get_article_readtime(page, settings):
    article = get_article(page, settings)
    if article.text:
        result = readtime.of_text(article.text)
        return str(result.text), {'read_time': str(result.text)}
    return 'No article found', {}
def test_transitions(self):
    word = 'word '
    for x in range(10):
        # test the maximum num words for x read time
        text = word * 265 * x
        result = readtime.of_text(text)
        self.assertEquals(result.seconds, x * 60 if x > 0 else 1)
        self.assertEquals(result.text, u('{0} min'.format(x if x > 0 else 1)))
        self.assertEquals(u(result), u('{0} min read'.format(x if x > 0 else 1)))

        # test the maximum + 1 num words, and make sure read time is x + 1
        text += 'word'
        result = readtime.of_text(text)
        self.assertEquals(result.seconds, x * 60 + 1)
        self.assertEquals(result.text, u('{0} min'.format(x + 1)))
        self.assertEquals(u(result), u('{0} min read'.format(x + 1)))
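# The transition test above implies the timing rule: at the default 265 words
# per minute, 265 * x words map to exactly x * 60 seconds, and one extra word
# rounds the estimate up. A minimal sketch of that arithmetic, inferred from
# the assertions here and in the other tests (not the library's actual code):
import math

def approx_read_seconds(num_words, wpm=265):
    # ceil(words * 60 / wpm), floored at 1 second to match the empty-text tests
    return max(1, int(math.ceil(num_words * 60.0 / wpm)))

# e.g. approx_read_seconds(265) == 60, approx_read_seconds(266) == 61,
# and approx_read_seconds(300) == 68, matching the assertions above.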
def virtual_read_wait(text: str) -> float:
    """Function to fake the reading time for the given text

    :param text: text to read
    :return: time taken to read the provided text
    """
    time_to_read = readtime.of_text(text).seconds
    wait(time_to_read * 1.3)
    return time_to_read
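# `wait` is not defined in this snippet; it presumably comes from the
# surrounding project. A minimal stand-in (an assumption, not the project's
# actual helper) so the example runs on its own:
import time

def wait(seconds: float) -> None:
    # Block for the given number of seconds, like time.sleep.
    time.sleep(seconds)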
def save(self, *args, **kwargs):
    self.slug = slugify(self.title, allow_unicode=True)
    self.read_time = readtime.of_text(self.content)
    super().save(*args, **kwargs)

    img = Image.open(self.thumbnail.path)
    if img.width > 640 or img.height > 640:
        output_size = (640, 640)
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the equivalent there.
        img.thumbnail(output_size, Image.ANTIALIAS)
        img.save(self.thumbnail.path, optimize=True)
def save(self, *args, **kwargs):
    self.readtime = str(readtime.of_text(self.title + self.content))
    if self.slug is None:
        mslug = slugify(self.title)
        exists = Post.objects.filter(slug=mslug).exists()
        count = 1
        while exists:
            count += 1
            mslug = slugify(self.title) + "-" + str(count)
            exists = Post.objects.filter(slug=mslug).exists()
        self.slug = mslug
    super().save(*args, **kwargs)
def test_can_add(self):
    inp = open('tests/samples/plain_text.txt').read()
    result1 = readtime.of_text(inp)
    self.assertEquals(result1.seconds, 154)

    inp = open('tests/samples/markdown.md').read()
    result2 = readtime.of_markdown(inp)
    self.assertEquals(result2.seconds, 236)

    result = (result1 + result2)
    self.assertEquals(result.seconds, 154 + 236)
    self.assertEquals(type(result.seconds), int)
    self.assertEquals(result.text, u('7 min'))
    self.assertEquals(u(result), u('7 min read'))
async def get_reading_time(self, url):
    try:
        # Note: requests.get() is blocking; inside a coroutine an async HTTP
        # client (e.g. aiohttp) would avoid stalling the event loop.
        html = requests.get(url).content
        paragraphs = justext.justext(html, justext.get_stoplist("English"))
        full_text = "\n\n".join(
            [p.text for p in paragraphs if not p.is_boilerplate]
        )
        result = readtime.of_text(full_text)
        if result.seconds <= RESPONSE_LIMIT_SECS:
            log.info("Article reading time under limit: {} secs, url=`{}`".format(
                result.seconds, url))
            return 0, None
        return result.seconds, result.text
    except:
        # Bare except: any failure is swallowed and the coroutine implicitly
        # returns None instead of a (seconds, text) tuple.
        pass
def get_reading_time(self):
    result = readtime.of_text(str(self.body))
    reading_time = result.minutes
    unit = " minutes"
    return str(reading_time) + unit
def get_readtime(self):
    result = readtime.of_text(self.description)
    return result.text
def get(self, request, interest_name):
    if interest_name == "yourfeed":
        user = User.objects.get(username='******')
        user_profile = profile.objects.get(user=user)
        data = []
        for i in user_profile.interest.all():
            qs = Blog.objects.filter(interests=i)
            print(i, qs)
            for j in qs:
                date = str(j.post_date).split()
                date_str = date[0]
                date_val = date_str.split('-')
                x = datetime.datetime(int(date_val[0]), int(date_val[1]), int(date_val[2]))
                result = readtime.of_text(j.content)
                minutes = result.minutes
                val = x.strftime('%Y %b %d')

                # rewrite known CDN sizing segments in the cover photo URL
                link = str(j.cover_photo)
                new_one = False
                p = link.find('/fit/t')
                if p != -1:
                    new_one = True
                    f = link.split('/')
                    for index, kl in enumerate(f):
                        if kl == 't':
                            f[index + 1] = 1110
                            f[index + 2] = 732
                    cov = f[0]
                    for po in f[1:]:
                        cov = cov + '/' + str(po)
                link = str(j.cover_photo)
                p = link.find('/freeze/focal')
                if p != -1:
                    new_one = True
                    f = link.split('/')
                    for index, kl in enumerate(f):
                        if kl == 'focal':
                            f[index + 1] = 1110
                            f[index + 2] = 732
                    cov = f[0]
                    for po in f[1:]:
                        cov = cov + '/' + str(po)
                link = str(j.cover_photo)
                p = link.find('/freeze/max')
                if p != -1:
                    new_one = True
                    f = link.split('/')
                    for index, kl in enumerate(f):
                        if kl == 'max':
                            f[index + 1] = 1050
                    cov = f[0]
                    for po in f[1:]:
                        cov = cov + '/' + str(po)
                if new_one == False:
                    cov = j.cover_photo
                # print(cov,end = ' ')
                # print(i[5])

                var = {
                    'id': j.id,
                    'author': j.author,
                    'heading': j.heading,
                    'content': j.content,
                    'post_date': val,
                    'interests': j.interests,
                    'cover_photo': cov,
                    'readtime': minutes
                }
                data.append(var)
        shuffle(data)
        return Response(data)

    # NOTE: these raw queries are built by string concatenation; parameterised
    # queries would avoid SQL injection.
    cur = conn.cursor()
    va = "SELECT * FROM blog_interest WHERE interest_name = '" + str(interest_name) + "';"
    cur.execute(va)
    p = cur.fetchone()
    print(p)
    cur.execute(
        "SELECT author_id, heading, content, post_date, interests_id,cover_photo,id FROM blog_blog WHERE interests_id = '"
        + str(p[0]) + "' ORDER BY RANDOM();")
    rows = cur.fetchall()
    data = []
    for i in rows:
        cur.execute("SELECT username FROM auth_user where id = " + str(i[0]))
        name = cur.fetchone()
        date = str(i[3]).split()
        date_str = date[0]
        date_val = date_str.split('-')
        x = datetime.datetime(int(date_val[0]), int(date_val[1]), int(date_val[2]))
        result = readtime.of_text(i[2])
        minutes = result.minutes
        val = x.strftime('%Y %b %d')

        # same cover photo URL rewriting as in the "yourfeed" branch above
        link = str(i[5])
        new_one = False
        p = link.find('/fit/t')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 't':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(i[5])
        p = link.find('/freeze/focal')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'focal':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(i[5])
        p = link.find('/freeze/max')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'max':
                    f[index + 1] = 1050
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        if new_one == False:
            cov = i[5]
        # print(cov,end = ' ')
        # print(i[5])

        var = {
            'id': i[6],
            'author': name[0],
            'heading': i[1],
            'content': i[2],
            'post_date': val,
            'interests': i[4],
            'cover_photo': cov,
            'readtime': minutes
        }
        data.append(var)
    return Response(data)
def get(self, request, blog_id):
    blog = Blog.objects.get(id=blog_id)

    # NOTE: raw SQL built by string concatenation; parameterised queries would
    # avoid SQL injection.
    cur = conn.cursor()
    cur.execute(
        "SELECT author_id, heading, content, post_date, interests_id,cover_photo FROM blog_blog WHERE id ="
        + str(blog_id))
    row = cur.fetchone()
    print(row)
    cur.execute("SELECT username FROM auth_user where id = " + str(row[0]))
    name = cur.fetchone()

    date = str(row[3]).split()
    date_str = date[0]
    date_val = date_str.split('-')
    x = datetime.datetime(int(date_val[0]), int(date_val[1]), int(date_val[2]))
    result = readtime.of_text(row[2])
    minutes = result.minutes
    val = x.strftime('%Y %b %d')

    author_obj = User.objects.get(username=name[0])
    if author_obj in blog.upvotes.all():
        upvote = True
    else:
        upvote = False

    # rewrite known CDN sizing segments in the cover photo URL
    link = str(row[5])
    new_one = False
    p = link.find('/fit/t')
    if p != -1:
        new_one = True
        f = link.split('/')
        for index, kl in enumerate(f):
            if kl == 't':
                f[index + 1] = 1110
                f[index + 2] = 732
        cov = f[0]
        for po in f[1:]:
            cov = cov + '/' + str(po)
    link = str(row[5])
    p = link.find('/freeze/focal')
    if p != -1:
        new_one = True
        f = link.split('/')
        for index, kl in enumerate(f):
            if kl == 'focal':
                f[index + 1] = 1110
                f[index + 2] = 732
        cov = f[0]
        for po in f[1:]:
            cov = cov + '/' + str(po)
    link = str(row[5])
    p = link.find('/freeze/max')
    if p != -1:
        new_one = True
        f = link.split('/')
        for index, kl in enumerate(f):
            if kl == 'max':
                f[index + 1] = 1050
        cov = f[0]
        for po in f[1:]:
            cov = cov + '/' + str(po)
    if new_one == False:
        cov = row[5]

    total_upvotes = len(blog.upvotes.all())
    booah = User.objects.get(username=name[0])
    follow = Follower.objects.get(follower=booah)
    user = User.objects.get(username='******')
    is_follow = False
    if user in follow.following.all():
        is_follow = True

    var = {
        'id': blog_id,
        'author': name[0],
        'heading': row[1],
        'content': row[2],
        'post_date': val,
        'interests': row[4],
        'cover_photo': cov,
        'readtime': minutes,
        'upvote': upvote,
        'total_upvote': total_upvotes,
        'is_follow': is_follow
    }
    return Response(var)
def process(update, context):
    if update.message:
        text = update.message.text
    else:
        return

    links = find(text)
    # handling for groups, when message has no links
    if not links:  # and update.message.chat.type == "super_group":
        return
    link = links[0]
    # try:
    #     link = links[0]
    # except:
    #     update.message.reply_text("Oh! Send a valid link.")

    article = Article(link)
    article.download()
    article.parse()

    try:
        # newspaper's Article.authors is a list, so this concatenation raises
        # and the fallback below leaves the author line empty.
        author = "✍ *Author:* " + article.authors + "\n"
    except:
        author = ""

    date = "📅 *Publication Date:* "
    try:
        date += str(article.publish_date.strftime('%Y-%m-%d'))
    except:
        if article.publish_date is None:
            date = ""
        else:
            date += str(article.publish_date)

    value = article.html
    tree = fromstring(value)
    title = str(tree.findtext('.//title'))

    lang = translator.detect(title).lang
    if lang != 'en':
        text = translate(link)
        if text == 'null':
            return
        update.message.reply_text(text)
        link = find(text)[0]
        article = Article(link)
        article.download()
        article.parse()

    text = article.text
    soup = bs(value, 'lxml')
    outline = ""
    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        heading_text = heading.text.strip()
        if heading.name in ["h1", "h2"]:
            heading_text = f"*{heading_text}*"
        outline += int(heading.name[1:]) * ' ' + '- ' + heading_text + '\n'

    article.nlp()
    keywords = article.keywords
    tags = ""
    for keyword in keywords:
        tags += " #" + keyword

    summary = article.summary
    summary_points = ""
    for x in summary.splitlines():
        summary_points += "↦️ " + x + "\n"
    summary = summary_points

    read = readtime.of_text(text)

    msg = f"""🔗 *Link:* {link}\n{author}{date}\n🚩 *Title: {title}*\n\n🗨 *Summary:*\n _{summary}_\n"""
    msg += f"""🎋 *Outline: * \n{outline}\n"""
    msg += f"""🤔 *Reading Time:* {read}\n""".replace("min", "mins")
    msg += f"""📑 *Tags:* {tags}\n """

    query = urllib.parse.quote(msg.replace('*', '**').replace('_', '__'))
    share_url = 'tg://msg_url?url=' + query
    button_list = [
        InlineKeyboardButton('Add to reading list', callback_data=1),
        InlineKeyboardButton("📬 Share", url=share_url)
    ]
    reply_markup = InlineKeyboardMarkup(build_menu(button_list, n_cols=2))
    update.message.reply_text(
        msg, parse_mode=telegram.ParseMode.MARKDOWN, reply_markup=reply_markup)

    if update.message.chat_id != ADMIN:
        context.bot.send_message(
            chat_id="{}".format(ADMIN),
            text='{}'.format(update.message.from_user.first_name + " *sent:*\n" + msg),
            parse_mode=telegram.ParseMode.MARKDOWN)
def test_plain_text_unicode(self):
    result = readtime.of_text('Some simple text')
    self.assertEquals(unicode(result), u('1 min read'))
def time_to_read(text):
    result = readtime.of_text(text)
    return f"{result.text} read"
def parse(self, response):
    body = json.loads(response.body)
    googleClient = language.LanguageServiceClient(credentials=credentials)
    for value in body['articles']:
        if value['content'] is None:
            content = ''
        else:
            content = value['content']
        description = value['description']

        document = types.Document(
            content=content,
            type=enums.Document.Type.PLAIN_TEXT)
        sentiment = googleClient.analyze_sentiment(document=document).document_sentiment

        # NOTE: this reassignment shadows the Scrapy `response` argument.
        response = paralleldots.keywords(description)
        tags_dict = [{
            "keyword": "elon musk",
            "tag": "elon"
        }, {
            "keyword": "model 3 model Y model X model S car electric vehicles ev's evs vehicle auto industry",
            "tag": "auto"
        }, {
            "keyword": "home battery battery batteries solar panel solar panels home energy",
            "tag": "solar"
        }]
        tags = []
        try:
            for keyword in response['keywords']:
                for tag in tags_dict:
                    if (keyword['keyword'].lower() in tag['keyword'].lower()):
                        tags.append(tag['tag'])
        except KeyError:
            print("No key found")
            print(response)
        # pdb.set_trace()

        readTime = readtime.of_text(content)

        class Person:
            "This is a person class"
            age = 10

            def greet(self):
                print('Hello')

        # utc
        # local_time = pytz.timezone("America/New_York")
        # naive_datetime = datetime.strptime(value['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
        # local_datetime = local_time.localize(naive_datetime, is_dst=None)
        # utc_datetime = local_datetime.astimezone(pytz.utc)
        # utc_timestamp = datetime.replace(tzinfo=timezone.utc).timestamp()

        # Getting the current date and time
        dt = datetime.strptime(value['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
        # pdb.set_trace()
        # utc_time = dt.replace(tzinfo=timezone.utc)
        # utc_timestamp = utc_time.timestamp()
        # print(utc_timestamp)

        newsItem = NewsApiItem()
        # NOTE: despite the names, `seconds` actually holds the minute field and
        # `miliseconds` the second field; positionally they still land in the
        # correct datetime(year, month, day, hour, minute, second) slots.
        year = int(value['publishedAt'].split('-')[0])
        month = int(value['publishedAt'].split('-')[1])
        day = int(value['publishedAt'].split('-')[2].split('T')[0])
        hours = int(value['publishedAt'].split('-')[2].split('T')[1].split(':')[0])
        seconds = int(value['publishedAt'].split('-')[2].split(':')[1])
        miliseconds = int(value['publishedAt'].split('-')[2].split(':')[2].split('Z')[0])
        dt = datetime(year, month, day, hours, seconds, miliseconds).timestamp()
        # pdb.set_trace()

        newsItem['publishDate'] = value['publishedAt']
        newsItem['publisher'] = value['source']['name']
        newsItem['author'] = value['author']
        newsItem['description'] = value['description']
        newsItem['articleLink'] = value['url']
        newsItem['sentiment'] = sentiment.score
        # newsItem['magnitude'] = sentiment.magnitude
        newsItem['title'] = value['title']
        newsItem['tags'] = tags
        newsItem['topic'] = 'tesla'
        newsItem['readTime'] = readTime.seconds
        newsItem['utc'] = dt
        print(dt)
        # newsItem['author_sentiment'] = updateAuthorSentiment
        # newsItem['publisher_sentiment'] = updatePublisherSentiment

        # get the news story
        # run the sentiment analysis on that story
        # attribute sentiment to the author and store that data independently
        # attribute sentiment to the publisher and store that data independently
        # attribute sentiment to the news story as well and finish the news agg process and store data
        # print('news item', newsItem)
        yield newsItem
def save(self, *args, **kwargs):
    if not self.is_category:
        self.read_time = readtime.of_text(self.text).minutes
    return super().save(*args, **kwargs)
def get_read_time(self):
    result = readtime.of_text(self.content)
    return result.text
def extract_article_contents(self, article_number_code):
    # call the article doc
    file = docx.Document('D:/CS/web/Verily/Verily/articles/article' + str(article_number_code) + '.docx')

    # extract the article into paragraphs and substitute special characters
    for para in file.paragraphs:
        parag = re.sub(r'“', '"', para.text)
        parag = re.sub(r'”', '"', parag)
        parag = re.sub(r"’", "'", parag)
        parag = re.sub(r"‘", "'", parag)
        parag = re.sub(r" ", " ", parag)
        parag = re.sub(r"–", "-", parag)
        parag = re.sub(r"—", "-", parag)
        parag = re.sub(r"ü", "u", parag)
        parag = re.sub(r"é", "é", parag)
        self.fulltext.append(parag)
        self.text_string = self.text_string + " " + parag

    # extract the title and the articlepage.html name from the article
    self.article_title_insert = self.fulltext[0].strip()
    self.article_title_insert = re.sub('"', "'", self.article_title_insert)
    self.title_word_count = self.article_title_insert.lower().split(' ')
    self.article_title_code = "article"
    if len(self.title_word_count) >= 3:
        self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
            self.title_word_count[0] + '-' + self.title_word_count[1] + '-' + \
            self.title_word_count[2]
    elif len(self.title_word_count) == 2:
        self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
            self.title_word_count[0] + '-' + self.title_word_count[1]
    elif len(self.title_word_count) == 1:
        self.article_title_code = self.article_title_code + str(article_number_code) + '-' + \
            self.title_word_count[0]
    self.article_title_code = self.article_title_code.strip()

    # extract the author name(s) from the articles
    self.author_names_insert = self.fulltext[2].strip()
    self.author_names = self.author_names_insert.split(',')
    self.writerpage_codes = self.author_names_insert.lower().split(',')
    self.writerpage_codes = [re.sub(r' ', '', author) for author in self.writerpage_codes]
    self.writerpage_codes = ['w-' + author for author in self.writerpage_codes]

    # initialize default article information
    self.department = "Verity Today"
    self.article_date = "2020"

    # manually infer department from article number
    self.department = 'Tech' if article_number_code in [1, 2, 3, 42, 43] else self.department
    self.department = 'Sports' if article_number_code in [4, 5, 6, 7, 45, 46] else self.department
    self.department = 'Social Change' if article_number_code in [8, 9, 10, 11, 12] else self.department
    self.department = 'Mental Health' if article_number_code in [13, 14, 15, 16] else self.department
    self.department = 'Global' if article_number_code in [17, 18, 19, 20, 44] else self.department
    self.department = 'Entertainment' if article_number_code in [21, 22, 23, 24, 25, 26, 27] else self.department
    self.department = 'Education' if article_number_code in [28, 29, 30, 31, 32] else self.department
    self.department = 'Creativity' if article_number_code in [33, 34, 35, 36, 37, 38] else self.department
    self.department = 'Biz & Eco' if article_number_code in [39, 40, 41] else self.department

    with open('article_info.csv', mode='r+', newline="\n") as article_info:
        reader = csv.reader(article_info, delimiter=',')
        for row in reader:
            if str(row[0]) == str(article_number_code):
                self.department = 'Tech' if str(row[3][0]) == 'T' else self.department
                self.department = 'Sports' if str(row[3][0]) == 'S' else self.department
                self.department = 'Social Change' if str(row[3][1]) == 'o' else self.department
                self.department = 'Mental Health' if str(row[3][0]) == 'M' else self.department
                self.department = 'Global' if str(row[3][0]) == 'G' else self.department
                self.department = 'Entertainment' if str(row[3][0]) == 'E' else self.department
                self.department = 'Education' if str(row[3][1]) == 'd' else self.department
                self.department = 'Creativity' if str(row[3][0]) == 'C' else self.department
                self.department = 'Biz & Eco' if str(row[3][0]) == 'B' else self.department
                self.article_date = str(row[4])
                break

    self.department_code = (self.department.split(' '))[0].lower()
    self.articlepage_code = 'a-' + self.department_code + '-' + self.article_title_code

    # assign the date of the article
    if 1 <= article_number_code <= 46:
        self.article_date = "July 2020"
    elif 47 <= article_number_code <= 47:
        self.article_date = "August 2020"

    # calculate the time to read the article
    self.readtime = str(readtime.of_text(self.text_string))

    print("Article details and content extracted, 0")
    return "Article details and content extracted, 0"
def test_plain_text_empty(self):
    result = readtime.of_text('')
    self.assertEquals(result.seconds, 1)
    self.assertEquals(result.text, u('1 min'))
    self.assertEquals(u(result), u('1 min read'))
def save(self, *args, **kwargs):
    self.readtime = readtime.of_text(self.body).minutes
    super(Post, self).save(*args, **kwargs)  # Call the real save() method
def get(self, request):
    data = []
    user = User.objects.get(username="******")
    blogs = Blog.objects.filter(author=user)
    for j in blogs:
        date = str(j.post_date).split()
        date_str = date[0]
        date_val = date_str.split('-')
        x = datetime.datetime(int(date_val[0]), int(date_val[1]), int(date_val[2]))
        result = readtime.of_text(j.content)
        minutes = result.minutes
        val = x.strftime('%Y %b %d')

        link = str(j.cover_photo)
        new_one = False
        p = link.find('/fit/t')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 't':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(j.cover_photo)
        p = link.find('/freeze/focal')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'focal':
                    f[index + 1] = 1110
                    f[index + 2] = 732
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        link = str(j.cover_photo)
        p = link.find('/freeze/max')
        if p != -1:
            new_one = True
            f = link.split('/')
            for index, kl in enumerate(f):
                if kl == 'max':
                    f[index + 1] = 1050
            cov = f[0]
            for po in f[1:]:
                cov = cov + '/' + str(po)
        if new_one == False:
            cov = j.cover_photo
        # print(cov,end = ' ')
        # print(i[5])

        j_user = User.objects.get(username=j.author)
        var = {
            'id': j.id,
            'author': j.author.username,
            'heading': j.heading,
            'content': j.content,
            'post_date': val,
            'interests': j.interests.id,
            'cover_photo': cov,
            'readtime': minutes
        }
        data.append(var)
    return Response(data)
def countWords(self):
    # Despite the name, this returns a readtime Result object (with .seconds,
    # .minutes and .text), not a word count.
    return readtime.of_text(self.body)
def test_plain_text_null(self):
    result = readtime.of_text(None)
    self.assertEquals(result.seconds, 0)
    self.assertEquals(result.text, u('1 min'))
    self.assertEquals(u(result), u('1 min read'))
def get_readtime(self):
    result = readtime.of_text(self.body)
    return result.text
def generate():
    """ Execute the script with provided arguments """
    args = docopt(__doc__, version='WBC v1.0')
    logger = logging.getLogger('generate_xml')

    chapter_break = '__CHAPTER__'
    publication_ids = args['ID']

    logger.info('Generating XML for publication(s): {}'.format(publication_ids))

    xml = SphinxXML()

    # schema
    # fields are full-text searchable
    xml.add_field('title')
    xml.add_field('chapter')
    xml.add_field('content')

    # attributes are accessible via SELECT queries
    xml.add_attr('title', 'string')
    xml.add_attr('chapter', 'string')
    xml.add_attr('content', 'string')
    xml.add_attr('read_time', 'int')
    xml.add_attr('published_year', 'int')
    xml.add_attr('publication_id', 'int')
    xml.add_attr('document_id', 'int')

    xml.start()

    for publication_id in publication_ids.split(','):
        # read index.json for the publication
        index_path = 'publications/{}/index.json'.format(publication_id)
        with open(index_path) as fp:
            publication_data = json.load(fp)

        logger.info("Got {} issues for '{}'".format(
            publication_data['count'], publication_data['name'].encode('utf-8')))

        # add documents
        for issue in publication_data['issues']:
            published_year = issue['year'].split('_')[-1]  # 1951_1956

            try:
                content = get_content_stream(publication_id, issue['year'], issue['id'],
                                             chapter_break=chapter_break)
            except IOError:
                logger.error('Failed opening an issue file', exc_info=True)
                continue

            # split by chapters and index them separately
            chapters = content.getvalue().split(chapter_break)
            for chapter in chapters:
                chapter = chapter.strip()
                xml.add_document(
                    document_id=str(issue['id']),
                    title=issue['name'].encode('utf-8'),
                    chapter=chapter.split("\n")[0].strip(),
                    content=chapter,
                    read_time=str(readtime.of_text(chapter).seconds),
                    published_year=published_year,
                    publication_id=publication_id
                )

            content.close()

    xml.end()
# Imports added for names used below (codecs, re, readtime and BeautifulSoup
# were referenced but not imported in the original excerpt).
import codecs
import re

import nltk
import readtime
from bs4 import BeautifulSoup
from html2text import html2text
import lxml
from lxml.html.clean import Cleaner
import os.path
from os import path

companies = ["apple", "microsoft", "google", "facebook", "twitter", "amazon"]
years = range(2014, 2020)

# NOTE: clean_html() (and the word_count() helper it relies on) must be
# defined before this loop runs.
for c in companies:
    for y in years:
        filename = c + "_" + str(y) + ".html"
        if path.exists(filename):
            file = codecs.open(filename, 'r', 'utf-8')
            document = BeautifulSoup(file.read()).get_text()
            readtime_result = readtime.of_text(document)
            cleaned_html_file = clean_html(document)
            corpus_arr = word_count(cleaned_html_file)
            # renamed from `word_count`, which shadowed the helper of the same name
            total_words = len(corpus_arr)
            print(readtime_result)
            print("The total word count is: ", total_words)


def clean_html(html):
    # remove javascript
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    cleaned = re.sub(r"<script[\d\D]*?>[\d\D]*?</script>", "", cleaned)
    # remove html comments
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)