def generate_topics_words_tweets_stats(*, vocab_tweets_index_path: str, topics_words_csv_path: str, topics_words_md_path: str):
    """Write a markdown table of topic words/names, how many tweets mention each, and its meaning."""
    topics_words = []
    topics_words_index = {}
    for i, r in enumerate(dictlines_from_csv(topics_words_csv_path)):
        topics_words.append(r)
        topics_words_index[r['Word']] = i
    for row in next_vocab_tweets_index(vocab_tweets_index_path):
        if row['Word'] in topics_words_index:
            i = topics_words_index[row['Word']]
            topics_words[i]['tweets'] = len(row['Tweet_Ids'].split(','))
    fields = ['Word/Name', 'Number of Tweets', 'Meaning']
    fields_align = [MdCellAlign.left, MdCellAlign.left, MdCellAlign.center]
    with open(topics_words_md_path, "w", encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Study Tweets Containing Topics/Words')}

{table_header(fields, fields_align)}
"""
        f_out.writelines(md_static_part)
        for word in topics_words:
            hanzi = word['Word']
            # words that never appear in the vocab->tweets index default to 0 tweets
            row = [link(hanzi, f"{hanzi}.md"), str(word.get('tweets', 0)), word['meaning']]
            f_out.writelines(f"{table_row(row)}\n")

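# The CSV helpers used above are defined elsewhere in the project. A minimal
# sketch of what they are assumed to do, inferred from how the callers use the
# 'Word' and 'Tweet_Ids' columns (the real implementations may differ):
#
#   import csv
#
#   def dictlines_from_csv(csv_path: str):
#       """Yield each CSV row as a dict keyed by the header row."""
#       with open(csv_path, newline='', encoding='utf-8') as f:
#           yield from csv.DictReader(f)
#
#   def next_vocab_tweets_index(index_path: str):
#       """Yield rows of the word->tweets index; each row is assumed to carry
#       at least a 'Word' column and a comma-separated 'Tweet_Ids' column."""
#       with open(index_path, newline='', encoding='utf-8') as f:
#           yield from csv.DictReader(f)
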
def write_word_card(*, title: str, tweet_data: List[Mapping[str, Mapping]], cards_study_folder: str, tweets_per_page: int):
    num_tweets = len(tweet_data)
    pages = (num_tweets // tweets_per_page) + (1 if num_tweets % tweets_per_page > 0 else 0)
    logging.info(
        f"Generating {title}, number of tweets {len(tweet_data)}, pages = {pages}"
    )
    tweets_title_heading = f"Tweets containing {title}"
    word_static_md_part = f"""{h1(title)}

Search {link('mdbg', mdbg_link(title=title))} for definition

Search {link('wiktionary', wiktionary_link(title=title))} for definition

{h3(tweets_title_heading)}
"""
    for page in range(pages):
        word_md_filepath = f"{cards_study_folder}/{word_md_filename(title=title, page=page)}"
        logging.info(f"page {page}, pages {pages}, {word_md_filepath}")
        with open(word_md_filepath, "w", encoding='utf-8') as f_out:
            link_text = word_previous_next_links(title=title, page=page, pages=pages)
            if link_text:
                f_out.write(f"{link_text}\n")
            f_out.writelines(word_static_md_part)
            low = page * tweets_per_page
            last = low + tweets_per_page
            high = num_tweets if last > num_tweets else last
            for i in range(low, high):
                tweet = tweet_data[i]
                f_out.write(f"{hr()}\n")
                date_source = f"{tweet['Date']} ~ {tweet['Source']}"
                f_out.write(f"{h5(date_source)}\n")
                tweet_text = tweet['Tweet']
                f_out.write(f"{blockquote(tweet_text)}\n")
                f_out.write(
                    f"\n{link('Google Translation', googtrans_link(source_text=tweet_text))}\n"
                )
                if len(tweet['Words']) > 1:
                    f_out.write(
                        f"{h5('Other Words/Names of Interest in the Above Tweet')}\n"
                    )
                    buffer = []
                    other_words = sorted(tweet['Words'].difference({title}))
                    for other_word in other_words:
                        buffer.append(link(other_word, f"{other_word}.md"))
                    f_out.write(f"{', '.join(buffer)}\n")
            if link_text:
                f_out.write(f"____\n\n{link_text}\n")

def word_previous_next_links(*, title: str, page: int, pages: int) -> str:
    """
    Build the previous/next navigation links for a word's paginated tweet cards.

    :param title: the word/name the pages belong to
    :param page: zero-based index of the current page
    :param pages: total number of pages
    :return: markdown link(s), or an empty string when there is only one page
    """
    if pages == 1:
        return ''
    next_page = link("Next Page", f"{title}-{page + 1:02}.md")
    prev_page = link("Previous Page", word_md_filename(title=title, page=page - 1))
    if page == 0:
        return next_page
    elif page == pages - 1:
        return prev_page
    else:
        return f"{prev_page} | {next_page}"

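# `word_md_filename` is defined elsewhere. Judging from the links emitted in
# this module (`{title}.md` for the first page, `{title}-NN.md` for later
# pages), it is assumed to behave roughly like this sketch; the real helper
# may differ:
#
#   def word_md_filename(*, title: str, page: int) -> str:
#       """First page gets a bare '<title>.md'; later pages get a zero-padded suffix."""
#       return f"{title}.md" if page == 0 else f"{title}-{page:02}.md"
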
def generate_vocab_tweets_stats(*, vocab_tweets_index_path: str, cards_study_folder: str):
    vocab_tweets = [(row['Word'], len(row['Tweet_Ids'].split(',')))
                    for row in next_vocab_tweets_index(vocab_tweets_index_path)]
    vocab_tweets.sort(key=lambda data: data[1], reverse=True)
    fields = ['Rank', 'Word/Name', 'Number of Tweets']
    fields_align = [MdCellAlign.left, MdCellAlign.center, MdCellAlign.center]
    with open(f"{cards_study_folder}/words_tweets_stats.md", "w", encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Study Words/Names Statistics')}

{table_header(fields, fields_align)}
"""
        f_out.writelines(md_static_part)
        rank = 1
        for word, num_tweets in vocab_tweets:
            row = [str(rank), link(word, f"{word}.md"), str(num_tweets)]
            f_out.writelines(f"{table_row(row)}\n")
            rank += 1

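# The markdown helpers (h1, h3, h5, hr, blockquote, link, table_header,
# table_row, MdCellAlign) come from the project's own markdown module. A
# minimal sketch of the subset used here, inferred from standard Markdown and
# the way the helpers are called (the real module may differ):
#
#   from enum import Enum
#
#   class MdCellAlign(Enum):
#       left = ':---'
#       center = ':---:'
#       right = '---:'
#
#   def h1(text: str) -> str:
#       return f"# {text}"
#
#   def link(text: str, url: str) -> str:
#       return f"[{text}]({url})"
#
#   def table_row(cells) -> str:
#       return f"| {' | '.join(cells)} |"
#
#   def table_header(fields, aligns) -> str:
#       return f"{table_row(fields)}\n{table_row([a.value for a in aligns])}"
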
def generate_companies_tweets_stats(*, vocab_tweets_index_path: str, cards_study_folder: str, companies_csv_path: str):
    companies_tickers = []
    companies_index = {}
    for i, r in enumerate(dictlines_from_csv(companies_csv_path)):
        companies_tickers.append(r)
        companies_index[r['Chinese Name']] = i
    for row in next_vocab_tweets_index(vocab_tweets_index_path):
        if row['Word'] in companies_index:
            i = companies_index[row['Word']]
            companies_tickers[i]['tweets'] = len(row['Tweet_Ids'].split(','))
    fields = ['Chinese Name', 'English Name', 'U.S. Ticker', 'Number of Tweets']
    fields_align = [MdCellAlign.left, MdCellAlign.left, MdCellAlign.center, MdCellAlign.center]
    with open(f"{cards_study_folder}/companies_tweets_stats.md", "w", encoding='utf-8') as f_out:
        md_static_part = f"""{h1('Companies/Brands')}

{table_header(fields, fields_align)}
"""
        f_out.writelines(md_static_part)
        for company in companies_tickers:
            hanzi = company['Chinese Name']
            # companies with no matching tweets in the index default to 0
            row = [link(hanzi, f"{hanzi}.md"), company['English Name'],
                   company['Ticker'], str(company.get('tweets', 0))]
            f_out.writelines(f"{table_row(row)}\n")

def generate_curated_words_study(*, words_toml_path: str, curated_words_study_path: str):
    _md_template = [
        f"""{h1('A Subset of Curated Words Extracted From Tweets')}

To learn how the words are used in context, read the tweets by clicking or tapping on the Chinese words.

For a complete list of the curated words, see {link('here', 'words_tweets_stats.md')}.
"""
    ]
    toml_words = toml.load(words_toml_path)
    fields_align = [MdCellAlign.left, MdCellAlign.center]
    pprint(toml_words)
    for name, heading in toml_words['category-names'].items():
        _md_template.append(h2(heading))
        _md_template.append(table_header(['', ''], fields_align))
        for word in toml_words[name]:
            _md_template.append(
                table_row([link(word['hz'], f"{word['hz']}.md"), word['en']]))
    with open(curated_words_study_path, "w", encoding='utf-8') as fh:
        fh.writelines('\n'.join(_md_template))

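# The words TOML file is only read here, so its layout is implied by the code
# above: a 'category-names' table mapping each category key to its display
# heading, plus one array of {hz, en} entries per category key. An
# illustrative, made-up document matching that shape:
#
#   example = toml.loads('''
#   [category-names]
#   finance = "Finance and Markets"
#
#   [[finance]]
#   hz = "股票"
#   en = "stock; share"
#   ''')
#   # -> {'category-names': {'finance': 'Finance and Markets'},
#   #     'finance': [{'hz': '股票', 'en': 'stock; share'}]}
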
def write(self, md_filepath: str, title: str, reverse_sort: bool = True):
    self._report_data.sort(key=lambda tw: tw.tweet_date, reverse=reverse_sort)
    mdtb_rows = []
    for attrs in self._report_data:
        trans_link = googtrans_link(source_text=attrs.tweet_text)
        mdtb_rows.append(
            md.table_row([
                str(attrs.tweet_date), attrs.tweet_source,
                md.link(attrs.tweet_text, trans_link)
            ]))
    tbl = '\n'.join(mdtb_rows)
    report = f"""## {title}

Tweets with [{title}](https://en.wiktionary.org/wiki/{title}). Tap or click to check if Wiktionary has an entry for it.

| UTC Date | Tweet Source | Tweet (click or tap to see Google Translation) |
|:-----------------|:-------------|:------------------|
{tbl}
"""
    with open(md_filepath, "w", encoding='utf-8') as f_out:
        f_out.writelines(report)

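# Each entry in self._report_data only needs the three attributes read above.
# A rough sketch of that record shape (the real class in this project may be
# a namedtuple or carry more fields):
#
#   from dataclasses import dataclass
#
#   @dataclass
#   class TweetReportRow:
#       tweet_date: str     # sortable date, e.g. ISO-8601; only sorted and str()'d here
#       tweet_source: str
#       tweet_text: str
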
def new_write_word_card(summarized_tweets_words: Mapping[str, Mapping], cards_study_folder: str, tweets_per_page: int, word_tweets: WordAndTweets):
    """Write the paginated markdown card(s) for one word and the tweets that contain it."""
    tweet_data = [
        summarized_tweets_words[tweet_id] for tweet_id in word_tweets.tweet_ids
    ]
    tweet_data.sort(key=lambda tw: tw['Date'], reverse=True)
    num_tweets = len(tweet_data)
    title = word_tweets.word
    pages = (num_tweets // tweets_per_page) + (1 if num_tweets % tweets_per_page > 0 else 0)
    logging.info(
        f"Generating {title}, number of tweets {len(tweet_data)}, pages = {pages}"
    )
    tweets_title_heading = f"Tweets containing {title}"
    word_static_md_part = f"""{h1(title)}

Search {link('mdbg', mdbg_link(title=title))} for definition

Search {link('wiktionary', wiktionary_link(title=title))} for definition

{h3(tweets_title_heading)}
"""
    for page in range(pages):
        word_md_filepath = f"{cards_study_folder}/{word_md_filename(title=title, page=page)}"
        logging.info(f"page {page}, pages {pages}, {word_md_filepath}")
        with open(word_md_filepath, "w", encoding='utf-8') as f_out:
            link_text = word_previous_next_links(title=title, page=page, pages=pages)
            md_page_buffer = [
                f"{link_text}\n{word_static_md_part}" if link_text else word_static_md_part
            ]
            low = page * tweets_per_page
            last = low + tweets_per_page
            high = num_tweets if last > num_tweets else last
            for i in range(low, high):
                tweet = tweet_data[i]
                date_source = f"{tweet['Date']} ~ {tweet['Source']}"
                tweet_text = tweet['Tweet']
                md_tweet_body = f"""{hr()}
{h5(date_source)}
{blockquote(tweet_text)}

{link('Google Translation', googtrans_link(source_text=tweet_text))}
"""
                md_page_buffer.append(md_tweet_body)
                if len(tweet['Words']) > 1:
                    buffer = []
                    other_words = sorted(tweet['Words'].difference({title}))
                    for other_word in other_words:
                        buffer.append(link(other_word, f"{other_word}.md"))
                    md_other_words = f"""{h5('Other Words/Names of Interest in the Above Tweet')}
{', '.join(buffer)}
"""
                    md_page_buffer.append(md_other_words)
            if link_text:
                md_bottom_link = f"""____

{link_text}
"""
                md_page_buffer.append(md_bottom_link)
            f_out.writelines(md_page_buffer)

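# Example wiring for new_write_word_card with made-up data, to show the input
# shapes it expects: summarized_tweets_words maps a tweet id to a summary dict
# with 'Date', 'Source', 'Tweet' and a set of 'Words'; WordAndTweets is defined
# elsewhere and is assumed to expose 'word' and 'tweet_ids' and to accept them
# as keyword arguments. Paths and values below are illustrative only.
#
#   summarized = {
#       '1001': {'Date': '2020-03-01', 'Source': 'Twitter Web App',
#                'Tweet': '…', 'Words': {'股票', '基金'}},
#   }
#   new_write_word_card(summarized_tweets_words=summarized,
#                       cards_study_folder='docs/cards',
#                       tweets_per_page=25,
#                       word_tweets=WordAndTweets(word='股票', tweet_ids=['1001']))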