async def _plot_messages_distribution(msgs, your_name, target_name, results_directory):
    """Shows how messages are distributed.

    Runs every "messages distribution" plot once, in a fixed order, and
    awaits ``asyncio.sleep(delay)`` after each one.

    Args:
        msgs: Messages passed as the first argument to every plotting function.
        your_name (str): Your name (passed only to the per-author plots).
        target_name (str): Target's name (passed only to the per-author plots).
        results_directory (str): Directory passed to every plotting function
            as the place to save its result.
    """
    authors = (your_name, target_name)
    # (plot function, extra positional arguments placed between msgs and the directory).
    # barplot_non_text_messages used to be listed twice, which only re-rendered
    # the very same image; it is now executed once.
    plot_steps = (
        (plt.heat_map, ()),
        (plt.pie_messages_per_author, authors),
        (plt.stackplot_non_text_messages_percentage, ()),
        (plt.barplot_non_text_messages, ()),
        (plt.barplot_messages_per_weekday, authors),
        (plt.barplot_messages_per_day, ()),
        (plt.barplot_messages_per_minutes, ()),
        (plt.distplot_messages_per_hour, ()),
        (plt.distplot_messages_per_month, ()),
        (plt.distplot_messages_per_day, ()),
        (plt.lineplot_messages, authors),
    )
    for plot, extra_args in plot_steps:
        plot(msgs, *extra_args, results_directory)
        await asyncio.sleep(delay)
    log_line("Messages distribution was analysed.")
async def get_telegram_messages(your_name, target_name, loop=None, target_id=None, num=1000000):
    """Fetches a Telegram dialogue and converts it to MyMessage objects.

    Notes:
        Requires a ready-to-use Telegram secrets (id, hash etc).
        Asks for target's id in a case this parameter is None.
        Retrieves a photo album as distinct messages with photos.

    Args:
        your_name (str): Your name.
        target_name (str): Target's name.
        loop (optional): An event loop, forwarded to the client factory.
        target_id (int, optional): Target's dialogue id.
        num (int, optional): No more than num NEWEST messages will be retrieved.

    Returns:
        A list of MyMessage objects (from older messages to newer).
    """
    client_context = await _get_client(loop=loop)
    async with client_context as client:
        dialog_id = target_id
        if dialog_id is None:
            dialog_id = await _get_target_dialog_id(client)
        entity = await client.get_entity(dialog_id)
        log_line("Receiving Telegram messages...")
        raw_messages = await _retrieve_messages(client, entity, num)
        messages = []
        for raw in raw_messages:
            messages.append(_telethon_msg_to_mymessage(raw, dialog_id, your_name, target_name))
        log_line(f"{len(messages)} Telegram messages were received")
        return messages
def barplot_messages_per_minutes(msgs, path_to_save, minutes=2):
    """Saves a bar plot with the number of messages per time slot of the day.

    The slots come from ``stools.get_messages_per_minutes(msgs, minutes)``.
    Every bar is coloured by its height using a reversed "GnBu_d" palette and
    the image is written to ``barplot_messages_per_minutes.png`` in path_to_save.

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for the resulting image.
        minutes (int): Slot length in minutes.
    """
    sns.set(style="whitegrid", palette="muted")
    sns.despine(top=True)

    slot_sizes = [len(slot) for slot in stools.get_messages_per_minutes(msgs, minutes).values()]
    hour_labels = stools.get_hours()
    # position of the first slot of every hour
    hour_positions = [hour * 60 // minutes for hour in range(24)]

    smallest = min(slot_sizes)
    largest = max(slot_sizes)
    # one colour per possible bar height
    shades = sns.color_palette("GnBu_d", largest - smallest + 1)[::-1]
    bar_colors = np.array(shades)[[size - smallest for size in slot_sizes]]

    ax = sns.barplot(x=list(range(len(slot_sizes))), y=slot_sizes, edgecolor="none", palette=bar_colors)
    _change_bar_width(ax, 1.)
    ax.set(xlabel="hour", ylabel="messages")
    ax.set_xticklabels(hour_labels)
    ax.tick_params(axis='x', bottom=True, color="#A9A9A9")
    plt.xticks(hour_positions, rotation=65)

    figure = plt.gcf()
    figure.set_size_inches(20, 10)
    image_path = os.path.join(path_to_save, barplot_messages_per_minutes.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{barplot_messages_per_minutes.__name__} was created.")
    plt.close("all")
def barplot_messages_per_weekday(msgs, your_name, target_name, path_to_save):
    """Saves an overlaid bar plot of messages per weekday.

    The first layer (labelled your_name) shows all messages of a weekday, the
    second layer (labelled target_name) is drawn on top of it and shows only
    the messages whose author is target_name.

    Args:
        msgs: Messages to analyse.
        your_name (str): Legend label of the first layer.
        target_name (str): Author to count for the second layer and its label.
        path_to_save (str): Directory for ``barplot_messages_per_weekday.png``.
    """
    sns.set(style="whitegrid", palette="pastel")
    weekday_groups = list(stools.get_messages_per_weekday(msgs).values())
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    total_counts = [len(group) for group in weekday_groups]
    ax = sns.barplot(x=weekday_names, y=total_counts, label=your_name, color="b")

    # switch the colour codes so the second layer gets a different shade of "b"
    sns.set_color_codes("muted")
    target_counts = []
    for group in weekday_groups:
        target_counts.append(len([msg for msg in group if msg.author == target_name]))
    sns.barplot(x=weekday_names, y=target_counts, label=target_name, color="b")

    ax.legend(ncol=2, loc="lower right", frameon=True)
    ax.set(ylabel="messages")
    sns.despine(right=True, top=True)

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, barplot_messages_per_weekday.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{barplot_messages_per_weekday.__name__} was created.")
    plt.close("all")
def wordcloud(msgs, words, path_to_save):
    """Saves a circular word cloud of the given words.

    Every distinct word is repeated as many times as
    ``stools.get_words_countered(msgs)`` reports for it; the repeated words
    are shuffled and joined into one string fed to the word cloud generator.

    Args:
        msgs: Messages used to count the words.
        words: Words to show (duplicates are ignored).
        path_to_save (str): Directory for ``wordcloud.png``.
    """
    all_words_list = []
    words_cnt = stools.get_words_countered(msgs)
    # we need to create a huge string which contains each word as many times as it encounters in messages.
    for word in set(words):
        all_words_list.extend([word] * (words_cnt[word]))
    # The optional second argument of random.shuffle was removed in Python 3.11
    # (passing it raises TypeError); the default already uses random.random.
    random.shuffle(all_words_list)  # don't forget to shuffle !
    all_words_string = ' '.join(all_words_list)

    # the cloud will be a circle: everything outside the circle is set to 255 in the mask.
    radius = 500
    x, y = np.ogrid[:2 * radius, :2 * radius]
    mask = (x - radius) ** 2 + (y - radius) ** 2 > radius ** 2
    mask = 255 * mask.astype(int)

    word_cloud = wc.WordCloud(background_color="white", repeat=False, mask=mask)
    word_cloud.generate(all_words_string)

    plt.axis("off")
    plt.imshow(word_cloud, interpolation="bilinear")
    word_cloud.to_file(os.path.join(path_to_save, wordcloud.__name__ + ".png"))
    plt.close()
    log_line(f"{wordcloud.__name__} was created.")
def lineplot_messages(msgs, your_name, target_name, path_to_save):
    """Saves a filled line plot of the number of messages of each author per period.

    Periods come from ``_get_plot_data(msgs)`` and the x axis ticks from
    ``_get_xticks(msgs)``.

    Args:
        msgs: Messages to analyse.
        your_name (str): First author to count.
        target_name (str): Second author to count.
        path_to_save (str): Directory for ``lineplot_messages.png``.
    """
    sns.set(style="whitegrid")
    plot_data, ticks_data = _get_plot_data(msgs), _get_xticks(msgs)
    x, periods = plot_data
    xticks, xticks_labels, xlabel = ticks_data

    def count_per_period(author):
        # number of messages written by `author` in every period
        return [len([msg for msg in period if msg.author == author]) for period in periods]

    y_your = count_per_period(your_name)
    y_target = count_per_period(target_name)

    plt.fill_between(x, y_your, alpha=0.3)
    ax = sns.lineplot(x=x, y=y_your, palette="denim blue", linewidth=2.5, label=your_name)
    plt.fill_between(x, y_target, alpha=0.3)
    sns.lineplot(x=x, y=y_target, linewidth=2.5, label=target_name)

    ax.set(xlabel=xlabel, ylabel="messages")
    ax.set_xticklabels(xticks_labels)
    ax.tick_params(axis='x', bottom=True, color="#A9A9A9")
    plt.xticks(xticks, rotation=65)
    ax.margins(x=0, y=0)

    figure = plt.gcf()
    figure.set_size_inches(13, 7)
    image_path = os.path.join(path_to_save, lineplot_messages.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    plt.close("all")
    log_line(f"{lineplot_messages.__name__} was created.")
def barplot_messages_per_day(msgs, path_to_save):
    """Saves a bar plot with the number of messages of every day.

    Days come from ``stools.get_messages_per_day(msgs)``. Every bar is
    coloured by its height using a reversed "Greens_d" palette.

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for ``barplot_messages_per_day.png``.
    """
    sns.set(style="whitegrid", palette="muted")
    sns.despine(top=True)

    day_sizes = [len(day) for day in stools.get_messages_per_day(msgs).values()]
    xticks, xticks_labels, xlabel = _get_xticks(msgs)

    quietest = min(day_sizes)
    busiest = max(day_sizes)
    # one colour per possible bar height
    shades = sns.color_palette("Greens_d", busiest - quietest + 1)[::-1]
    bar_colors = np.array(shades)[[size - quietest for size in day_sizes]]

    ax = sns.barplot(x=list(range(len(day_sizes))), y=day_sizes, edgecolor="none", palette=bar_colors)
    _change_bar_width(ax, 1.)
    ax.set(xlabel=xlabel, ylabel="messages")
    ax.set_xticklabels(xticks_labels)
    ax.tick_params(axis='x', bottom=True, color="#A9A9A9")
    plt.xticks(xticks, rotation=65)

    figure = plt.gcf()
    figure.set_size_inches(20, 10)
    image_path = os.path.join(path_to_save, barplot_messages_per_day.__name__ + ".png")
    figure.savefig(image_path, dpi=500)
    log_line(f"{barplot_messages_per_day.__name__} was created.")
    plt.close("all")
def pie_messages_per_author(msgs, your_name, target_name, path_to_save):
    """Saves a pie chart with three wedges: your, target's and forwarded messages.

    Forwarded messages are counted separately; among the remaining ones every
    message whose author is not your_name is attributed to the target.

    Args:
        msgs: Messages to analyse.
        your_name (str): Your name.
        target_name (str): Target's name (used for the legend only).
        path_to_save (str): Directory for ``pie_messages_per_author.png``.
    """
    forwarded = sum(1 for msg in msgs if msg.is_forwarded)
    own_msgs = [msg for msg in msgs if not msg.is_forwarded]
    your_messages_len = sum(1 for msg in own_msgs if msg.author == your_name)
    target_messages_len = len(own_msgs) - your_messages_len

    data = [your_messages_len, target_messages_len, forwarded]
    labels = [
        f"{your_name}\n({your_messages_len})",
        f"{target_name}\n({target_messages_len})",
        f"forwarded\n({forwarded})",
    ]
    # only the "forwarded" wedge is pulled out of the pie
    explode = (.0, .0, .2)

    fig, ax = plt.subplots(figsize=(13, 8), subplot_kw=dict(aspect="equal"))
    wedges, _, autotexts = ax.pie(
        x=data,
        explode=explode,
        colors=["#4982BB", "#5C6093", "#53B8D7"],
        autopct=lambda pct: f"{pct:.1f}%",
        wedgeprops={"edgecolor": "black", "alpha": 0.8},
    )
    ax.legend(wedges, labels, loc="upper right", bbox_to_anchor=(1, 0, 0.5, 1))
    plt.setp(autotexts, size=10, weight="bold")

    image_path = os.path.join(path_to_save, pie_messages_per_author.__name__ + ".png")
    fig.savefig(image_path, dpi=600)
    plt.close("all")
    log_line(f"{pie_messages_per_author.__name__} was created.")
def barplot_words(msgs, your_name, target_name, words, topn, path_to_save):
    """Saves a grouped bar plot of the topn most used words, split by author.

    Words are ranked by the sum of both authors' counts (as reported by
    ``stools.get_words_countered``), most used first.

    Args:
        msgs: Messages to analyse.
        your_name (str): Your name.
        target_name (str): Target's name.
        words (list of str): Candidate words. The list is NOT modified.
        topn (int): How many of the most used words to plot.
        path_to_save (str): Directory for ``barplot_words.png``.
    """
    sns.set(style="whitegrid")

    your_msgs = [msg for msg in msgs if msg.author == your_name]
    target_msgs = [msg for msg in msgs if msg.author == target_name]
    your_words_cnt = stools.get_words_countered(your_msgs)
    target_words_cnt = stools.get_words_countered(target_msgs)

    # Rank a copy: sorting in place used to reorder the caller's list as a side effect.
    ranked_words = sorted(words, key=lambda w: your_words_cnt[w] + target_words_cnt[w], reverse=True)

    # two rows per word: one for each author
    df_dict = {"name": [], "word": [], "num": []}
    for word in ranked_words[:topn]:
        df_dict["word"].extend([word, word])
        df_dict["name"].append(your_name)
        df_dict["num"].append(your_words_cnt[word])
        df_dict["name"].append(target_name)
        df_dict["num"].append(target_words_cnt[word])

    ax = sns.barplot(x="word", y="num", hue="name", data=pd.DataFrame(df_dict), palette="PuBu")
    ax.legend(ncol=1, loc="upper right", frameon=True)
    ax.set(ylabel="messages", xlabel='')

    fig = plt.gcf()
    fig.set_size_inches(14, 8)
    fig.savefig(os.path.join(path_to_save, barplot_words.__name__ + ".png"), dpi=600)
    log_line(f"{barplot_words.__name__} was created.")
    plt.close("all")
def barplot_emojis(msgs, your_name, target_name, topn, path_to_save):
    """Saves a horizontal grouped bar plot of the topn most common emojis, split by author.

    Returns without plotting anything when the messages contain no emojis.

    Args:
        msgs: Messages to analyse.
        your_name (str): Your name.
        target_name (str): Target's name.
        topn (int): How many of the most common emojis to plot.
        path_to_save (str): Directory for ``barplot_emojis.png``.
    """
    sns.set(style="whitegrid")
    top_emojis = stools.get_emoji_countered(msgs).most_common(topn)
    if not top_emojis:
        return

    counters = {}
    for author in (your_name, target_name):
        author_msgs = [msg for msg in msgs if msg.author == author]
        counters[author] = stools.get_emoji_countered(author_msgs)
    your_emojis_cnt = counters[your_name]
    target_emojis_cnt = stools.get_emoji_countered([msg for msg in msgs if msg.author == target_name]) \
        if your_name == target_name else counters[target_name]

    # two rows per emoji: one for each author
    names, emoji_names, amounts = [], [], []
    for symbol, _ in top_emojis:
        readable = emoji.demojize(symbol)
        emoji_names += [readable, readable]
        names += [your_name, target_name]
        amounts += [your_emojis_cnt[symbol], target_emojis_cnt[symbol]]
    frame = pd.DataFrame({"name": names, "emoji": emoji_names, "num": amounts})

    ax = sns.barplot(x="num", y="emoji", hue="name", data=frame, palette="PuBu")
    ax.set(ylabel="emoji name", xlabel="emojis")
    ax.legend(ncol=1, loc="lower right", frameon=True)

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    plt.tight_layout()
    image_path = os.path.join(path_to_save, barplot_emojis.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{barplot_emojis.__name__} was created.")
    plt.close("all")
async def _plot_words_distribution(msgs, your_name, target_name, results_directory, words):
    """Shows how some words are distributed among the users.

    Draws the bar plot of the 10 most used words and then the word cloud,
    awaiting ``asyncio.sleep(delay)`` after each of them.
    """
    top_words_number = 10
    plt.barplot_words(msgs, your_name, target_name, words, top_words_number, results_directory)
    await asyncio.sleep(delay)

    plt.wordcloud(msgs, words, results_directory)
    await asyncio.sleep(delay)

    log_line("Words distribution was analysed.")
def get_telegram_secrets():
    """Reads the Telegram secrets from the config file.

    Every option missing from the "telegram_secrets" section is returned as
    an empty string.

    Returns:
        A tuple (api_id, api_hash, phone_number, session_name) of strings.
    """
    config_path = _get_config_file_name()
    parser = configparser.ConfigParser()
    parser.read(config_path, encoding="utf-8-sig")

    option_names = ("api_id", "api_hash", "phone_number", "session_name")
    api_id, api_hash, phone_number, session_name = (
        parser.get("telegram_secrets", option, fallback="") for option in option_names
    )

    log_line(f"Telegram secrets were received from {config_path} file.")
    return api_id, api_hash, phone_number, session_name
async def _plot_messages_distribution_content_based(msgs, your_name, target_name, results_directory):
    """Shows how some characteristics of messages content are distributed.

    Draws the message length line plot and then the bar plot of the 10 most
    common emojis, awaiting ``asyncio.sleep(delay)`` after each of them.
    """
    plt.lineplot_message_length(msgs, your_name, target_name, results_directory)
    await asyncio.sleep(delay)

    top_emojis_number = 10
    plt.barplot_emojis(msgs, your_name, target_name, top_emojis_number, results_directory)
    await asyncio.sleep(delay)

    log_line("Content based messages distribution was analysed.")
def store_telegram_secrets(api_id, api_hash, phone_number, session_name="Message retriever"):
    """Stores the Telegram secrets in the "telegram_secrets" section of the config file.

    The existing content of the config file is read first, so other sections
    and options are kept.

    Args:
        api_id (str): Telegram API id.
        api_hash (str): Telegram API hash.
        phone_number (str): Phone number of the account.
        session_name (str): Session name.
    """
    section = "telegram_secrets"
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")

    # ConfigParser.set raises NoSectionError for an unknown section, which is
    # what happens on the first run (no config file / no such section yet).
    if not config_parser.has_section(section):
        config_parser.add_section(section)

    config_parser.set(section, "api_id", api_id)
    config_parser.set(section, "api_hash", api_hash)
    config_parser.set(section, "session_name", session_name)
    config_parser.set(section, "phone_number", phone_number)

    with open(config_file_name, "w+", encoding="utf-8") as config_file:
        config_parser.write(config_file)
    log_line(f"Telegram secrets were stored in {config_file_name} file.")
def get_session_params():
    """Reads the parameters of the previous session from the config file.

    Every option missing from the "session_params" section is returned as an
    empty string, except dialog_id which becomes -1.

    Returns:
        A tuple (dialog_id, vkopt_file, words_file, your_name, target_name),
        where dialog_id is an int and the rest are strings.
    """
    config_path = _get_config_file_name()
    parser = configparser.ConfigParser()
    parser.read(config_path, encoding="utf-8-sig")

    def read_option(option):
        return parser.get("session_params", option, fallback="")

    raw_dialog_id = read_option("dialog_id")
    if raw_dialog_id:
        dialog_id = int(raw_dialog_id)
    else:
        dialog_id = -1
    vkopt_file = read_option("vkopt_file")
    words_file = read_option("words_file")
    your_name = read_option("your_name")
    target_name = read_option("target_name")

    log_line(f"Session parameters were received from {config_path} file.")
    return dialog_id, vkopt_file, words_file, your_name, target_name
def store_session_params(params):
    """Stores the parameters of the current session in the config file.

    Args:
        params (dict): Must contain the keys "from_vk", "from_telegram",
            "plot_words", "your_name" and "target_name"; "dialogue" is read
            when "from_telegram" is truthy (its dialog id is taken from the
            trailing "(id=<digits>)"), "vkopt_file" when "from_vk" is truthy
            and "words_file" when "plot_words" is truthy.

    Raises:
        AssertionError: If neither source is selected or a name is empty.
        ValueError: If "dialogue" does not end with "(id=<digits>)".
    """
    section = "session_params"
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")

    assert params["from_vk"] or params["from_telegram"]

    # ConfigParser.set raises NoSectionError for an unknown section, which is
    # what happens on the first run (no config file / no such section yet).
    if not config_parser.has_section(section):
        config_parser.add_section(section)

    dialog_id = ""
    if params["from_telegram"]:
        # raw string: "\(" is an invalid escape sequence in a normal string literal
        id_match = re.search(r"\(id=([0-9]+)\)$", params["dialogue"])
        if id_match is None:
            raise ValueError(f"Can not find a dialog id in {params['dialogue']!r}")
        dialog_id = id_match.group(1)
    config_parser.set(section, "dialog_id", dialog_id)
    config_parser.set(section, "vkopt_file", params["vkopt_file"] if params["from_vk"] else "")
    config_parser.set(section, "words_file", params["words_file"] if params["plot_words"] else "")

    assert params["your_name"] and params["target_name"]
    config_parser.set(section, "your_name", params["your_name"])
    config_parser.set(section, "target_name", params["target_name"])

    with open(config_file_name, "w+", encoding="utf-8") as config_file:
        config_parser.write(config_file)
    log_line(f"Session parameters were stored in {config_file_name} file.")
def distplot_messages_per_hour(msgs, path_to_save):
    """Saves a histogram of messages per hour of the day (one bin per hour).

    Args:
        msgs: Messages to analyse; the hour is read from ``msg.date.hour``.
        path_to_save (str): Directory for ``distplot_messages_per_hour.png``.
    """
    sns.set(style="whitegrid")
    hours = [msg.date.hour for msg in msgs]

    ax = sns.distplot(hours, bins=range(25), color="m", kde=False)
    ax.set_xticklabels(stools.get_hours())
    ax.set(xlabel="hour", ylabel="messages")
    ax.margins(x=0)
    plt.xticks(range(24), rotation=65)
    plt.tight_layout()

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, distplot_messages_per_hour.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{distplot_messages_per_hour.__name__} was created.")
    plt.close("all")
def stackplot_non_text_messages_percentage(msgs, path_to_save):
    """Saves a stack plot with the share of every non-text message type per period.

    The groups returned by ``stools.get_non_text_messages_grouped`` are
    normalized in place, so that the values of one period sum up to 1
    (or all stay 0 when the period has no non-text messages).

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for
            ``stackplot_non_text_messages_percentage.png``.
    """
    sns.set(style="whitegrid", palette="muted")
    colors = ['y', 'b', 'c', 'r', 'g', 'm']

    plot_data, ticks_data = _get_plot_data(msgs), _get_xticks(msgs)
    x, periods = plot_data
    xticks, xticks_labels, xlabel = ticks_data
    stacks = stools.get_non_text_messages_grouped(periods)

    # Normalize values
    periods_number = len(stacks[0]["groups"])
    for idx in range(periods_number):
        period_total = sum(stack["groups"][idx] for stack in stacks)
        for stack in stacks:
            if period_total:
                stack["groups"][idx] /= period_total
            else:
                stack["groups"][idx] = 0

    layers = [stack["groups"] for stack in stacks]
    layer_names = [stack["type"] for stack in stacks]
    plt.stackplot(x, *layers, labels=layer_names, colors=colors, alpha=0.7)
    plt.margins(0, 0)
    plt.xticks(xticks, rotation=65)
    plt.yticks([i / 10 for i in range(0, 11, 2)])

    ax = plt.gca()
    ax.set_xticklabels(xticks_labels)
    ax.set_yticklabels([f"{i}%" for i in range(0, 101, 20)])
    ax.tick_params(axis='x', bottom=True, color="#A9A9A9")
    ax.set(xlabel=xlabel, ylabel="non-text messages")

    # https://stackoverflow.com/a/4701285
    # Shrink current axis by 10% to leave room for the legend
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, stackplot_non_text_messages_percentage.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{stackplot_non_text_messages_percentage.__name__} was created.")
    plt.close("all")
def distplot_messages_per_day(msgs, path_to_save):
    """Saves a histogram showing how many days had a given number of messages.

    Bins are 50 messages wide; the last bin ends at the size of the busiest day.

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for ``distplot_messages_per_day.png``.
    """
    sns.set(style="whitegrid")
    day_sizes = [len(day) for day in stools.get_messages_per_day(msgs).values()]
    busiest = max(day_sizes)
    bin_edges = [*range(0, busiest, 50), busiest]

    ax = sns.distplot(day_sizes, bins=bin_edges, color="m", kde=False)
    ax.set(xlabel="messages", ylabel="days")
    ax.margins(x=0)

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, distplot_messages_per_day.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{distplot_messages_per_day.__name__} was created.")
    plt.close("all")
def heat_map(msgs, path_to_save, seasons=False):
    """Saves a month x day-of-month heat map of the number of messages per day.

    Cells of the calendar that never receive a value stay NaN. A second
    heat map is drawn over the first one with a single light colour for the
    cells whose value is 0 (days without messages).

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for ``heat_map.png``.
        seasons (bool): If True, horizontal lines are drawn between seasons.
    """
    sns.set(style="whitegrid")
    messages_per_day = stools.get_messages_per_day(msgs)
    months = stools.date_months_to_str_months(stools.get_months(msgs))

    # one row of 31 NaN cells per month
    heat_calendar = {}
    for month in months:
        heat_calendar[month] = np.full(31, np.nan, dtype=np.float64)
    for day, day_msgs in messages_per_day.items():
        heat_calendar[stools.str_month(day)][day.day - 1] = len(day_msgs)

    busiest = max(len(day_msgs) for day_msgs in messages_per_day.values())
    data = np.array(list(heat_calendar.values()))
    # True for every non-zero cell (NaN included), i.e. only zero cells stay visible
    mask = data.astype(bool)

    cmap = cm.get_cmap("Purples")
    center = busiest * 0.4
    ax = sns.heatmap(data=data, cmap=cmap, center=center, xticklabels=True, yticklabels=True,
                     square=True, linewidths=.2, cbar_kws={"shrink": .5})
    # builds a mask to highlight empty days
    sns.heatmap(data, mask=mask, xticklabels=range(1, 32), yticklabels=months, linewidths=.2,
                cbar=False, cmap=mpl_colors.ListedColormap(["#ffffe6"]))

    if seasons:
        # divides heatmap on seasons
        season_lines = []
        for row, month in enumerate(months):
            if month.month % 3 == 0 and row != 0:
                season_lines.append(row)
        ax.hlines(season_lines, *ax.get_xlim(), colors=["b"])

    ax.set(xlabel='day', ylabel="month")
    ax.margins(x=0)
    plt.tight_layout()

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, heat_map.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    plt.close("all")
    log_line(f"{heat_map.__name__} was created.")
def get_mymessages_from_file(your_name, target_name, opt_file_name):
    """Retrieves a list of MyMessage representations of messages from a file generated by VkOpt GChrome extension.

    Notes:
        You must firstly ensure that your_name and target_name are equal to the names in opt_file_name text file.

    Args:
        your_name (str): Your name.
        target_name (str): Target's name.
        opt_file_name (str): The name of the file to read.

    Returns:
        A list of MyMessage objects.
    """
    log_line("Start reading vkOpt messages")
    with open(opt_file_name, 'r', encoding="utf8") as f:
        lines = f.readlines()
    opt_message_list = _parse_lines(lines, your_name, target_name)
    msgs = [_opt_to_mymessage(msg) for msg in opt_message_list]
    # log_line receives one formatted string, like everywhere else in the project
    log_line(f"{len(opt_message_list)} vkOpt messages were received.")
    return msgs
def distplot_messages_per_month(msgs, path_to_save):
    """Saves a histogram of messages over time, binned by the x ticks of ``_get_xticks``.

    Every message is placed by the number of days between its date and the
    date of the first message; the last bin ends at the last message.

    Args:
        msgs: Messages to analyse (the first and the last ones define the range).
        path_to_save (str): Directory for ``distplot_messages_per_month.png``.
    """
    sns.set(style="whitegrid")
    first_day = msgs[0].date.date()
    xticks, xticks_labels, xlabel = _get_xticks(msgs)

    def days_since_start(msg):
        return (msg.date.date() - first_day).days

    day_offsets = [days_since_start(msg) for msg in msgs]
    bin_edges = xticks + [days_since_start(msgs[-1])]

    ax = sns.distplot(day_offsets, bins=bin_edges, color="m", kde=False)
    ax.set_xticklabels(xticks_labels)
    ax.set(xlabel=xlabel, ylabel="messages")
    ax.margins(x=0)
    plt.xticks(xticks, rotation=65)
    plt.tight_layout()

    figure = plt.gcf()
    figure.set_size_inches(11, 8)
    image_path = os.path.join(path_to_save, distplot_messages_per_month.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{distplot_messages_per_month.__name__} was created.")
    plt.close("all")
async def _analyse(msgs, your_name, target_name, words_file, store_msgs=True):
    """Does analysis and stores results.

    Results go to "<parent of this file's directory>/results/<date>_<your_name>_<target_name>".
    Nothing is created when msgs is empty.

    Args:
        msgs: Messages to analyse.
        your_name (str): Your name.
        target_name (str): Target's name.
        words_file: Forwarded to ``_plot_all``.
        store_msgs (bool): If True, the messages are also stored in "messages.txt".
    """
    log_line("Start messages analysis process.")
    if len(msgs) == 0:
        log_line("No messages were received.")
        return

    timestamp = datetime.datetime.today().strftime('%d-%m-%y %H-%M-%S')
    this_directory = os.path.normpath(os.path.dirname(__file__))
    parent_directory = os.path.dirname(this_directory)
    results_directory = os.path.join(parent_directory, "results", f"{timestamp}_{your_name}_{target_name}")
    if not os.path.exists(results_directory):
        os.makedirs(results_directory)
    await asyncio.sleep(delay)

    if store_msgs:
        messages_path = os.path.join(results_directory, "messages.txt")
        storage.store_msgs(messages_path, msgs)
        await asyncio.sleep(delay)

    await _plot_all(msgs, your_name, target_name, results_directory, words_file)
    log_line("Done.")
def barplot_non_text_messages(msgs, path_to_save):
    """Saves a stacked bar plot of non-text messages per period, one colour per type.

    Stacking is emulated with overlapping bars: the sum of all types is drawn
    first, then one type after another is subtracted and the remainder is
    drawn on top.

    Args:
        msgs: Messages to analyse.
        path_to_save (str): Directory for ``barplot_non_text_messages.png``.
    """
    sns.set(style="whitegrid", palette="muted")
    colors = ['y', 'b', 'c', 'r', 'g', 'm']

    plot_data, ticks_data = _get_plot_data(msgs), _get_xticks(msgs, crop=False)
    _, periods = plot_data
    _, xticks_labels, xlabel = ticks_data
    bars = stools.get_non_text_messages_grouped(periods)

    # bars are overlapping, so firstly we need to sum up the all...
    heights = [0] * len(periods)
    for bar in bars:
        heights = [current + extra for current, extra in zip(heights, bar["groups"])]

    # ... plot and subtract one by one.
    for position, bar in enumerate(bars[:-1]):
        sns.barplot(x=xticks_labels, y=heights, label=bar["type"], color=colors[position])
        heights = [current - drawn for current, drawn in zip(heights, bar["groups"])]
    last_bar = bars[-1]
    ax = sns.barplot(x=xticks_labels, y=heights, label=last_bar["type"], color=colors[-1])
    _change_bar_width(ax, 1.)

    # https://stackoverflow.com/a/4701285
    # Shrink current axis by 10% to leave room for the legend
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    ax.set_xticklabels(xticks_labels, ha="right")
    ax.set(xlabel=xlabel, ylabel="messages")
    plt.xticks(rotation=65)

    figure = plt.gcf()
    figure.set_size_inches(16, 8)
    image_path = os.path.join(path_to_save, barplot_non_text_messages.__name__ + ".png")
    figure.savefig(image_path, dpi=600)
    log_line(f"{barplot_non_text_messages.__name__} was created.")
    plt.close("all")
async def _retrieve_messages(client, target_entity, num):
    """Retrieves messages from client's target_entity batch by batch and return them all.

    Args:
        client: Object whose awaitable ``get_messages`` is used to fetch batches
            (presumably a connected Telethon client - confirm against caller).
        target_entity: The dialogue to read, passed to ``client.get_messages``.
        num (int): Max number of messages to return.

    Returns:
        A list of at most num ``Message`` instances, in reversed retrieval order.

    Raises:
        ConnectionError: Re-raised (after logging) if it occurs while fetching a batch.
    """
    # never ask for more than 3000 messages in one request
    batch_size = min(3000, num)
    msgs = []
    batch = await client.get_messages(target_entity, limit=batch_size)
    while len(batch) and len(msgs) < num:
        # the next request continues from the last message of the current batch
        offset_id = batch[-1].id
        # keep Message instances only, anything else in the batch is dropped
        msgs.extend([msg for msg in batch if isinstance(msg, Message)])
        try:
            # give up on a single batch after 10 minutes
            batch = await asyncio.wait_for(client.get_messages(target_entity, limit=min(batch_size, num - len(msgs)),
                                                               offset_id=offset_id), 10*60)
        except ConnectionError:
            log_line("Internet connection was lost.")
            raise
        except asyncio.TimeoutError:
            # a timeout is not fatal: what was collected so far is returned
            log_line("Telegram timeout error.")
            break
        # progress report; an empty batch means there is nothing left to fetch
        if not len(batch):
            log_line(f"{len(msgs[:num])} (100%) messages received.")
        else:
            log_line(f"{len(msgs[:num])} ({len(msgs[:num])/batch.total*100:.2f}%) messages received.")
    # NOTE(review): batches presumably arrive newest-first, so reversing gives oldest-first - confirm
    return msgs[:num][::-1]
def get_words(file_path):
    """Reads words from a text file, one word per line.

    A line is kept only if, after stripping surrounding whitespace, it is not
    empty and consists solely of alphabetic characters, apostrophes (') and
    backticks (`).

    Args:
        file_path (str): Path to the UTF-8 (optionally with BOM) words file.

    Returns:
        A list of accepted words in file order.
    """
    def is_word(candidate):
        # all() over an empty string is True, so blank lines used to slip
        # through as empty "words"; reject them explicitly.
        return bool(candidate) and all(ch.isalpha() or ch in "'`" for ch in candidate)

    with open(file_path, 'r', encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        words = [word for word in stripped_lines if is_word(word)]
    log_line(f"{len(words)} words were received from {file_path} file.")
    return words
def get_msgs(file_path):
    """Loads messages from a JSON file.

    The file must contain a JSON array; every element is converted with
    ``MyMessage.from_dict``.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        A list of MyMessage objects.
    """
    with open(file_path, 'r') as f:
        raw_messages = json.load(f)
        msgs = []
        for raw in raw_messages:
            msgs.append(MyMessage.from_dict(raw))
    log_line(f"{len(msgs)} messages were received from {file_path} file.")
    return msgs
def _parse_lines(lines, your_name, target_name, num=1000000):
    """Parses given text lines and retrieves a message list.

    Notes:
        Parses messages from vkOpt GChrome extension with a DEFAULT message format.
        Appropriate message format is "%username% (%date%): %message%"
        Appropriate datetime format is "HH:MM:ss dd/mm/yyyy".
        More than one nested forwarded messages are counted as ONE forwarded message.
        ... As well as a message with multiple photos counts as ONE photo.
        ... As well as a message with multiple audio files ... what the heck?
        The first line is modified in place (a leading BOM character is removed).

    Args:
        lines (list of strings): Text lines of the file.
        your_name (str): Your name.
        target_name (str): Target's name.
        num (int): Max number of the messages to retrieve.

    Returns:
        List of dictionaries such as:
        {
            "text": text of the message (str),
            "has_forwards": flag (bool),
            "attachment": text (str) of the attachment (without first line),
            "date": date of the message (datetime),
            "author": your_name or target_name (str)
        }
        An empty list is returned when a title line matches neither name.
    """
    if not lines:
        return []
    lines[0] = lines[0].replace('\ufeff', '')  # remove start character
    # Raw strings: "\(" is an invalid escape sequence in a normal string literal.
    date_pattern = r"[0-2][0-9]:[0-5][0-9]:[0-5][0-9] [0-3][0-9]/[0-1][0-9]/([0-9]{4})"
    date_regex = re.compile(date_pattern)
    title_ending_regex = re.compile(r" \(" + date_pattern + r"\):\n$")
    # Names are escaped so that characters like "(", "+" or "." in a name are
    # matched literally instead of being interpreted as regex syntax.
    msg_title_regex = re.compile(
        r"^\t*(" + re.escape(your_name) + "|" + re.escape(target_name) + r") \(" + date_pattern + r"\):\n$")

    msgs = []
    current_msg = {"text": "", "has_forwards": False, "attachment": ""}
    i = 0
    while i < len(lines) and len(msgs) <= num:
        line = lines[i]
        if line.startswith("Attachments:["):
            # the attachment text starts on the next line (if the file is not truncated here)
            i += 1
            if i < len(lines):
                current_msg["attachment"] = lines[i]
        else:
            search = title_ending_regex.search(line)
            if search is not None and search.span()[1] == len(line):
                if line[0].isspace():
                    # an indented title belongs to a forwarded message
                    current_msg["has_forwards"] = True
                    i += 1
                else:
                    if not msg_title_regex.match(line):
                        log_line(
                            f"[{line}] DOES NOT MATCH ANY SUGGESTED NAME! NO VK OPT MESSAGES WILL BE RECEIVED!"
                        )
                        return []
                    # removing redundant spaces after the message
                    current_msg["text"] = current_msg["text"][:-3]
                    msgs.append(current_msg)
                    current_msg = {
                        "text": "",
                        "has_forwards": False,
                        "attachment": ""
                    }
                    current_msg["date"] = datetime.strptime(
                        date_regex.search(line).group(), "%H:%M:%S %d/%m/%Y")
                    current_msg["author"] = your_name if line.startswith(your_name) else target_name
            elif not current_msg["has_forwards"]:
                if current_msg["attachment"]:
                    current_msg["attachment"] += line
                else:
                    current_msg["text"] += line
        i += 1
    if i > 0:
        current_msg["text"] = current_msg["text"] if current_msg["attachment"] else current_msg["text"][:-3]
        msgs.append(current_msg)
    # first message is just a template and should be removed
    return msgs[1:]
def store_msgs(file_path, msgs):
    """Writes the messages to a file as JSON.

    Objects the json module can not serialize are stored as ``str(obj)``.

    Args:
        file_path (str): Path of the file to (over)write.
        msgs (list): Messages to store.
    """
    with open(file_path, 'w') as output_file:
        json.dump(msgs, output_file, default=str)
    stored_number = len(msgs)
    log_line(f"{stored_number} messages were stored in {file_path} file.")
def get_words(file_path):
    """Reads a text file and returns every line stripped of surrounding whitespace.

    Args:
        file_path (str): Path to the UTF-8 (optionally with BOM) words file.

    Returns:
        A list with one string per line of the file, in file order.
    """
    with open(file_path, 'r', encoding="utf-8-sig") as f:
        words = []
        for line in f:
            words.append(line.strip())
    log_line(f"{len(words)} words were received from {file_path} file.")
    return words