import os
import logging
import json

#####

import logging_factory

#####

logger_err = logging_factory.get_module_logger("file_manager_err", logging.ERROR)
logger = logging_factory.get_module_logger("file_manager", logging.DEBUG)


def count_lines_file(path: str) -> int:
    """
    Return the number of lines of a file given its path.

    :param path: str - the path to the file
    :return: int - the number of lines of the file (0 for an empty file)
    """
    line_count = 0
    with open(path) as f:
        # Enumerate from 1 so line_count ends up holding the total directly.
        # The original bound `i` inside the loop and returned `i + 1`
        # unconditionally, which raises NameError on an empty file because
        # the loop body never executes.
        for line_count, _ in enumerate(f, start=1):
            pass
    return line_count


def clear_file(save_path: str):
    """
    Given a path to a file, clears the contents of the file

    :param save_path: str - path to the file
    """
    # NOTE(review): the body of this function is not visible in this chunk
    # of the source (the text appears truncated here). Preserved exactly as
    # visible — confirm against the full file before relying on this stub.
import logging import json import os ##### import logging_factory ##### from typing import Optional ##### logger_err = logging_factory.get_module_logger("tools_err", logging.ERROR) logger = logging_factory.get_module_logger("tools", logging.DEBUG) def obtain_usernames(subr_path: str): """ Given the path of the backup, generates one .txt file containing the authors in the backup :param subr_path: str - path to the file (i.e subreddit file) """ subr_authors = set() try: with open(subr_path, "r") as input_file: for line in input_file: try: loaded = json.loads(line) author = loaded["author"] if author != "[deleted]": subr_authors.add(author) except KeyError: logger_err.error(
import json import logging import pandas as pd ##### import logging_factory import indexer ##### from elasticsearch import Elasticsearch, ConnectionTimeout, TransportError, ConnectionError from elasticsearch_dsl import Search, Q ##### logger_err = logging_factory.get_module_logger("questioner_err", logging.ERROR) logger = logging_factory.get_module_logger("questioner", logging.DEBUG) def extract_authors_info(authors_path: str): """ Given a .txt file containing the names of the authors, searches in an Elasticsearch index their corresponding information (for reddit: account identifier, username, date of creation, date of retrieval, comment and link karma punctuation). Generates a .jsonl file containing all the authors info sorted by their account id. :param authors_path: str - path to the .txt file containing the authors """ import math host, port = "localhost", 9200 es = Elasticsearch(hosts=[{"host": host, "port": port}]) search = Search(using=es, index="reddit_users") max_query_size = 50000 authors = []
import logging import gzip import json ##### import logging_factory ##### from elasticsearch import Elasticsearch, helpers, ConnectionTimeout, ConnectionError from elasticsearch.helpers import BulkIndexError ##### logger_err = logging_factory.get_module_logger("indexer_err", logging.ERROR) logger = logging_factory.get_module_logger("indexer", logging.DEBUG) def decode_file(file_handler, is_csv: bool): """ Given a file handler (for .csv and .jsonl) formats all the info and returns an index and a dictionary with the required data :param file_handler: the file handler containing the lines to be processed :param is_csv: bool - True if the file handler is for .csv files, False if it's for .jsonl files :return: yielded index (str) and a dictionary containing the account identifier, the username, the date of creation of the account, the date of retrieval and the comment and link karma punctuations """ es_fields_keys = ("acc_id", "username", "created", "updated", "comment_karma", "link_karma") # If it's a .csv file, skip the header if is_csv: try: next(file_handler)