def run(self):
    """
    Create the DB and all tables

    Creates the database through ``self.dbcreate``, then opens a connection to the
    newly created database and creates every table listed below using the queries
    stored in ``TABLES``.
    :return:
    """
    self.dbcreate.create_db()
    # NOTE(review): self.conn_initial is not closed here (the close call was
    # commented out in the original) - confirm whether that is intentional.

    conn_hackernews = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )
    # Close the connection even when table creation raises
    try:
        tables = [
            TableCreator(conn=conn_hackernews, table_name=name)
            for name in (TABLE_NAME_ITEMS, TABLE_NAME_USERS, TABLE_NAME_TEXTS)
        ]

        logging.info("creating all tables")
        for table in tables:
            table_name = table.get_name()
            query_create_table = TABLES[table_name]["QUERY_CREATE_TABLE"]
            query_create_index = TABLES[table_name]["QUERY_CREATE_INDEX"]
            was_created = table.create_table(query_create_table, query_create_index)
            if was_created:
                logging.info("table created: %s", table_name)
            else:
                logging.info("table not created: %s", table_name)
    finally:
        conn_hackernews.close_conn()
def __init__(self, conn: DBConnection, table_name: str, primary_key_name: str):
    """
    Store the connection wrapper, its raw connection and cursor, and the table metadata

    :param conn: DBConnection object
    :param table_name: name of the table
    :param primary_key_name: name of the primary key column
    """
    raw_connection = conn.get_conn()
    db_cursor = conn.get_cursor()
    # Every statement is committed immediately
    raw_connection.autocommit = True

    self.conn_obj = conn
    self.conn, self.cursor = raw_connection, db_cursor
    self.table_name, self.primary_key_name = table_name, primary_key_name
def __init__(
    self,
    conn: DBConnection,
    table_name: str = TABLE_NAME_ITEMS,
    column_name: str = PRIMARY_KEY_NAME_ITEMS,
):
    """
    Store the connection wrapper, its raw connection and cursor, and the table metadata

    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the ID column, stored as ``col_name_id``
    """
    raw_connection = conn.get_conn()
    db_cursor = conn.get_cursor()

    self.conn_obj = conn
    self.conn, self.cursor = raw_connection, db_cursor
    self.table_name, self.col_name_id = table_name, column_name
def __init__(
    self,
    conn: DBConnection,
    table_name: str = TABLE_NAME_ITEMS,
    primary_key_name: str = PRIMARY_KEY_NAME_ITEMS,
):
    """
    Store the connection wrapper, its raw connection and cursor, and the table metadata

    :param conn: DBConnection object
    :param table_name: name of the table
    :param primary_key_name: name of the primary key column
    """
    raw_connection = conn.get_conn()
    db_cursor = conn.get_cursor()
    # Every statement is committed immediately
    raw_connection.autocommit = True

    self.conn_obj = conn
    self.conn, self.cursor = raw_connection, db_cursor
    self.table_name, self.primary_key_name = table_name, primary_key_name
def is_table_exists(conn: DBConnection, table_name: str) -> bool:
    """
    Check if a given table exists in the "public" schema of the DB

    :param conn: DBConnection object
    :param table_name: name of the table to check
    :return: True if table exists, False otherwise
    :raises Exception: if the lookup fails; the original error is chained as the cause
    """
    logging.info("checking if table %s exists", table_name)
    # The table name is bound as a query parameter (it is compared as a value,
    # not used as an identifier), so quotes or "%" in the name cannot alter the
    # statement.
    query = """
        SELECT EXISTS (
            SELECT FROM pg_tables
            WHERE schemaname = 'public' AND tablename = %s
        );
    """
    try:
        cursor = conn.get_cursor()
        cursor.execute(query, (table_name,))
        exists = cursor.fetchone()[0]
    except Exception as e:
        logging.error("exception: %s", e)
        raise Exception("Exception: {}".format(e)) from e

    if exists:
        logging.info("table %s exists, skipping", table_name)
    else:
        logging.info("table %s does not exist, creating", table_name)
    return exists
def all_values_exist(conn: DBConnection, table_name: str, column_name: str, values: tuple) -> bool:
    """
    Check if all provided values exist in a given table in a given column

    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param values: tuple of all values to check existence of
    :return: True if all values exist (trivially True for no values), False otherwise
    """
    # Duplicates in the input or in the column must not skew the comparison,
    # so distinct values are compared on both sides.
    distinct_values = tuple(set(values))
    if not distinct_values:
        # Nothing to look up; an empty "IN ()" list is also invalid SQL
        logging.info(
            "no values provided for column %s in table %s, nothing to check",
            column_name,
            table_name,
        )
        return True

    logging.info(
        "checking if range of values [%s, %s] exists in column %s in table %s",
        min(distinct_values),
        max(distinct_values),
        column_name,
        table_name,
    )
    query = "SELECT COUNT(DISTINCT {column}) FROM {table} WHERE {column} IN %s;"
    query_sql = sql.SQL(query).format(
        column=sql.Identifier(column_name),
        table=sql.Identifier(table_name),
    )
    cursor = conn.get_cursor()
    # The whole tuple is ONE parameter: psycopg2 adapts a Python tuple to a SQL
    # value list suitable for the IN operator.
    cursor.execute(query_sql, (distinct_values,))
    found_count = cursor.fetchone()[0]
    # All values exist when every distinct input value was matched
    return found_count == len(distinct_values)
def is_value_exists(conn: DBConnection, table_name: str, column_name: str, value: Any) -> bool:
    """
    Tell whether a value is present in a column of a table

    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param value: value to look for
    :return: True if the value exists, False otherwise
    """
    logging.info(
        "checking if value %s exists in column %s in table %s",
        value,
        column_name,
        table_name,
    )
    template = sql.SQL("SELECT EXISTS(SELECT 1 FROM {table} WHERE {column} = %s);")
    statement = template.format(
        table=sql.Identifier(table_name),
        column=sql.Identifier(column_name),
    )

    db_cursor = conn.get_cursor()
    db_cursor.execute(statement, (value, ))
    rows = db_cursor.fetchall()
    found = rows[0][0]

    outcome = "check: value %s exists" if found else "check: value %s does not exist"
    logging.info(outcome, value)
    return found
def run(self):
    """
    Run the download task using Luigi for the user IDs in ``self.user_ids``

    Each user is fetched through the user API and inserted into the users table;
    users for which the API returns None are skipped.
    :return:
    """
    logging.info("starting luigi task: %s", self.__class__)
    conn = DBConnection(user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS)
    # Close the connection even when a download or an insert raises
    try:
        user_inserter = UserInserter(conn, TABLE_NAME_USERS, PRIMARY_KEY_NAME_USERS)
        user_api = UserAPI()

        for user_id in tqdm(self.user_ids):
            current_user = user_api.get_user(user_id=user_id)
            if current_user is not None:
                user_inserter.insert_user(current_user)
    finally:
        conn.close_conn()

    logging.info("finished task: %s", self.__class__)
def data_generator(conn: DBConnection):
    """
    Yield (id, title, text) rows of items that have no matching row in the texts table

    Rows are read through the named cursor "compute_sentiment_cursor" in batches of
    ``args.batch_size`` and yielded one at a time, ordered by id.
    :param conn: DBConnection object
    :return: generator of rows
    """
    server_cursor = conn.get_named_cursor("compute_sentiment_cursor")
    server_cursor.itersize = args.batch_size

    # Skip items whose id already appears as id_item in the texts table
    pending_items_template = """
        SELECT id, title, text FROM {table_items}
        WHERE NOT EXISTS (
            SELECT FROM {table_texts} WHERE id = {table_texts}.id_item
        )
        ORDER BY id;
    """
    statement = sql.SQL(pending_items_template).format(
        table_items=sql.Identifier(TABLE_NAME_ITEMS),
        table_texts=sql.Identifier(TABLE_NAME_TEXTS),
    )
    server_cursor.execute(statement)

    batch = server_cursor.fetchmany(args.batch_size)
    while batch:
        yield from batch
        batch = server_cursor.fetchmany(args.batch_size)
def main():
    """
    Stream every value of the items text column through the text preprocessor

    :return:
    """
    db = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS)
    text_column = TABLE_ITEMS["COLUMN_NAME_TEXT"]

    # Generator over the texts plus the count of values in the same column
    texts = get_column_values(db, TABLE_NAME_ITEMS, text_column)
    text_count = get_value_count_in_column(db, TABLE_NAME_ITEMS, text_column)

    # Process all texts
    preprocessor = TextPreprocessor()
    preprocessor.process_multiprocessing(texts, args.filename, text_count, args.batch_size)
def run(self):
    """
    Run the download task using Luigi for the item IDs in ``self.ids_to_download``

    Each item is fetched through the item API and inserted into the items table;
    items for which the API returns None are skipped.
    :return:
    """
    logging.info("starting luigi task: %s", self.__class__)
    conn = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )
    # Close the connection even when a download or an insert raises
    try:
        item_api = ItemAPI()
        item_inserter = ItemInserter(conn, TABLE_NAME_ITEMS, PRIMARY_KEY_NAME_ITEMS)

        for item_id in tqdm(self.ids_to_download):
            current_item = item_api.get_item(item_id=item_id)
            if current_item is not None:
                item_inserter.insert_item(current_item)
    finally:
        conn.close_conn()

    logging.info("finished task: %s", self.__class__)
def get_value_count_in_column(conn: DBConnection, table_name: str, column_name: str) -> int:
    """
    Return the result of COUNT(column) for a column of a table

    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :return: count of values
    """
    logging.info("getting value count in column: %s, table: %s", column_name, table_name)

    statement = sql.SQL("SELECT COUNT({column}) FROM {table}").format(
        column=sql.Identifier(column_name),
        table=sql.Identifier(table_name),
    )
    db_cursor = conn.get_cursor()
    db_cursor.execute(statement)
    count_row = db_cursor.fetchone()
    return count_row[0]
def get_column_values(
    conn: DBConnection,
    table_name: str,
    column_name: str,
    limit: Optional[int] = None,
    fetch_size: int = 10000,
    cursor_name: Optional[str] = None,
) -> Optional[Iterable[Any]]:
    """
    Get all values in a given column in a given table

    This is a generator function: nothing is executed until the first value is requested.
    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param limit: maximum number of values to return (None or 0 means no limit)
    :param fetch_size: number of rows to fetch in one batch
    :param cursor_name: optional name of the named cursor; when omitted, a name is
        derived from the current timestamp on every call
    :return: generator of values (if any) in the column
    """
    logging.info("getting all values for column: %s", column_name)

    # The default name must be computed per call. A timestamp in the signature
    # is evaluated once at definition time, so every call would reuse the same
    # named-cursor name.
    if cursor_name is None:
        cursor_name = str(datetime.now())

    cursor = conn.get_named_cursor(str(cursor_name))
    cursor.itersize = fetch_size

    if limit:
        query = "SELECT {column} FROM {table} LIMIT %s;"
    else:
        query = "SELECT {column} FROM {table};"
    query_sql = sql.SQL(query).format(
        column=sql.Identifier(column_name),
        table=sql.Identifier(table_name),
    )
    cursor.execute(query_sql, [limit] if limit else None)

    # Stream the result in batches, yielding the single selected column of each row
    while True:
        rows = cursor.fetchmany(fetch_size)
        if not rows:
            break
        for row in rows:
            yield row[0]
def __init__(self, conn: DBConnection, db_name: str):
    """
    Store the database name together with the cursor and raw connection of ``conn``

    :param conn: DBConnection object
    :param db_name: name of the database
    """
    db_cursor = conn.get_cursor()
    raw_connection = conn.get_conn()

    self.db_name = db_name
    self.cursor, self.conn = db_cursor, raw_connection
def __init__(self, conn: DBConnection, table_name: str):
    """
    Store the table name and connection wrapper, and open a cursor on the raw connection

    :param conn: DBConnection object
    :param table_name: name of the table
    """
    raw_connection = conn.get_conn()
    # Autocommit is switched on before the cursor is created
    raw_connection.autocommit = True

    self.table_name = table_name
    self.conn_object = conn
    self.conn = raw_connection
    self.cursor = raw_connection.cursor()
def main():
    """
    Set up the DB and tables, download items for a given item ID range, insert them into the DB

    The requested range is ``args.startid`` .. ``args.endid`` (both inclusive). IDs
    already present in the items table are skipped; the remaining IDs are split into
    chunks, one Luigi task per chunk. When ``args.download_users`` is "y", tasks are
    also added for users that appear in the items table but not in the users table.
    :return:
    """
    # Set up DB
    setup = Setup()
    setup.run()

    conn = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS)
    # The connection is only needed to plan the work; close it before the Luigi
    # tasks (which open their own connections) are started.
    try:
        desired_ids = set(range(args.startid, args.endid + 1))

        # Get all distinct IDs (if any) from the DB. Both bounds are inclusive,
        # matching desired_ids, so an already stored end ID is not downloaded again.
        cursor = conn.get_cursor()
        query = "SELECT DISTINCT {} FROM {} WHERE {} >= %s AND {} <= %s;"
        id_column = sql.Identifier(PRIMARY_KEY_NAME_ITEMS)
        query_sql = sql.SQL(query).format(
            id_column,
            sql.Identifier(TABLE_NAME_ITEMS),
            id_column,
            id_column,
        )
        # Bind the bounds directly: min()/max() would fail on an empty range
        cursor.execute(query_sql, (args.startid, args.endid))
        ids_in_db = {row[0] for row in cursor.fetchall()}

        item_ids_to_download = sorted(desired_ids - ids_in_db)

        # If no items to download, exit
        if not item_ids_to_download:
            raise SystemExit(0)

        # Split item id list into chunks for each worker
        chunk_size_items = int(ceil(len(item_ids_to_download) / args.workers))
        item_ids_to_download_chunks = chunk_for_size(item_ids_to_download, chunk_size_items)
        logging.info("item ranges for jobs: %s", item_ids_to_download_chunks)

        # For each chunk, create a new Luigi task; one worker per item chunk
        task_list = [
            TaskDownloadItems(ids_to_download=chunk)
            for chunk in item_ids_to_download_chunks
        ]
        num_workers = len(task_list)

        # If asked to download users, add tasks for them
        if args.download_users.lower() == "y":
            user_getter = UserGetter(conn, TABLE_NAME_USERS, PRIMARY_KEY_NAME_USERS)
            user_ids_in_users_table = set(
                user_getter.get_all_user_ids(table_name=TABLE_NAME_USERS))
            user_ids_in_items_table = set(
                user_getter.get_all_user_ids(table_name=TABLE_NAME_ITEMS))
            user_ids_to_download = sorted(
                user_ids_in_items_table - user_ids_in_users_table)

            # Round the chunk size up (as for items): rounding down gave a chunk
            # size of 0, and therefore no user tasks at all, whenever there were
            # fewer users than workers.
            if user_ids_to_download:
                chunk_size_users = int(ceil(len(user_ids_to_download) / args.workers))
                for range_users in chunk_for_size(user_ids_to_download, chunk_size_users):
                    task_list.append(TaskDownloadUsers(user_ids=range_users))
    finally:
        conn.close_conn()

    luigi.build(
        task_list,
        workers=num_workers,
        local_scheduler=True,
    )
def run(
    data_generator,
    process_text: bool,
    batch_size: int = 100000,
):
    """
    Compute and insert sentiment scores into DB for all rows produced by the generator

    :param data_generator: an iterable where each element is a single row (id, title, text) from the DB
    :param process_text: whether to also preprocess the text and insert it into DB before computing sentiment
    :param batch_size: number of rows taken from the generator per processing batch
    :return:
    """
    from itertools import islice

    logging.info("starting task %s", __name__)
    conn = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )
    # Close the connection even when processing raises
    try:
        text_inserter = TextInserter(conn, TABLE_NAME_TEXTS, PRIMARY_KEY_NAME_TEXTS)
        sentiment_classifier = SentimentClassifier()
        text_preprocessor = TextPreprocessor()

        # Accept any iterable, not only an iterator
        rows = iter(data_generator)
        # A non-positive batch size yields an empty batch and ends the loop
        rows_per_batch = max(batch_size, 0)

        while True:
            # Both modes share this batching. Previously the process_text=False
            # path let StopIteration escape, which dropped the final partial
            # batch and aborted the run.
            current_batch = list(islice(rows, rows_per_batch))
            is_generator_exhausted = len(current_batch) < rows_per_batch
            if is_generator_exhausted:
                logging.info("generator %s exhausted, finishing", data_generator)
            if not current_batch:
                break

            # Progress bar only when texts are preprocessed, as before
            batch_rows = tqdm(current_batch) if process_text else current_batch
            for item_id, title, text in batch_rows:
                # Use "text" field if not empty, otherwise use title (stories don't have text)
                raw_text = title if is_string_empty(text) else text

                if process_text:
                    # Insert preprocessed text
                    text_preprocessed = text_preprocessor.process(raw_text)
                    text_inserter.insert_text(Text(item_id, text_preprocessed))

                # Use unprocessed text for sentiment computation
                sentiment = sentiment_classifier.get_sentiment(raw_text)
                text_inserter.insert_sentiment(sentiment, item_id)

            if is_generator_exhausted:
                break
    finally:
        conn.close_conn()

    logging.info("finished task: %s", __name__)
def __init__(self):
    """
    Open a connection to the initial database and build the creator for the target database
    """
    initial_connection = DBConnection(
        user="******",
        password=DB_PASSWORD,
        db_name=DB_NAME_INITIAL,
    )
    self.conn_initial = initial_connection
    self.dbcreate = DBCreator(conn=initial_connection, db_name=DB_NAME_HACKERNEWS)