示例#1
0
    def run(self):
        """
        Create the DB and then the items, users and texts tables.

        The database itself is created through ``self.dbcreate``. Afterwards a
        dedicated connection to ``DB_NAME_HACKERNEWS`` is opened, every table is
        created with the queries stored under its name in ``TABLES``, and the
        connection is closed again even when a table creation raises.
        :return:
        """
        self.dbcreate.create_db()
        # NOTE(review): self.conn_initial is not closed here (the close call was
        # disabled in the original code) - confirm whether it should be.

        conn_hackernews = DBConnection(
            user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
        )

        try:
            tables = [
                TableCreator(conn=conn_hackernews, table_name=name)
                for name in (TABLE_NAME_ITEMS, TABLE_NAME_USERS, TABLE_NAME_TEXTS)
            ]

            logging.info("creating all tables")
            for table in tables:
                # Look the name up once; it keys the query lookup and the log lines
                table_name = table.get_name()
                query_create_table = TABLES[table_name]["QUERY_CREATE_TABLE"]
                query_create_index = TABLES[table_name]["QUERY_CREATE_INDEX"]
                was_created = table.create_table(query_create_table, query_create_index)
                if was_created:
                    logging.info("table created: %s", table_name)
                else:
                    logging.info("table not created: %s", table_name)
        finally:
            # Release the connection on both the success and the failure path
            conn_hackernews.close_conn()
示例#2
0
 def __init__(self, conn: DBConnection, table_name: str,
              primary_key_name: str):
     """
     Store the connection handles and the table metadata on the instance.

     :param conn: DBConnection object; its connection and cursor are cached
     :param table_name: table name, stored as ``self.table_name``
     :param primary_key_name: primary key name, stored as ``self.primary_key_name``
     """
     # Keep the wrapper itself next to the handles obtained from it
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     # Turns autocommit on for the connection object returned by get_conn()
     self.conn.autocommit = True
     self.table_name = table_name
     self.primary_key_name = primary_key_name
示例#3
0
 def __init__(
     self,
     conn: DBConnection,
     table_name: str = TABLE_NAME_ITEMS,
     column_name: str = PRIMARY_KEY_NAME_ITEMS,
 ):
     """
     Store the connection handles, the table name and the ID column name.

     :param conn: DBConnection object; its connection and cursor are cached
     :param table_name: table name, defaults to ``TABLE_NAME_ITEMS``
     :param column_name: column name stored as ``self.col_name_id``,
         defaults to ``PRIMARY_KEY_NAME_ITEMS``
     """
     # Keep the wrapper itself next to the handles obtained from it
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     self.table_name = table_name
     self.col_name_id = column_name
示例#4
0
 def __init__(
     self,
     conn: DBConnection,
     table_name: str = TABLE_NAME_ITEMS,
     primary_key_name: str = PRIMARY_KEY_NAME_ITEMS,
 ):
     """
     Store the connection handles and the table metadata on the instance.

     :param conn: DBConnection object; its connection and cursor are cached
     :param table_name: table name, defaults to ``TABLE_NAME_ITEMS``
     :param primary_key_name: primary key name, defaults to
         ``PRIMARY_KEY_NAME_ITEMS``
     """
     # Keep the wrapper itself next to the handles obtained from it
     self.conn_obj = conn
     self.conn = conn.get_conn()
     self.cursor = conn.get_cursor()
     # Turns autocommit on for the connection object returned by get_conn()
     self.conn.autocommit = True
     self.table_name = table_name
     self.primary_key_name = primary_key_name
示例#5
0
def is_table_exists(conn: DBConnection, table_name: str) -> bool:
    """
    Check if a given table exists in the "public" schema of the DB
    :param conn: DBConnection object
    :param table_name: name of the table to check
    :return: True if table exists, False otherwise
    :raises Exception: if the lookup fails; the original error is chained as the cause
    """
    logging.info("checking if table %s exists", table_name)

    # The table name is a *value* compared against pg_tables.tablename, so it is
    # sent as a bound parameter instead of being formatted into the SQL text
    # (which broke on quotes and allowed SQL injection).
    query = """
        SELECT EXISTS (
            SELECT FROM pg_tables
            WHERE  schemaname = 'public'
            AND    tablename  = %s
        );
    """

    try:
        cursor = conn.get_cursor()
        cursor.execute(query, (table_name,))
        exists = cursor.fetchall()[0][0]
    except Exception as e:
        logging.error("exception: %s", e)
        raise Exception("Exception: {}".format(e)) from e

    if exists:
        logging.info("table %s exists, skipping", table_name)
    else:
        logging.info("table %s does not exist, creating", table_name)

    return exists
示例#6
0
def all_values_exist(conn: DBConnection, table_name: str, column_name: str,
                     values: tuple) -> bool:
    """
    Check if all provided values exist in a given table in a given column
    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param values: tuple of all values to check existence of; duplicates are
        ignored and an empty tuple is trivially satisfied
    :return: True if all values exist, False otherwise
    """
    # Nothing to look up: avoid min()/max() on an empty sequence and an
    # invalid "IN ()" clause.
    if not values:
        logging.info(
            "no values given to check in column %s in table %s",
            column_name,
            table_name,
        )
        return True

    logging.info(
        "checking if range of values [%s, %s] exists in column %s in table %s",
        min(values),
        max(values),
        column_name,
        table_name,
    )

    # Compare distinct values on both sides so that duplicates in the input or
    # in the column cannot skew the result.
    unique_values = tuple(set(values))

    query = "SELECT COUNT(DISTINCT {column}) FROM {table} WHERE {column} IN %s;"

    query_sql = sql.SQL(query).format(column=sql.Identifier(column_name),
                                      table=sql.Identifier(table_name))

    cursor = conn.get_cursor()
    # The whole tuple is ONE parameter (adapted to "(a, b, ...)" for IN), so it
    # has to be wrapped in a one-element parameter sequence.
    cursor.execute(query_sql, (unique_values,))
    res = cursor.fetchall()[0][0]

    # All values exist when every distinct input value was matched
    return res == len(unique_values)
示例#7
0
def is_value_exists(conn: DBConnection, table_name: str, column_name: str,
                    value: Any) -> bool:
    """
    Tell whether at least one row of a table holds a given value in a column
    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param value: value to look for
    :return: True if the value exists, False otherwise
    """
    logging.info("checking if value %s exists in column %s in table %s",
                 value, column_name, table_name)

    # Identifiers are quoted through psycopg's sql module; the value itself is
    # passed as a bound parameter.
    template = "SELECT EXISTS(SELECT 1 FROM {table} WHERE {column} = %s);"
    statement = sql.SQL(template).format(
        table=sql.Identifier(table_name),
        column=sql.Identifier(column_name),
    )

    db_cursor = conn.get_cursor()
    db_cursor.execute(statement, (value, ))
    found = db_cursor.fetchall()[0][0]

    outcome = ("check: value %s exists"
               if found else "check: value %s does not exist")
    logging.info(outcome, value)

    return found
示例#8
0
    def run(self):
        """
        Download every user listed in ``self.user_ids`` and insert it into the users table.

        Each ID is fetched through ``UserAPI.get_user``; IDs for which the API
        returns ``None`` are skipped. The DB connection opened here is closed
        again when the task finishes or fails.
        :return:
        """
        logging.info("starting luigi task: %s", self.__class__)
        conn = DBConnection(user="******",
                            password=DB_PASSWORD,
                            db_name=DB_NAME_HACKERNEWS)

        try:
            # NOTE(review): user_getter is not used below; it is still built
            # because its constructor may configure the shared connection -
            # confirm before removing.
            user_getter = UserGetter(conn, TABLE_NAME_USERS,
                                     PRIMARY_KEY_NAME_USERS)
            user_inserter = UserInserter(conn, TABLE_NAME_USERS,
                                         PRIMARY_KEY_NAME_USERS)
            user_api = UserAPI()

            for user_id in tqdm(self.user_ids):
                current_user = user_api.get_user(user_id=user_id)
                if current_user is not None:
                    user_inserter.insert_user(current_user)
        finally:
            # Do not leak the connection when a download or insert raises
            conn.close_conn()

        logging.info("finished task: %s", self.__class__)
示例#9
0
    def data_generator(conn: DBConnection):
        """
        Yield (id, title, text) rows of all items that have no row in the texts table yet.

        Rows are read through a named cursor in batches of ``args.batch_size``
        and are ordered by item id.
        :param conn: DBConnection object
        :return: generator of rows
        """
        cursor = conn.get_named_cursor("compute_sentiment_cursor")
        cursor.itersize = args.batch_size

        # Skip items whose id already appears as id_item in the texts table.
        # The outer column is qualified with the items table so that it cannot
        # bind to a column named "id" of the texts table inside the subquery.
        query = """
        SELECT id, title, text
        FROM {table_items}
        WHERE NOT EXISTS (
        SELECT
        FROM {table_texts}
        WHERE {table_items}.id = {table_texts}.id_item
        )
        ORDER BY id;
        """

        query_sql = sql.SQL(query).format(
            table_items=sql.Identifier(TABLE_NAME_ITEMS),
            table_texts=sql.Identifier(TABLE_NAME_TEXTS),
        )
        cursor.execute(query_sql)

        while True:
            rows = cursor.fetchmany(args.batch_size)
            if not rows:
                break
            yield from rows
示例#10
0
def main():
    """
    Stream the text column of the items table through the text preprocessor.

    The column values are read lazily, their count is queried separately, and
    both are handed to ``TextPreprocessor.process_multiprocessing`` together
    with the output filename and batch size taken from ``args``.
    :return:
    """
    db_conn = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS)

    # Lazy stream of the texts plus the count of values in the same column
    texts = get_column_values(
        db_conn, TABLE_NAME_ITEMS, TABLE_ITEMS["COLUMN_NAME_TEXT"]
    )
    text_count = get_value_count_in_column(
        db_conn, TABLE_NAME_ITEMS, TABLE_ITEMS["COLUMN_NAME_TEXT"]
    )

    # Hand everything over to the preprocessor
    preprocessor = TextPreprocessor()
    preprocessor.process_multiprocessing(
        texts, args.filename, text_count, args.batch_size
    )
示例#11
0
    def run(self):
        """
        Download every item listed in ``self.ids_to_download`` and insert it into the items table.

        Each ID is fetched through ``ItemAPI.get_item``; IDs for which the API
        returns ``None`` are skipped. The DB connection opened here is closed
        again when the task finishes or fails.
        :return:
        """
        logging.info("starting luigi task: %s", self.__class__)
        conn = DBConnection(
            user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
        )

        try:
            item_api = ItemAPI()
            item_inserter = ItemInserter(conn, TABLE_NAME_ITEMS, PRIMARY_KEY_NAME_ITEMS)

            for item_id in tqdm(self.ids_to_download):
                current_item = item_api.get_item(item_id=item_id)
                if current_item is not None:
                    item_inserter.insert_item(current_item)
        finally:
            # Do not leak the connection when a download or insert raises
            conn.close_conn()

        logging.info("finished task: %s", self.__class__)
示例#12
0
def get_value_count_in_column(conn: DBConnection, table_name: str,
                              column_name: str) -> int:
    """
    Return the result of ``COUNT(column)`` for a column of a table
    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :return: count reported by the database
    """
    logging.info("getting value count in column: %s, table: %s", column_name,
                 table_name)

    # Table and column names are quoted as identifiers by psycopg's sql module
    statement = sql.SQL("SELECT COUNT({column}) FROM {table}").format(
        column=sql.Identifier(column_name),
        table=sql.Identifier(table_name),
    )

    db_cursor = conn.get_cursor()
    db_cursor.execute(statement)
    first_row = db_cursor.fetchone()
    return first_row[0]
示例#13
0
def get_column_values(
        conn: DBConnection,
        table_name: str,
        column_name: str,
        limit: Optional[int] = None,
        fetch_size: int = 10000,
        cursor_name: Optional[str] = None,
) -> Optional[Iterable[Any]]:
    """
    Get all values in a given column in a given table
    :param conn: DBConnection object
    :param table_name: name of the table
    :param column_name: name of the column
    :param limit: maximum number of values to return; a falsy value
        (None or 0) means no limit
    :param fetch_size: number of rows to fetch in one batch
    :param cursor_name: optional name of the cursor; when omitted, a name is
        derived from the current timestamp on every call
    :return: generator of values (if any) in the column
    """
    logging.info("getting all values for column: %s", column_name)

    # The default name must be computed per call: a default argument is
    # evaluated only once, at definition time, which made every call reuse
    # the very same cursor name.
    if cursor_name is None:
        cursor_name = str(datetime.now())

    cursor = conn.get_named_cursor(str(cursor_name))
    cursor.itersize = fetch_size

    if limit:
        query = "SELECT {column} FROM {table} LIMIT %s;"
        params = [limit]
    else:
        query = "SELECT {column} FROM {table};"
        params = None

    query_sql = sql.SQL(query).format(column=sql.Identifier(column_name),
                                      table=sql.Identifier(table_name))
    cursor.execute(query_sql, params)

    while True:
        rows = cursor.fetchmany(fetch_size)
        if not rows:
            break
        for row in rows:
            # Single-column result: unwrap the value from its row
            yield row[0]
示例#14
0
 def __init__(self, conn: DBConnection, db_name: str):
     """
     Store the database name and the handles obtained from the connection.

     :param conn: DBConnection object; its cursor and connection are cached
     :param db_name: database name, stored as ``self.db_name``
     """
     self.db_name = db_name
     self.cursor = conn.get_cursor()
     self.conn = conn.get_conn()
示例#15
0
 def __init__(self, conn: DBConnection, table_name: str):
     """
     Store the table name and set up the connection handles.

     :param conn: DBConnection object; its connection is cached
     :param table_name: table name, stored as ``self.table_name``
     """
     self.table_name = table_name
     # Keep the wrapper itself next to the connection obtained from it
     self.conn_object = conn
     self.conn = conn.get_conn()
     # Turns autocommit on for the connection object returned by get_conn()
     self.conn.autocommit = True
     # The cursor is created from the connection directly, not via the wrapper
     self.cursor = self.conn.cursor()
示例#16
0
def main():
    """
    Set up the DB and tables, download items for a given item ID range, insert them into the DB

    IDs from ``args.startid`` to ``args.endid`` (both inclusive) that are not
    in the items table yet are split into one Luigi task per chunk. When
    ``args.download_users`` is "y", tasks are added for the user IDs that occur
    in the items table but not in the users table. The process exits with
    status 0 when there are no items to download.
    :return:
    """

    # Set up DB
    setup = Setup()
    setup.run()
    conn = DBConnection("postgres", DB_PASSWORD, DB_NAME_HACKERNEWS)

    task_list = []
    try:
        desired_ids = set(range(args.startid, args.endid + 1))

        # Get all distinct IDs (if any) from the DB. An empty ID range is not
        # queried at all, since min()/max() would fail on it.
        ids_in_db = set()
        if desired_ids:
            # Both bounds are inclusive, matching desired_ids (a strict upper
            # bound made the last ID always look missing).
            query = "SELECT DISTINCT {} FROM {} WHERE {} >= %s AND {} <= %s;"
            query_sql = sql.SQL(query).format(
                sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
                sql.Identifier(TABLE_NAME_ITEMS),
                sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
                sql.Identifier(PRIMARY_KEY_NAME_ITEMS),
            )
            cursor = conn.get_cursor()
            cursor.execute(query_sql, (min(desired_ids), max(desired_ids)))
            ids_in_db = {row[0] for row in cursor.fetchall()}

        item_ids_to_download = sorted(desired_ids - ids_in_db)

        # If no items to download, exit
        if not item_ids_to_download:
            logging.info("no items to download, exiting")
            raise SystemExit(0)

        # Split item id list into chunks for each worker
        chunk_size_items = int(ceil(len(item_ids_to_download) / args.workers))
        item_ids_to_download_chunks = chunk_for_size(item_ids_to_download,
                                                     chunk_size_items)
        logging.info("item ranges for jobs: %s", item_ids_to_download_chunks)

        # For each chunk, create a new Luigi task
        for chunk in item_ids_to_download_chunks:
            task_list.append(TaskDownloadItems(ids_to_download=chunk))

        # The worker count follows the number of item tasks only
        num_workers = len(task_list)

        # If asked to download users, add a task
        if args.download_users.lower() == "y":
            # Users referenced in the "items" table but missing from "users"
            user_getter = UserGetter(conn, TABLE_NAME_USERS,
                                     PRIMARY_KEY_NAME_USERS)
            user_ids_in_users_table = set(
                user_getter.get_all_user_ids(table_name=TABLE_NAME_USERS))
            user_ids_in_items_table = set(
                user_getter.get_all_user_ids(table_name=TABLE_NAME_ITEMS))
            user_ids_to_download = sorted(user_ids_in_items_table -
                                          user_ids_in_users_table)

            # Build user ranges to download for each task. Rounding up keeps
            # the chunk size >= 1, so users are still downloaded when there
            # are fewer of them than workers.
            if user_ids_to_download:
                chunk_size_users = int(
                    ceil(len(user_ids_to_download) / args.workers))
                ranges_users = chunk_for_size(user_ids_to_download,
                                              chunk_size_users)
                for range_users in ranges_users:
                    task_list.append(TaskDownloadUsers(user_ids=range_users))
    finally:
        # The planning connection is not needed by the Luigi tasks, which open
        # their own; close it on every path, including the early exit.
        conn.close_conn()

    luigi.build(
        task_list,
        workers=num_workers,
        local_scheduler=True,
    )
def run(
    data_generator,
    process_text: bool,
    batch_size: int = 100000,
):
    """
    Compute and insert sentiment scores (polarity, subjectivity) into DB for all existing items
    :param data_generator: an iterable where each element is a single row (id, title, text) from the DB
    :param process_text: whether to also preprocess the text and insert it into DB before computing sentiment
    :param batch_size: number of rows taken from the generator per batch
    :return:
    """
    logging.info("starting task %s", __name__)
    conn = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )

    try:
        text_inserter = TextInserter(conn, TABLE_NAME_TEXTS, PRIMARY_KEY_NAME_TEXTS)
        sentiment_classifier = SentimentClassifier()
        # The preprocessor is only needed when the texts are processed as well
        text_preprocessor = TextPreprocessor() if process_text else None

        # Both modes share one batching loop, so the final partial batch is
        # processed in either mode and exhaustion never escapes as StopIteration.
        for current_batch in _iter_batches(data_generator, batch_size):
            rows = tqdm(current_batch) if process_text else current_batch

            for item_id, title, text in rows:
                # Use the "text" field if not empty, otherwise the title (stories don't have text)
                raw_text = title if is_string_empty(text) else text

                if process_text:
                    # Insert preprocessed text
                    text_preprocessed = text_preprocessor.process(raw_text)
                    text_inserter.insert_text(Text(item_id, text_preprocessed))

                # Use unprocessed text for sentiment computation
                sentiment = sentiment_classifier.get_sentiment(raw_text)
                text_inserter.insert_sentiment(sentiment, item_id)

        logging.info("generator %s exhausted, finishing", data_generator)
    finally:
        # Release the connection on both the success and the failure path
        conn.close_conn()

    logging.info("finished task: %s", __name__)


def _iter_batches(rows, batch_size):
    """Yield successive lists of at most ``batch_size`` elements taken from ``rows``."""
    from itertools import islice

    iterator = iter(rows)
    # A non-positive batch size yields no batches at all
    size = max(batch_size, 0)
    while True:
        batch = list(islice(iterator, size))
        if not batch:
            return
        yield batch
示例#18
0
 def __init__(self):
     """
     Open the initial DB connection and prepare the creator of the HackerNews DB.
     """
     # Connection to DB_NAME_INITIAL; it is the connection handed to DBCreator
     self.conn_initial = DBConnection(
         user="******", password=DB_PASSWORD, db_name=DB_NAME_INITIAL
     )
     self.dbcreate = DBCreator(conn=self.conn_initial, db_name=DB_NAME_HACKERNEWS)