예제 #1
0
 def __init__(self, dbm, session_duration, stopwords_file=None):
     """Keep the collaborators and build the query extractor.

     @param dbm a DatabaseManager
     @param session_duration valid session duration in seconds
     @param stopwords_file optional; when truthy it is handed to
            load_stop_words and the result is given to QueryExtractor,
            otherwise QueryExtractor receives None
     """
     # Only load a stop-word list when a file was actually supplied.
     words = load_stop_words(stopwords_file) if stopwords_file else None
     self.query_extractor = QueryExtractor(words)
     self.session_duration = session_duration
     self.dbm = dbm
예제 #2
0
 def __init__(self, dbm, session_duration, stopwords_file=None):
     """Remember the database manager and session length, then create
     the query extractor.

     @param dbm a DatabaseManager
     @param session_duration valid session duration in seconds
     @param stopwords_file optional; a truthy value is passed to
            load_stop_words, a falsy one means QueryExtractor gets None
     """
     self.dbm, self.session_duration = dbm, session_duration
     if stopwords_file:
         extractor = QueryExtractor(load_stop_words(stopwords_file))
     else:
         # No stop-word file supplied: build the extractor without one.
         extractor = QueryExtractor(None)
     self.query_extractor = extractor
예제 #3
0
class RawDataPreprocessor(object):
    """Populate query, product, query_product tables.

    Make sure to source tables.sql before using this class.
    """

    def __init__(self, dbm, session_duration, stopwords_file=None):
        """
        @param dbm a DatabaseManager
        @param session_duration valid session duration in seconds
        @param stopwords_file optional; when truthy it is passed to
               load_stop_words and the result is given to QueryExtractor
        """
        self.dbm = dbm
        self.session_duration = session_duration
        stopwords = None
        if stopwords_file:
            stopwords = load_stop_words(stopwords_file)
        self.query_extractor = QueryExtractor(stopwords)

    def run(self):
        """Rebuild the query, product and query_product tables, in that order.

        The order matters: the last step reads self.query_count and
        self.product_count, which the first two steps set.
        """
        self.__populate_query_table()
        self.__populate_product_table()
        self.__populate_query_product_table()

    @staticmethod
    def _percentage(part, whole):
        """Return 100 * part / whole as a float, or 0.0 when whole is falsy.

        Keeps the statistics output from raising ZeroDivisionError when a
        table turns out to be empty.
        """
        if not whole:
            return 0.0
        return 100.0 * part / whole

    @timeit
    def __populate_query_table(self):
        """Refill the query table from users that arrived with a referrer.

        A user is discarded (counted as invalid) when the earliest visit time
        differs from user.time, when there is no 'product' visit, or when the
        INSERT fails.  Users whose referrer yields no query are skipped
        without being counted.  Sets self.query_count and prints statistics.
        """
        # NOTE(review): on MySQL, TRUNCATE TABLE causes an implicit commit, so
        # it would not be covered by begin()/commit() - confirm for the target DB.
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE query')

        # get all user with non-empty referrer
        # take care! besides NULL, there's 'null'
        invalid = 0
        for row in self.dbm.get_rows(
                "SELECT userid, refer FROM user WHERE refer IS NOT NULL AND refer != '' AND refer != 'null'"
        ):
            # The alias is written as U everywhere (the original mixed u/U) so
            # the statement also works where table aliases are case-sensitive.
            time_row = self.dbm.get_one_row(
                'SELECT vtime - utime AS diff FROM (SELECT MIN(V.time) AS vtime, U.time AS utime FROM visit V JOIN user U ON V.userid = U.userid WHERE U.userid = %s GROUP BY V.userid) T',
                (row['userid'], ))
            if time_row is None or time_row['diff'] != 0:
                # data corrupted: first visit's time is inconsistent in user and visit tables
                # actually we can allow 0 <= diff < THRESHOLD (say 24h), but let's be simple here
                invalid += 1
                continue

            visit_count = self.dbm.get_value(
                "SELECT COUNT(id) visit_count FROM visit WHERE userid = %s AND pagetype = 'product'",
                (row['userid'], ))
            if visit_count == 0:
                # the user needs to visit at least one product page to be valid
                invalid += 1
                continue

            query = self.query_extractor.extract_query(row['refer'],
                                                       escaped=True)
            # avoid cases when we cannot extract a meaningful query
            if query:
                try:
                    self.dbm.query(
                        "INSERT INTO query (user_id, query) VALUES (%s, %s)",
                        (row['userid'], query))
                except Exception:
                    # prevent corrupted unicode string; Exception (not a bare
                    # except) so KeyboardInterrupt/SystemExit still propagate
                    invalid += 1
                    print('Corrupted string %s' % (query, ))

        self.dbm.commit()

        self.query_count = self.dbm.get_value('select COUNT(*) FROM query')
        print('Query table statistics (inserted/discarded): %d/%d (%.2f%%)' % (
            self.query_count, invalid,
            self._percentage(self.query_count, self.query_count + invalid)))

    @timeit
    def __populate_product_table(self):
        """Refill the product table with the distinct 'product' page names.

        Sets self.product_count and prints it against the row count of the
        separate `products` table.
        """
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE product')
        self.dbm.query(
            "INSERT INTO product (name) SELECT DISTINCT(pageinfo) FROM visit WHERE pagetype = 'product'"
        )
        self.dbm.commit()

        self.product_count = self.dbm.get_value('select COUNT(*) FROM product')
        max_count = self.dbm.get_value('select COUNT(*) FROM products')
        print('\nProduct table statistics (inserted/maximum): %d/%d (%.2f%%)' % (
            self.product_count, max_count,
            self._percentage(self.product_count, max_count)))

    @timeit
    def __populate_query_product_table(self):
        """Refill query_product with the product pages each query's user
        visited up to the end of the first session.

        Relies on self.query_count and self.product_count having been set by
        the two preceding populate steps (see run()).
        """
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE query_product')
        for qrow in self.dbm.get_rows("SELECT id, user_id FROM query"):
            sequence = 1
            # in __populate_query_table, we guarantee user.time is the same as MIN(visit.time)
            start_time = self.dbm.get_value(
                "SELECT time FROM user WHERE userid = %s", (qrow['user_id'], ))
            session_end_time = self.get_session_end(start_time)
            for vrow in self.dbm.get_rows(
                    "SELECT pageinfo, pagetype FROM visit WHERE userid = %s AND time <= %s ORDER BY time ASC",
                (qrow['user_id'], session_end_time)):
                # only consider product visits, but don't filter out others in SQL
                # to get the actual sequence
                if vrow['pagetype'] == 'product':
                    order_count = self.dbm.get_value(
                        "SELECT COUNT(id) order_count FROM orderrecord WHERE userid = %s AND item = %s",
                        (qrow['user_id'], vrow['pageinfo']))
                    # the bought field can actually be updated with one query
                    # NOTE(review): purchased items are stored as 2, not 1 -
                    # confirm this matches what readers of the column expect.
                    bought = 2 if order_count > 0 else 0
                    self.dbm.insert(
                        "INSERT INTO query_product (query_id, product_name, bought, sequence) VALUES (%s, %s, %s, %s)",
                        (qrow['id'], vrow['pageinfo'], bought, sequence))
                # counts every visit, product page or not
                sequence += 1

        self.dbm.commit()

        count = self.dbm.get_value('select COUNT(*) FROM query_product')
        max_count = self.query_count * self.product_count
        print('\nQuery-Product table statistics (inserted/maximum): %d/%d (%.2f%%)' % (
            count, max_count, self._percentage(count, max_count)))

    def get_session_end(self, start_time):
        """Get session end time.

        @param start_time string of timestamp in milliseconds
        @return string of the same format
        """
        start_ts = int(start_time)
        # session_duration is in seconds, timestamps are in milliseconds
        end_ts = start_ts + self.session_duration * 1000
        return str(end_ts)
예제 #4
0
class RawDataPreprocessor(object):
    """Populate query, product, query_product tables.

    Make sure to source tables.sql before using this class.
    """

    def __init__(self, dbm, session_duration, stopwords_file=None):
        """
        @param dbm a DatabaseManager
        @param session_duration valid session duration in seconds
        @param stopwords_file optional; when truthy it is passed to
               load_stop_words and the result is given to QueryExtractor
        """
        self.dbm = dbm
        self.session_duration = session_duration
        stopwords = None
        if stopwords_file:
            stopwords = load_stop_words(stopwords_file)
        self.query_extractor = QueryExtractor(stopwords)

    def run(self):
        """Rebuild the query, product and query_product tables, in that order.

        The order matters: the last step reads self.query_count and
        self.product_count, which the first two steps set.
        """
        self.__populate_query_table()
        self.__populate_product_table()
        self.__populate_query_product_table()

    @staticmethod
    def _percentage(part, whole):
        """Return 100 * part / whole as a float, or 0.0 when whole is falsy.

        Keeps the statistics output from raising ZeroDivisionError when a
        table turns out to be empty.
        """
        if not whole:
            return 0.0
        return 100.0 * part / whole

    @timeit
    def __populate_query_table(self):
        """Refill the query table from users that arrived with a referrer.

        A user is discarded (counted as invalid) when the earliest visit time
        differs from user.time, when there is no 'product' visit, or when the
        INSERT fails.  Users whose referrer yields no query are skipped
        without being counted.  Sets self.query_count and prints statistics.
        """
        # NOTE(review): on MySQL, TRUNCATE TABLE causes an implicit commit, so
        # it would not be covered by begin()/commit() - confirm for the target DB.
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE query')

        # get all user with non-empty referrer
        # take care! besides NULL, there's 'null'
        invalid = 0
        for row in self.dbm.get_rows("SELECT userid, refer FROM user WHERE refer IS NOT NULL AND refer != '' AND refer != 'null'"):
            # The alias is written as U everywhere (the original mixed u/U) so
            # the statement also works where table aliases are case-sensitive.
            time_row = self.dbm.get_one_row('SELECT vtime - utime AS diff FROM (SELECT MIN(V.time) AS vtime, U.time AS utime FROM visit V JOIN user U ON V.userid = U.userid WHERE U.userid = %s GROUP BY V.userid) T', (row['userid'],))
            if time_row is None or time_row['diff'] != 0:
                # data corrupted: first visit's time is inconsistent in user and visit tables
                # actually we can allow 0 <= diff < THRESHOLD (say 24h), but let's be simple here
                invalid += 1
                continue

            visit_count = self.dbm.get_value("SELECT COUNT(id) visit_count FROM visit WHERE userid = %s AND pagetype = 'product'", (row['userid'],))
            if visit_count == 0:
                # the user needs to visit at least one product page to be valid
                invalid += 1
                continue

            query = self.query_extractor.extract_query(row['refer'], escaped=True)
            # avoid cases when we cannot extract a meaningful query
            if query:
                try:
                    self.dbm.query("INSERT INTO query (user_id, query) VALUES (%s, %s)", (row['userid'], query))
                except Exception:
                    # prevent corrupted unicode string; Exception (not a bare
                    # except) so KeyboardInterrupt/SystemExit still propagate
                    invalid += 1
                    print('Corrupted string %s' % (query,))

        self.dbm.commit()

        self.query_count = self.dbm.get_value('select COUNT(*) FROM query')
        print('Query table statistics (inserted/discarded): %d/%d (%.2f%%)' % (self.query_count, invalid, self._percentage(self.query_count, self.query_count + invalid)))

    @timeit
    def __populate_product_table(self):
        """Refill the product table with the distinct 'product' page names.

        Sets self.product_count and prints it against the row count of the
        separate `products` table.
        """
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE product')
        self.dbm.query("INSERT INTO product (name) SELECT DISTINCT(pageinfo) FROM visit WHERE pagetype = 'product'")
        self.dbm.commit()

        self.product_count = self.dbm.get_value('select COUNT(*) FROM product')
        max_count = self.dbm.get_value('select COUNT(*) FROM products')
        print('\nProduct table statistics (inserted/maximum): %d/%d (%.2f%%)' % (self.product_count, max_count, self._percentage(self.product_count, max_count)))

    @timeit
    def __populate_query_product_table(self):
        """Refill query_product with the product pages each query's user
        visited up to the end of the first session.

        Relies on self.query_count and self.product_count having been set by
        the two preceding populate steps (see run()).
        """
        self.dbm.begin()
        self.dbm.query('TRUNCATE TABLE query_product')
        for qrow in self.dbm.get_rows("SELECT id, user_id FROM query"):
            sequence = 1
            # in __populate_query_table, we guarantee user.time is the same as MIN(visit.time)
            start_time = self.dbm.get_value("SELECT time FROM user WHERE userid = %s", (qrow['user_id'],))
            session_end_time = self.get_session_end(start_time)
            for vrow in self.dbm.get_rows("SELECT pageinfo, pagetype FROM visit WHERE userid = %s AND time <= %s ORDER BY time ASC", (qrow['user_id'], session_end_time)):
                # only consider product visits, but don't filter out others in SQL
                # to get the actual sequence
                if vrow['pagetype'] == 'product':
                    order_count = self.dbm.get_value("SELECT COUNT(id) order_count FROM orderrecord WHERE userid = %s AND item = %s", (qrow['user_id'], vrow['pageinfo']))
                    # the bought field can actually be updated with one query
                    # NOTE(review): purchased items are stored as 2, not 1 -
                    # confirm this matches what readers of the column expect.
                    bought = 2 if order_count > 0 else 0
                    self.dbm.insert("INSERT INTO query_product (query_id, product_name, bought, sequence) VALUES (%s, %s, %s, %s)", (qrow['id'], vrow['pageinfo'], bought, sequence))
                # counts every visit, product page or not
                sequence += 1

        self.dbm.commit()

        count = self.dbm.get_value('select COUNT(*) FROM query_product')
        max_count = self.query_count * self.product_count
        print('\nQuery-Product table statistics (inserted/maximum): %d/%d (%.2f%%)' % (count, max_count, self._percentage(count, max_count)))

    def get_session_end(self, start_time):
        """Get session end time.

        @param start_time string of timestamp in milliseconds
        @return string of the same format
        """
        start_ts = int(start_time)
        # session_duration is in seconds, timestamps are in milliseconds
        end_ts = start_ts + self.session_duration * 1000
        return str(end_ts)