Example #1
def save_feature(self, feat_name, obs_field, target_field, dim, x, y):
    # Write one feature column to disk; for 1D features also log its
    # correlation with the target labels.
    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
    table_utils._write(
        os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
    if dim == 1:
        corr = np_utils._corr(x, y)
        self.logger.info("%s (%dD): corr=%.6f" % (fname, dim, corr))
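
A minimal usage sketch of the naming and logging done above, fully hedged: np.corrcoef stands in for np_utils._corr (assumed to be a Pearson correlation), and the feature/field names are invented for illustration.

import numpy as np

# Hypothetical 1D feature column and relevance labels, one value per row.
x = np.random.rand(1000)
y = np.random.randint(0, 4, 1000)
# The composed feature file name for feat_name='tfidf_cosine',
# obs_field='query', target_field='hit_title', dim=1:
fname = "%s_%s_x_%s_%dD" % ('tfidf_cosine', 'query', 'hit_title', 1)
print fname                              # tfidf_cosine_query_x_hit_title_1D
print "%s (1D): corr=%.6f" % (fname, np.corrcoef(x, y)[0, 1])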
Example #2
def main():
    # Use an inner join to ensure we only keep complete rows. Otherwise we
    # would have NaN mixed in with the relevance scores, and we can't train
    # on those.
    dfClicks = table_utils._read(config.CLICK_DATA)
    dfRel = table_utils._read(config.RELEVANCE_DATA)
    if config.REMOVE_WALL:
        dfRel = dfRel[(dfRel['relevance'] < 0.24) | (dfRel['relevance'] > 0.25)]

    dfAll = dfClicks \
            .join(dfRel.set_index(['norm_query', 'hit_title']),
                  on=['norm_query', 'hit_title'], how='inner')
    dfClicks_len = len(dfClicks)
    rows_w_rel = len(dfAll)
    del dfClicks
    del dfRel

    # Filter out pages that couldn't be loaded as docs/termvecs
    es_docs_keys = set(map(int, shelve_keys(config.ES_PAGE_DOCS_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_docs_keys)]

    es_termvec_keys = set(map(int, shelve_keys(config.ES_PAGE_TERM_VEC_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_termvec_keys)]

    # drop some unnecessary columns
    dfAll.drop(['session_id', 'clicked'], axis=1, inplace=True)

    # average out hit_position and hit_score to improve de-duplication.
    # on 10k queries this is a 90% reduction in data
    dfAll = dfAll.join(
        dfAll.groupby(['query', 'hit_page_id'])['hit_position'].mean(),
        on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll = dfAll.join(
        dfAll.groupby(['query', 'hit_page_id'])['hit_score'].mean(),
        on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll.drop(['hit_position', 'hit_score'], axis=1, inplace=True)

    # turn duplicates into a 'weight' column
    dfAll = dfAll.groupby(map(str, dfAll.columns)).size().reset_index()
    dfAll.rename(columns={0: 'weight'}, inplace=True)

    # Assign ids to all normalized queries, to be used to generate pairwise
    # preferences for the ranking learners
    norm_query_id_df = pd.DataFrame(list(enumerate(dfAll["norm_query"].unique())),
                                    columns=["norm_query_id", "norm_query"])
    dfAll = dfAll.join(norm_query_id_df.set_index("norm_query"), on="norm_query")
    # xgboost requires the data to be grouped by query id
    dfAll.sort_values("norm_query_id", inplace=True)

    table_utils._write(config.ALL_DATA, dfAll)

    dfInfo = dfAll[["relevance", "weight", "norm_query_id"]].copy()
    table_utils._write(config.INFO_DATA, dfInfo)

    print 'Source clicks len: %d' % (dfClicks_len)
    print 'Rows with relevance: %d' % (rows_w_rel)
    print 'Final data len: %d' % (len(dfAll))
    print 'Ratio: %.3f' % (float(len(dfAll))/dfClicks_len)
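
A short follow-up sketch, assuming table_utils._read is the counterpart of _write (it is used that way in the other examples here): load ALL_DATA back and derive the contiguous per-query group sizes that xgboost's ranking objective needs, which is why the frame is sorted by norm_query_id before writing.

import config
import table_utils

dfAll = table_utils._read(config.ALL_DATA)
# Rows for each query are contiguous after the sort, so the run lengths can
# be handed to xgboost as group sizes.
group_sizes = dfAll.groupby('norm_query_id', sort=False).size().tolist()
print 'query groups: %d, rows: %d' % (len(group_sizes), len(dfAll))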
Example #3
def save(self):
    # Bundle the combined feature matrix and its metadata into a single dict
    # and write it out alongside the other feature files.
    data_dict = {
        "X": self.X,
        "y": self.y,
        "weights": self.weights,
        "query_ids": self.query_ids,
        "feature_names": self.feature_names,
    }
    fname = os.path.join(config.FEAT_DIR, 'Combine',
                         self.feature_name + config.FEAT_FILE_SUFFIX)
    table_utils._write(fname, data_dict)
    self.logger.info("Save to %s" % (fname))
Example #4
def main():
    df = table_utils._read(config.CLICK_DATA)
    # TODO: using only session_id might be more efficient, but (query, session_id)
    # is easier to debug
    grouped = df.groupby(['norm_query', 'session_id'])
    clickmodel_data_file = os.path.join(config.TMP_DIR, 'clickmodel.txt')
    pos = 0
    with codecs.open(clickmodel_data_file, 'w', 'utf8') as f:
        with progressbar.ProgressBar() as bar:
            for (norm_query, session_id), group in bar(grouped):
                assert "\t" not in norm_query
                assert isinstance(norm_query, unicode)
                results = []
                clicks = []
                # TODO: groupby still necessary? check all group lengths, they might be 1
                # after adjustments to the source hive query
                for title, rows in group \
                        .sort_values(['hit_score'], ascending=False) \
                        .groupby(['hit_title'], sort=False):
                    # TODO: using page id instead of title might be more efficient,
                    # but title is easier for debugging
                    results.append(title)
                    clicks.append(True in list(rows['clicked']))
                    if len(results) >= config.DBN_CONFIG['MAX_DOCS_PER_QUERY']:
                        break
                # exclude groups with no clicks
                if not any(clicks):
                    continue
                # exclude too small result sets as well
                if len(results) < config.DBN_CONFIG['MIN_DOCS_PER_QUERY']:
                    continue
                f.write("\t".join([
                    str(pos),  # hash digest
                    norm_query,  # query
                    '0',  # region
                    '0',  # intent weight
                    json.dumps(results),  # urls
                    json.dumps([False] * len(results)),  # layout
                    json.dumps(clicks)  # clicks
                ]) + "\n")
                pos += 1

    del df
    with codecs.open(clickmodel_data_file, 'r', 'utf8') as f:
        reader = InputReader(config.DBN_CONFIG['MIN_DOCS_PER_QUERY'],
                             config.DBN_CONFIG['MAX_DOCS_PER_QUERY'],
                             False,
                             config.DBN_CONFIG['SERP_SIZE'],
                             False,
                             discard_no_clicks=True)
        sessions = reader(f)
        dbn_config = config.DBN_CONFIG.copy()
        dbn_config['MAX_QUERY_ID'] = reader.current_query_id + 1
        model = DbnModel((0.9, 0.9, 0.9, 0.9), config=dbn_config)
        model.train(sessions)

        f.seek(0)
        results = []
        # This is a bit ugly and hackish, but it avoids blowing up memory by
        # inverting the giant url_to_id and query_to_id dicts.
        seen = set()
        # hax
        with progressbar.ProgressBar(max_value=pos) as bar:
            pos = 0
            for line in f:
                bar.update(pos)
                pos += 1

                _, norm_query, _, _, titles, _, clicks = \
                    line.rstrip().split('\t')
                titles = json.loads(titles)
                if len(titles) < dbn_config['MIN_DOCS_PER_QUERY']:
                    continue
                query_id = reader.query_to_id[(norm_query, "0")]
                title_ids = [reader.url_to_id[t] for t in titles]
                session = SessionItem(0, query_id, title_ids, 0, [], {})
                relevances = model.get_model_relevances(session)
                for title, relevance in zip(titles, relevances):
                    if (norm_query, title) in seen:
                        continue
                    results.append([norm_query, title, relevance])
                    # alternately could use drop_duplicates; not sure which
                    # is cheaper on memory usage
                    seen.add((norm_query, title))
        df = pd.DataFrame(results,
                          columns=['norm_query', 'hit_title', 'relevance'])
        print 'Hits with relevance: %d' % (len(results))
        table_utils._write(config.RELEVANCE_DATA, df)
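
The intermediate clickmodel.txt format used above is just seven tab-separated fields per search session. A minimal sketch of one such line and of splitting it back apart, mirroring the write and read code above; the query and titles are invented for illustration.

import json

# Fields: hash digest, query, region, intent weight, urls, layout, clicks
line = "\t".join([
    '0',                                   # hash digest (here: running position)
    u'example query',                      # normalized query
    '0',                                   # region
    '0',                                   # intent weight
    json.dumps([u'Title A', u'Title B']),  # result titles stand in for urls
    json.dumps([False, False]),            # layout
    json.dumps([True, False]),             # clicks
]) + "\n"

_, norm_query, _, _, titles, _, clicks = line.rstrip().split('\t')
print norm_query, json.loads(titles), json.loads(clicks)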
Example #5
def main():
    # Fetch input data from hive.
    # Attempting to select all clicks for a limited number of queries makes
    # this rather complicated... but it is still simpler than doing the top
    # few sub-queries in pandas.

    sql = """
SELECT
    z.query AS query,
    z.norm_query AS norm_query,
    z.session_id AS session_id,
    hit.pageid AS hit_page_id,
    hit.title AS hit_title,
    -- Some hits can have the same score, so this isn't necessarily the order we displayed;
    -- we should add the real hit position to CirrusSearchRequestSet, or find the right hive solution
    ROW_NUMBER() OVER(PARTITION BY z.query, z.session_id ORDER BY -AVG(hit.score)) AS hit_position,
    -- If the user performed the query multiple times (forward, read, back navigation) the
    -- scores could be slightly different; just take a reasonable one, they are probably
    -- close anyway.
    AVG(hit.score) AS hit_score,
    -- The group contains multiple searches (forward, read, back navigation, or pagination),
    -- so this collects the potentially multiple clicks and checks if any of them were this hit
    ARRAY_CONTAINS(COLLECT_LIST(z.click_page_id), hit.pageid) AS clicked
FROM (
    SELECT
        top_query_clicks.query,
        top_query_clicks.norm_query,
        top_query_clicks.click_page_id,
        top_query_clicks.search_timestamp,
        top_query_clicks.click_timestamp,
        top_query_clicks.hits,
        top_query_clicks.session_id
    FROM
        ebernhardson.top_query_clicks
    JOIN (
            -- Randomly select N unique normalized queries
        SELECT
            x.project, x.norm_query
        FROM (
            SELECT
                project,
                norm_query,
                count(distinct year, month, day, session_id) as num_searches
            FROM
                ebernhardson.top_query_clicks
            WHERE
                year = 2016
                AND project = '%s'
            GROUP BY
                project,
                norm_query
            ) x
        WHERE
            x.num_searches >= %d
        DISTRIBUTE BY
            rand()
        SORT BY
            rand()
        LIMIT
            %d
        ) y
    ON
        y.norm_query = top_query_clicks.norm_query
        AND y.project = top_query_clicks.project
    WHERE
        year = 2016
    ) z
LATERAL VIEW
    -- ideally we want to know the order within hits, as this is the display
    -- order, but how?
    EXPLODE(z.hits) h AS hit
GROUP BY
    z.query,
    z.norm_query,
    z.session_id,
    hit.pageid,
    hit.title
;
""" % (config.WIKI_PROJECT, config.MIN_NUM_SEARCHES, config.MAX_QUERIES)

    if not os.path.isfile(config.CLICK_DATA_TSV):
        with tempfile.TemporaryFile() as tmp:
            command = []
            if False:
                command += [
                    'ssh', '-o', 'Compression=yes', 'stat1002.eqiad.wmnet'
                ]
            command += ['hive', '-S', '-e', '"' + sql + '"']
            p = subprocess.Popen(command,
                                 stdin=None,
                                 stdout=tmp,
                                 stderr=subprocess.PIPE)
            _, stderr = p.communicate(input=sql)
            if not os.fstat(tmp.fileno()).st_size > 0:
                print stderr
                raise Exception
            tmp.seek(0)
            with codecs.open(config.CLICK_DATA_TSV, 'w', 'utf-8') as f:
                for line in tmp:
                    try:
                        f.write(line.decode('utf-8'))
                    except UnicodeDecodeError:
                        pass

    # Read the tsv into pandas
    df = pd.read_csv(config.CLICK_DATA_TSV,
                     sep="\t",
                     index_col=False,
                     encoding="utf-8")

    # and write it back out as hdf5
    table_utils._write(config.CLICK_DATA, df)
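
As a quick sanity check, and once more assuming table_utils._read is the inverse of _write, the HDF5 copy can be read back and the columns produced by the hive query confirmed.

import config
import table_utils

df = table_utils._read(config.CLICK_DATA)
expected = ['query', 'norm_query', 'session_id', 'hit_page_id',
            'hit_title', 'hit_position', 'hit_score', 'clicked']
assert set(expected) <= set(df.columns)
print 'click rows: %d' % (len(df))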