def save_feature(self, feat_name, obs_field, target_field, dim, x, y):
    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
    table_utils._write(
        os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
    if dim == 1:
        corr = np_utils._corr(x, y)
        self.logger.info("%s (%dD): corr=%.6f" % (fname, dim, corr))
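
# The correlation logged above is only a rough sanity signal for 1D features.
# A minimal sketch of such a helper follows, assuming np_utils._corr is a
# Pearson correlation guarded against constant or NaN input; the name
# _corr_sketch and its body are illustrative, not the actual np_utils code.
def _corr_sketch(x, y):
    import numpy as np
    from scipy.stats import pearsonr
    # a constant feature has no defined correlation; report 0 instead of NaN
    if np.std(x) == 0 or np.std(y) == 0:
        return 0.0
    corr = pearsonr(np.asarray(x).flatten(), np.asarray(y))[0]
    return 0.0 if np.isnan(corr) else corr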
def main():
    # uses inner to ensure we have complete rows. Otherwise
    # we would have NaN mixed in with the relevance scores,
    # and we can't train on those
    dfClicks = table_utils._read(config.CLICK_DATA)
    dfRel = table_utils._read(config.RELEVANCE_DATA)
    if config.REMOVE_WALL:
        dfRel = dfRel[(dfRel['relevance'] < 0.24) | (dfRel['relevance'] > 0.25)]
    dfAll = dfClicks \
        .join(dfRel.set_index(['norm_query', 'hit_title']),
              on=['norm_query', 'hit_title'], how='inner')
    dfClicks_len = len(dfClicks)
    rows_w_rel = len(dfAll)
    del dfClicks
    del dfRel

    # Filter out pages that couldn't be loaded as docs/termvecs
    es_docs_keys = set(map(int, shelve_keys(config.ES_PAGE_DOCS_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_docs_keys)]
    es_termvec_keys = set(map(int, shelve_keys(config.ES_PAGE_TERM_VEC_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_termvec_keys)]

    # drop some unnecessary columns
    dfAll.drop(['session_id', 'clicked'], axis=1, inplace=True)

    # average out hit_position and hit_score to improve de-duplication.
    # on 10k queries this is a 90% reduction in data
    dfAll = dfAll.join(dfAll.groupby(['query', 'hit_page_id'])['hit_position'].mean(),
                       on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll = dfAll.join(dfAll.groupby(['query', 'hit_page_id'])['hit_score'].mean(),
                       on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll.drop(['hit_position', 'hit_score'], axis=1, inplace=True)

    # turn duplicates into a 'weight' column
    dfAll = dfAll.groupby(map(str, dfAll.columns)).size().reset_index()
    dfAll.rename(columns={0: 'weight'}, inplace=True)

    # Assign ids to all normalized queries, to be used to generate pairwise
    # preferences for the ranking learners
    norm_query_id_df = pd.DataFrame(list(enumerate(dfAll["norm_query"].unique())),
                                    columns=["norm_query_id", "norm_query"])
    dfAll = dfAll.join(norm_query_id_df.set_index("norm_query"), on="norm_query")

    # xgboost requires the data to be in groups by id
    dfAll.sort_values("norm_query_id", inplace=True)

    table_utils._write(config.ALL_DATA, dfAll)
    dfInfo = dfAll[["relevance", "weight", "norm_query_id"]].copy()
    table_utils._write(config.INFO_DATA, dfInfo)

    print 'Source clicks len: %d' % (dfClicks_len)
    print 'Rows with relevance: %d' % (rows_w_rel)
    print 'Final data len: %d' % (len(dfAll))
    print 'Ratio: %.3f' % (float(len(dfAll)) / dfClicks_len)
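
# The sort by norm_query_id above is what lets a ranking learner see each
# normalized query as one contiguous group. A minimal sketch of turning that
# column into the group sizes xgboost's DMatrix.set_group() expects; the
# helper name is illustrative and it assumes the ids are already sorted, as
# done in main():
def _query_group_sizes(norm_query_ids):
    import itertools
    # count consecutive runs of the same id, preserving order
    return [len(list(run)) for _, run in itertools.groupby(norm_query_ids)]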
def save(self):
    data_dict = {
        "X": self.X,
        "y": self.y,
        "weights": self.weights,
        "query_ids": self.query_ids,
        "feature_names": self.feature_names,
    }
    fname = os.path.join(config.FEAT_DIR, 'Combine',
                         self.feature_name + config.FEAT_FILE_SUFFIX)
    table_utils._write(fname, data_dict)
    self.logger.info("Save to %s" % (fname))
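
# A minimal sketch of consuming the saved dictionary for training, assuming
# table_utils._read is the inverse of _write and that rows were sorted by
# query id before combining; _load_for_xgboost is an illustrative helper,
# not part of this module:
def _load_for_xgboost(feature_name):
    import itertools
    import xgboost as xgb
    fname = os.path.join(config.FEAT_DIR, 'Combine',
                         feature_name + config.FEAT_FILE_SUFFIX)
    data = table_utils._read(fname)
    dmat = xgb.DMatrix(data["X"], label=data["y"], weight=data["weights"])
    # collapse consecutive identical query ids into per-query group sizes
    groups = [len(list(run)) for _, run in itertools.groupby(data["query_ids"])]
    dmat.set_group(groups)
    return dmat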
def main():
    df = table_utils._read(config.CLICK_DATA)

    # TODO: using only session_id might be more efficient, but (query, session_id)
    # is more obvious to debug
    grouped = df.groupby(['norm_query', 'session_id'])

    clickmodel_data_file = os.path.join(config.TMP_DIR, 'clickmodel.txt')
    pos = 0
    with codecs.open(clickmodel_data_file, 'w', 'utf8') as f:
        with progressbar.ProgressBar() as bar:
            for (norm_query, session_id), group in bar(grouped):
                assert "\t" not in norm_query
                assert type(norm_query) == unicode
                results = []
                clicks = []
                # TODO: groupby still necessary? check all group lengths, they might be 1
                # after adjustments to the source hive query
                for title, rows in group \
                        .sort_values(['hit_score'], ascending=False) \
                        .groupby(['hit_title'], sort=False):
                    # TODO: using page id instead of title might be more efficient,
                    # but title is easier for debugging
                    results.append(title)
                    clicks.append(True in list(rows['clicked']))
                    if len(results) >= config.DBN_CONFIG['MAX_DOCS_PER_QUERY']:
                        break
                # exclude groups with no clicks
                if not any(clicks):
                    continue
                # exclude too small result sets as well
                if len(results) < config.DBN_CONFIG['MIN_DOCS_PER_QUERY']:
                    continue
                f.write("\t".join([
                    str(pos),  # hash digest
                    norm_query,  # query
                    '0',  # region
                    '0',  # intent weight
                    json.dumps(results),  # urls
                    json.dumps([False] * len(results)),  # layout
                    json.dumps(clicks)  # clicks
                ]) + "\n")
                pos += 1
    del df

    with codecs.open(clickmodel_data_file, 'r', 'utf8') as f:
        reader = InputReader(config.DBN_CONFIG['MIN_DOCS_PER_QUERY'],
                             config.DBN_CONFIG['MAX_DOCS_PER_QUERY'],
                             False,
                             config.DBN_CONFIG['SERP_SIZE'],
                             False,
                             discard_no_clicks=True)
        sessions = reader(f)
        dbn_config = config.DBN_CONFIG.copy()
        dbn_config['MAX_QUERY_ID'] = reader.current_query_id + 1
        model = DbnModel((0.9, 0.9, 0.9, 0.9), config=dbn_config)
        model.train(sessions)

        f.seek(0)
        results = []
        # This is a bit ugly and hackish ... but trying to not explode memory
        # by flipping the giant url_to_id and query_to_id dicts.
        seen = set()  # hax
        with progressbar.ProgressBar(max_value=pos) as bar:
            pos = 0
            for line in f:
                bar.update(pos)
                pos += 1
                _, norm_query, _, _, titles, _, clicks = line.rstrip().split('\t')
                titles = json.loads(titles)
                if len(titles) < dbn_config['MIN_DOCS_PER_QUERY']:
                    continue
                query_id = reader.query_to_id[(norm_query, "0")]
                title_ids = [reader.url_to_id[t] for t in titles]
                session = SessionItem(0, query_id, title_ids, 0, [], {})
                relevances = model.get_model_relevances(session)
                for title, relevance in zip(titles, relevances):
                    if (norm_query, title) in seen:
                        continue
                    results.append([norm_query, title, relevance])
                    # alternately could use drop_duplicates, not sure which
                    # is cheaper on memory usage
                    seen.add((norm_query, title))

    df = pd.DataFrame(results, columns=['norm_query', 'hit_title', 'relevance'])
    print 'Hits with relevance: %d' % (len(results))
    table_utils._write(config.RELEVANCE_DATA, df)
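
# For reference, each line written to clickmodel.txt above follows the
# tab-separated layout the clickmodels InputReader consumes, with the fields
# noted inline in the f.write() call (hash digest, query, region, intent
# weight, urls, layout, clicks). A made-up example row, with tabs shown
# as '\t':
#
#   17\tsome normalized query\t0\t0\t["Title A", "Title B"]\t[false, false]\t[true, false]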
def main():
    # Fetch input data from hive
    # Attempting to select all clicks for a limited number of queries makes
    # this rather complicated...but simpler than doing the top few sub-queries
    # in pandas
    sql = """
SELECT
    z.query AS query,
    z.norm_query AS norm_query,
    z.session_id AS session_id,
    hit.pageid AS hit_page_id,
    hit.title AS hit_title,
    -- Some things can have the same score, so this isn't the same order we displayed; should
    -- add to CirrusSearchRequestSet to know the real hit position, or find the right hive solution
    ROW_NUMBER() OVER(PARTITION BY z.query, z.session_id ORDER BY -AVG(hit.score)) AS hit_position,
    -- If the user performed the query multiple times (forward, read, back navigation) the
    -- scores could be slightly different; just take a reasonable one, they are probably
    -- close anyways.
    AVG(hit.score) AS hit_score,
    -- group contains multiple searches (forward, read, back navigation, or pagination)
    -- so this collects potentially multiple clicks and checks if any were this hit
    ARRAY_CONTAINS(COLLECT_LIST(z.click_page_id), hit.pageid) AS clicked
FROM (
    SELECT
        top_query_clicks.query,
        top_query_clicks.norm_query,
        top_query_clicks.click_page_id,
        top_query_clicks.search_timestamp,
        top_query_clicks.click_timestamp,
        top_query_clicks.hits,
        top_query_clicks.session_id
    FROM
        ebernhardson.top_query_clicks
    JOIN (
        -- Randomly select N unique normalized queries
        SELECT
            x.project,
            x.norm_query
        FROM (
            SELECT
                project,
                norm_query,
                count(distinct year, month, day, session_id) as num_searchs
            FROM
                ebernhardson.top_query_clicks
            WHERE
                year = 2016
                AND project = '%s'
            GROUP BY
                project,
                norm_query
        ) x
        WHERE
            x.num_searchs >= %d
        DISTRIBUTE BY rand()
        SORT BY rand()
        LIMIT %d
    ) y
    ON
        y.norm_query = top_query_clicks.norm_query
        AND y.project = top_query_clicks.project
    WHERE
        year = 2016
) z
LATERAL VIEW
    -- ideally we want to know the order within hits, as this is the display
    -- order, but how?
    EXPLODE(z.hits) h AS hit
GROUP BY
    z.query,
    z.norm_query,
    z.session_id,
    hit.pageid,
    hit.title
;
""" % (config.WIKI_PROJECT, config.MIN_NUM_SEARCHES, config.MAX_QUERIES)

    if not os.path.isfile(config.CLICK_DATA_TSV):
        with tempfile.TemporaryFile() as tmp:
            command = []
            if False:
                command += ['ssh', '-o', 'Compression=yes', 'stat1002.eqiad.wmnet']
            command += ['hive', '-S', '-e', '"' + sql + '"']
            p = subprocess.Popen(command, stdin=None, stdout=tmp,
                                 stderr=subprocess.PIPE)
            _, stderr = p.communicate(input=sql)
            if not os.fstat(tmp.fileno()).st_size > 0:
                print stderr
                raise Exception
            tmp.seek(0)
            with codecs.open(config.CLICK_DATA_TSV, 'w', 'utf-8') as f:
                for line in tmp:
                    try:
                        f.write(line.decode('utf-8'))
                    except UnicodeDecodeError:
                        pass

    # Read the tsv into pandas
    df = pd.read_csv(config.CLICK_DATA_TSV, sep="\t", index_col=False,
                     encoding="utf-8")
    # and write it back out as hdf5
    table_utils._write(config.CLICK_DATA, df)
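
# A quick sanity check one might run after the fetch, before anything
# downstream reads config.CLICK_DATA; the expected column names come from
# the SELECT aliases above (illustrative helper, not part of the pipeline):
def _check_click_columns(df):
    expected = ['query', 'norm_query', 'session_id', 'hit_page_id',
                'hit_title', 'hit_position', 'hit_score', 'clicked']
    missing = [c for c in expected if c not in df.columns]
    if missing:
        raise ValueError("click data missing columns: %s" % ', '.join(missing))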