def create_graph_domain():
    """
    Creates a graph linking the domain of each viewed item to the domain of
    the item that was bought.
    """
    """ Fetch data """
    from input.read_input import read_item_data
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')

    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    """ JSON """
    # Only the first ~80% of the train set is used (the same split as 'train' mode elsewhere).
    check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    DATA_PATH = path.join(DATA_DIR, 'train_dataset.jl')

    """ Create graph vertices """
    g = ig.Graph()
    from input.read_input import get_mappings
    counter, f_map_func, r_map_func = get_mappings()

    for k in dct_title.keys():
        g.add_vertex(value=k, deg=dct_ratio_item_b[k], domain_id=dct_domain[k],
                     price=dct_price[k], cat='item_id')

    """ Vertex categories: ['item_id','domain_id','category_id','product_id'] """
    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k, cat='domain_id')
    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k, cat='category_id')
    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k, cat='product_id')

    """ Create edges """
    E1 = []
    E2 = []
    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = [h['event_info'] for h in obj['user_history']
                     if h['event_type'] == 'view']
                # One edge per viewed item: viewed item's domain -> bought item's domain.
                for k in L:
                    E1.append(dct_domain[k])
                    E2.append(dct_domain[obj['item_bought']])

    E1 = f_map_func['domain_id'](E1)
    E2 = f_map_func['domain_id'](E2)

    # Collapse repeated (source, target) pairs into a single weighted edge.
    E = pd.Series(list(zip(E1, E2))).value_counts()
    g.add_edges(E.index)
    g.es["weight"] = E.values
    g.write_pickle(fname=path.join(DATA_DIR, 'graph_domain_to_domain.pkl'))
    # Returned so callers (e.g. train_neural_domain_prediction) can use the
    # graph directly; the original built it but returned nothing.
    return g
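# --- Hedged usage sketch (not part of the original pipeline) -----------------
# Given the pickled domain-to-domain graph, list the domains most often bought
# after viewing a given domain. This only assumes the vertex/edge layout built
# above and the python-igraph API; `source_vertex_id` must be the igraph
# vertex index of a 'domain_id' vertex.
def top_target_domains(G, source_vertex_id, k=5):
    """Return the k highest-weight outgoing (viewed -> bought) domain edges."""
    edge_ids = G.incident(source_vertex_id, mode='OUT')
    edges = sorted(G.es[edge_ids], key=lambda e: e["weight"], reverse=True)[:k]
    return [(G.vs[e.target]["value"], e["weight"]) for e in edges]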
def train_neural_domain_prediction():
    import tensorflow as tf

    """ Fetch data """
    from input.read_input import read_item_data
    from input.create_ratio import load_language_df  # needed below
    df = read_item_data()
    dct_condition = df['condition'].to_dict()

    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    NUM_ITEMS = read_item_data().shape[0]
    NUM_FEATURES = 1
    from input.read_input import get_mappings, NUM_DOMS
    counter, f_map_func, r_map_func = get_mappings()
    NUM_DOMS = pd.unique(df['domain_id']).shape[0]
    NUM_CATS = pd.unique(df['category_id']).shape[0]

    """ Load graph """
    graph_fname = path.join(DATA_DIR, 'graph_domain_to_domain.pkl')
    if not path.isfile(graph_fname):
        input("Did not find graph at {}. Will have to create it from scratch... (any key to continue)".format(graph_fname))
        G = create_graph_domain()
    else:
        G = ig.Graph.Read_Pickle(graph_fname)

    #weights = np.log(1 + np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    # Domain vertices come after the item vertices, hence the offset.
    indices = np.array([np.array(e.tuple) for e in G.es]) - NUM_ITEMS
    indices = np.transpose(indices)

    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]
    W = coo_matrix((weights, (row, col)), shape=(NUM_DOMS, NUM_DOMS))

    """ Normalize rows """
    #W = deg_matrix(W, pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)

    @tf.function
    def smooth_labels(labels, factor=0.001):
        # Scale labels down by (1 - factor) and redistribute `factor`
        # uniformly across the classes.
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        return labels

    @tf.function
    def compute_loss(labels, logits):
        # NOTE: both arguments are probability distributions here (the model
        # ends in a softmax). The arguments appear swapped relative to the
        # usual cross-entropy H(labels, logits); kept as in the original,
        # since the saved weights were trained with it.
        logits = tf.reshape(logits, (-1, NUM_DOMS))
        labels = tf.reshape(labels, (-1, NUM_DOMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        losses = -tf.reduce_sum(logits * tf.math.log(labels), axis=1)
        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_DOMS))
        labels = tf.reshape(labels, (-1, NUM_DOMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        acc = tf.metrics.categorical_accuracy(labels, logits)
        return tf.reduce_mean(acc)

    """ Read data """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()  # was 'searched' in the original, an apparent copy-paste slip

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)
    feat_1, feat_2, feat_3 = domain_df['searched'].values, domain_df['bought'].values, domain_df['rat'].values
    feat_4, feat_5 = domain_df['out_bought'].values, domain_df['rat2'].values
    feat_1_1, feat_2_1, feat_3_1 = category_df['searched'].values, category_df['bought'].values, category_df['rat'].values

    def standardize(x):
        # Min-max scaling to [0, 1].
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = [standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]]

    from nn.domain_string_identifier import load_model, predict_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(DATA_DIR,
                              'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):
                if not check(line_i):
                    continue
                L, S, C, IDS = [], [], [], []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(dct_domain[h['event_info']])
                        C.append(dct_cat[h['event_info']])
                        IDS.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        S.append(h['event_info'])
                L = f_map_func['domain_id'](L)
                C = f_map_func['category_id'](C)

                df = pd.DataFrame(
                    {"domain_id": L,
                     "feat_1_1": [feat_1_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))],
                     "feat_2_1": [feat_2_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))],
                     "feat_3_1": [feat_3_1[C[i] - NUM_ITEMS - NUM_DOMS] for i in range(len(L))]},
                    index=IDS)
                df['recency'] = range(len(L))
                df['freq'] = np.ones((len(L),))
                df['price'] = [dct_price[k] for k in IDS]
                df['item_b'] = [dct_ratio_item_b[k] for k in IDS]
                df['item_s'] = [dct_ratio_item_s[k] for k in IDS]
                df['condition'] = [dct_condition[k] for k in IDS]
                df['lan_pt'] = [dct_lan_pt[k] for k in IDS]
                df['lan_en'] = [dct_lan_en[k] for k in IDS]
                df['lan_es'] = [dct_lan_es[k] for k in IDS]

                """ Build the per-domain feature matrix """
                Y = np.zeros((NUM_DOMS, 1)).astype(np.float32)
                X = np.zeros((NUM_DOMS, 55 + 55)).astype(np.float32)
                X[:, 0] = feat_1
                X[:, 1] = feat_2
                X[:, 2] = feat_3
                X[:, 3] = feat_4
                i = 4
                for g, df2 in df.groupby("domain_id"):
                    # Aggregate the history features of each viewed domain.
                    i = 4
                    v = df2.to_numpy()[:, 1:]
                    X[g - NUM_ITEMS, i:i + v.shape[1]] = np.sum(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + v.shape[1]] = np.mean(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + v.shape[1]] = np.nanstd(v, axis=0)
                    i += v.shape[1]
                    X[g - NUM_ITEMS, i:i + v.shape[1]] = np.max(v, axis=0)
                    i += v.shape[1]

                if len(S) > 0:
                    s_pred = predict_model(domain_prediction_model, S, return_numeric=True)
                    X[:, i] = np.mean(s_pred, axis=0)
                    X[:, i + 1] = np.max(s_pred, axis=0)
                    try:
                        X[:, i + 2] = np.nanstd(s_pred, axis=0)
                    except Exception:
                        pass
                i += 3

                # Graph propagation: the right half of X holds the first 55
                # features smoothed over the domain graph. (The original
                # multiplied W by the still-empty right half, which yields
                # zeros; W @ X[:, :55] appears to be the intent.)
                X[:, 55:] = np.reshape(np.asarray(W @ X[:, :55]), (-1, 55))

                if not mode == 'test':
                    Y[f_map_func['domain_id']([dct_domain[obj['item_bought']]])[0] - NUM_ITEMS, 0] = 1.0

                for i in range(55 + 3):
                    X[:, i] = (X[:, i] - np.min(X[:, i])) / (1e-06 + np.max(X[:, i]) - np.min(X[:, i]))
                yield X, Y

    """ Optimize """
    BS = 64

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if loop == False:
                # Flush the (possibly partial) final batch.
                yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break

    """ Define model """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_DOMS, 55 + 55))
    x = layers.Dense(64, activation='relu')(inp_x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
    model = keras.Model(inputs=[inp_x], outputs=[x])
    print(model.summary())

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2, decay_steps=1000, decay_rate=0.9)  # defined but unused below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1 * 1e-2)
    model_fname = path.join(DATA_DIR, 'model', "NEURAL_DOMAIN_PRED.h5")
    model.compile(optimizer=optimizer, loss=compute_loss, metrics=[evaluate])

    from input.read_input import TRAIN_LINES
    if not path.isfile(model_fname):
        input("Warning!!! Did not find model weights at {}. Training takes many, many hours! (Press ENTER)".format(model_fname))
        model.fit_generator(batch_generator('train', True),
                            steps_per_epoch=TRAIN_LINES // BS,
                            epochs=5)
        model.save_weights(model_fname)
    else:
        model.load_weights(model_fname)
        print("Testing fit... accuracy should be about 0.41 to 0.45")
        model.fit_generator(batch_generator('train', True), steps_per_epoch=25, epochs=1)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 320
        for batch_id, batch in enumerate(batch_generator(mode, batch_size=batch_size, loop=False)):
            x = batch[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])
        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        PREDS = np.concatenate([PREDS, CONFS], axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)] + \
               ['conf_{}'.format(k) for k in range(NUM_SELECT)]
        # NOTE: final_prediction reads 'domain_pred_{mode}.csv'; keep the
        # filenames in sync if you retrain.
        fname = os.path.join(DATA_DIR, 'dom_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]), columns=cols).to_csv(fname)

    predict('val')
    predict('test')
    predict('train')
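# --- Minimal numpy check (not part of the pipeline) ---------------------------
# Mirrors the effect of smooth_labels above, assuming each row of `labels` is
# a probability distribution: scaling by (1 - factor) and adding factor/N
# keeps every row summing to 1.
def smooth_labels_np(labels, factor=0.001):
    labels = labels.astype(np.float32) * (1 - factor)
    return labels + factor / labels.shape[1]

def _check_smoothing():
    p = np.eye(3, dtype=np.float32)
    assert np.allclose(smooth_labels_np(p).sum(axis=1), 1.0)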
def final_prediction(mode='train', use_graph=True, debug=False):
    """
    Combines all classifiers in a hierarchical manner to create the final
    predictions.

    First, we create many rankings for the items seen during the object
    history, such as ones based on frequency and recency. Perhaps the most
    important rankings are the ones related to the predictions of the RNN and
    the LGB ranker. I have hardcoded some blending coefficients that attained
    good validation accuracy. The top 10 items are selected.

    Then, I use the Neural Domain Classifier's predictions to eliminate items
    among those 10, specifically ones whose domain is very unlikely to be the
    bought one. Once again, there is a hardcoded cutoff that may need some
    tuning if you train the classifier from scratch, as it can have a
    significant effect on the NDCG.
    """
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()

    from input.create_ratio import load_language_df
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()

    dct_condition = df['condition'].to_dict()
    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()
    dct_pid = df['product_id'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    """ Most common embeddings """
    ratio_df = ratio_df.sort_values(['popularity'], ascending=False)
    most_common_emb = get_emb([first_two_words(dct[k]) for k in ratio_df.index[0:100]],
                              [-1] * 100)

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()

    df['item_popularity'] = [dct_ratio_item_p[k] for k in df.index]
    dct_ratio_cat = get_ratio(which='category_id')
    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        dct_domain_df[dom] = df2.sort_values(['item_popularity'], ascending=False)
    for cat, df2 in df.groupby('category_id'):
        dct_cat_df[cat] = df2.sort_values(['item_popularity'], ascending=False)

    """ RNN stuff """
    from input.rnn_item_ranker import SEQ_LEN, CANDIDATES
    from input.rnn_item_ranker import read_predictions
    rnn_pred = read_predictions(mode)
    assert rnn_pred.shape[1] == 2 * CANDIDATES
    if mode == 'train' or mode == 'val':
        assert rnn_pred.shape[0] == TRAIN_LINES

    """ LGB stuff """
    import lightgbm as lgb
    import joblib  # sklearn.externals.joblib was removed in recent scikit-learn
    lgbc = joblib.load(path.join(DATA_DIR, 'model', 'lgb.pkl'))

    """ Graph-related initialization """
    graph_fname = path.join(DATA_DIR, 'graph_domain_id.pkl')
    if not path.isfile(graph_fname):
        print("Creating item-to-item graph")
        create_item_graph(mode='train')
    G1 = ig.Graph.Read_Pickle(graph_fname)
    _, f_map_func, r_map_func = get_mappings()

    if mode == 'test':
        DF_DOM_PRED = pd.read_csv(path.join(DATA_DIR, 'domain_pred_test.csv'), index_col=0)
    else:
        DF_DOM_PRED = pd.concat([
            pd.read_csv(path.join(DATA_DIR, 'domain_pred_train.csv'), index_col=0),
            pd.read_csv(path.join(DATA_DIR, 'domain_pred_val.csv'), index_col=0)
        ], ignore_index=True)

    # Reverse the column order so the most confident prediction comes first.
    DF_CONF_PRED = DF_DOM_PRED.loc[:, ['conf_{}'.format(i) for i in range(10)[::-1]]]
    DF_DOM_PRED = DF_DOM_PRED.loc[:, ['pred_{}'.format(i) for i in range(10)[::-1]]]
    vals = pd.unique(df['domain_id'].values)
    for c in DF_DOM_PRED.columns:
        # Map numeric domain indices back to domain names.
        DF_DOM_PRED[c] = DF_DOM_PRED[c].values.astype(np.int32)
        DF_DOM_PRED[c] = [vals[k] for k in DF_DOM_PRED[c]]

    """ EMB stuff """
    from gcn.domain_string_identifier import predict_model, load_model
    domain_identifier = load_model()

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True
    DATA_PATH = path.join(DATA_DIR,
                          'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        ids = (-rank).argsort(kind='mergesort')
        return L[ids], rank[ids]

    pred = {}
    res = []
    actual = []
    domain_ids = []
    lgb_acc = 0
    rnn_acc = 0
    counter = 0
    del df
    del df2

    with jsonlines.open(DATA_PATH) as reader:
        for line_id, obj in enumerate(reader):
            # Relevance scheme used by the challenge's NDCG (kept for reference).
            def score(k):
                if k == obj['item_bought']:
                    return 12
                elif dct_domain[k] == dct_domain[obj['item_bought']]:
                    return 1
                else:
                    return 0

            if not check(line_id):
                continue
            print("Current line {}".format(line_id))
            L = [h['event_info'] for h in obj['user_history'] if h['event_type'] == 'view']
            S = [h['event_info'] for h in obj['user_history'] if h['event_type'] == 'search']
            # Unique viewed items, most recent last.
            L_k = pd.unique(L[::-1])[::-1]

            """ Calculate ranks """
            if len(L_k) > 0:
                rank_ratio_dom = pd.Series([dct_ratio_dom[dct_domain[k]] for k in L_k]).rank(method="average").to_numpy()
                rank_ratio_cat = pd.Series([dct_ratio_cat[dct_cat[k]] for k in L_k]).rank(method="average").to_numpy()
                rank_ratio_item = pd.Series([dct_ratio_item_p[k] for k in L_k]).rank(method="average").to_numpy()
                rank_freq = pd.Series(L, index=range(len(L))).value_counts(sort=False)
                rank_freq = rank_freq.rank(method="average").to_dict()
                rank_freq = np.array([rank_freq[k] for k in L_k])
                rank_latest = np.arange(len(L_k))
                rank_price = pd.Series([-dct_price[k] for k in L_k]).rank(method="average").to_numpy()

                vals = DF_DOM_PRED.iloc[line_id, :].values
                RANK_DOM = [np.where(vals == dct_domain[k])[0] for k in L_k]
                RANK_DOM = [vals.shape[0] - k[0] if len(k) > 0 else 0 for k in RANK_DOM]
                RANK_DOM = pd.Series(RANK_DOM).rank(method="average").to_numpy()

                dct_rnn = dict([(int(x), y) for x, y in
                                zip(rnn_pred.iloc[line_id, 0:CANDIDATES],
                                    rnn_pred.iloc[line_id, -CANDIDATES:])])
                if len(L_k) <= CANDIDATES:
                    try:
                        rank_ratio_rnn = pd.Series([dct_rnn[k] for k in L_k]).rank(method="average").to_numpy()
                    except KeyError:
                        print(L_k)
                        print(rnn_pred.iloc[(line_id - 5):(line_id + 10), :])
                        raise ValueError("Did not find keys in RNN prediction")
                else:
                    rank_ratio_rnn = pd.Series([1.0 for k in L_k]).rank(method="average").to_numpy()

                x = []
                x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                x.append(rank_ratio_dom)
                x.append(rank_ratio_cat)
                x.append(rank_price)
                x.append([dct_ratio_item_b[k] for k in L_k])
                x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                x.append([dct_ratio_item_b[k] for k in L_k])
                x.append([dct_ratio_item_s[k] for k in L_k])
                x.append([dct_ratio_item_r[k] for k in L_k])
                x.append(list(rank_latest / len(L_k)))
                x.append([-dct_price[k] for k in L_k])
                x.append([-dct_condition[k] for k in L_k])
                x.append([-dct_lan_en[k] for k in L_k])
                x.append([-dct_lan_es[k] for k in L_k])
                x.append([-dct_lan_pt[k] for k in L_k])
                x = np.transpose(np.reshape(np.array(x), (-1, len(L_k))))
                rank_lgb = pd.Series(lgbc.predict(x)).rank(method="average").to_numpy()

                if (not mode == 'test') and obj['item_bought'] in L_k and len(L_k) <= CANDIDATES:
                    if L_k[np.argmax(rank_lgb)] == obj['item_bought']:
                        lgb_acc += 1
                    if L_k[np.argmax(rank_ratio_rnn)] == obj['item_bought']:
                        rnn_acc += 1
                    counter += 1

                COEFFS = [1.5, 1.5, 4.5, 0.4, 0.4, 0.6, 0.8, 0.0]
                COEFFS = np.array(COEFFS) / np.sum(COEFFS)
                final_rank = COEFFS[0] * rank_freq + \
                             COEFFS[1] * rank_lgb + \
                             COEFFS[2] * rank_ratio_rnn + \
                             COEFFS[3] * rank_ratio_dom + \
                             COEFFS[4] * rank_ratio_cat + \
                             COEFFS[5] * rank_ratio_item + \
                             COEFFS[6] * rank_latest + \
                             COEFFS[7] * rank_price

                """ Order items by the blended rank, then filter """
                L, L_ranks = rank_to_order(L_k, final_rank)
                # Keep only items whose domain got nonzero confidence from the
                # Neural Domain Classifier...
                b = np.where(DF_CONF_PRED.iloc[line_id, :] > 0)[0]
                vals = DF_DOM_PRED.iloc[line_id, :].values[b]
                L = np.array([k for k in L if dct_domain[k] in vals])
                # ...and whose RNN score clears the (hardcoded) cutoff.
                L = np.array([k for k in L if dct_rnn.get(k, 1) > 1e-02])
                L = L[:10]
                P = np.zeros((10,), dtype=np.int32)
                P[0:L.shape[0]] = L
            else:
                P = np.zeros((10,), dtype=np.int32)
                L = np.array(L)

            TEMP_MAX = 101
            if len(obj['user_history']) > 0:
                temp = []
                doms = [dct_domain[k] for k in L]
                if len(L) > 0:
                    score_en = np.nanmean([dct_lan_en[k] for k in L])
                    score_es = np.nanmean([dct_lan_es[k] for k in L])
                    score_pt = np.nanmean([dct_lan_pt[k] for k in L])
                else:
                    score_en, score_es, score_pt = 0, 0, 0

                b = np.where(DF_CONF_PRED.iloc[line_id, :] > 1e-05)[0]
                doms = DF_DOM_PRED.iloc[line_id, :].values[b]
                cats = [x[1] for x in sorted([(-dct_ratio_cat[k], str(k))
                                              for k in [dct_cat[k] for k in L]])]
                cat_rating = dict([(k, -dct_ratio_cat[k]) for k in cats])

                if use_graph:
                    roots = pd.unique([k for k in L])
                    roots = f_map_func['item_id'](roots)

                for dom in doms:
                    if use_graph and len(roots) > 0:
                        # Collect co-purchase candidates from the item graph:
                        # out-neighbors of the viewed items, restricted to `dom`.
                        candidates = []
                        for k in roots:
                            source_vert = G1.vs[k]
                            es = G1.es[G1.incident(source_vert, mode='OUT')]
                            vs = G1.vs[[e.target for e in es]].select(domain_id=dom)
                            candidates.extend([v['value'] for v in vs])
                        if len(candidates) > 0:
                            # Keep candidates reached from more than one root.
                            candidates = pd.Series(candidates).value_counts()
                            candidates = candidates[candidates.values > 1]
                            temp.extend([k for k in list(candidates.index) if k not in temp])

                    if dom in dct_domain_df.keys():
                        if len(temp) > 40:
                            break
                        x = dct_domain_df[dom].index[0:TEMP_MAX]
                        """
                        Here we try to restrict to items in the same language.
                        This had minimal effect on the NDCG.
                        """
                        if score_pt - score_es > 0.4:
                            x = [k for k in x if score_pt - dct_lan_pt[k] < 0.2]
                        elif score_es - score_pt > 0.4:
                            # (The original dropped this assignment.)
                            x = [k for k in x if score_es - dct_lan_es[k] < 0.2]
                        x = sorted(x, key=lambda k: cat_rating[dct_cat[k]] if dct_cat[k] in cats else 0)
                        temp.extend(x)

                """ Add more items if there aren't enough """
                temp = temp[0:TEMP_MAX]
                temp = [k for k in temp if k not in L]
                x = 0
                while len(pd.unique(temp)) < 10:
                    if isinstance(DF_DOM_PRED.iloc[line_id, x], str):
                        temp.extend(dct_domain_df[DF_DOM_PRED.iloc[line_id, x]].index[0:10])
                    x += 1
                temp = [k for k in temp if k not in L]
                temp = pd.unique(temp)

                """ Finally, add the ranked items to our prediction. """
                P[L.shape[0]:] = temp[:(10 - L.shape[0])]
            else:
                """ Special case for an empty search and item history """
                temp = []  # (the original reused `temp` from the previous line here)
                x = 0
                while len(pd.unique(temp)) < 10:
                    if isinstance(DF_DOM_PRED.iloc[line_id, x], str):
                        temp.extend(dct_domain_df[DF_DOM_PRED.iloc[line_id, x]].index[0:10])
                    x += 1
                temp = [k for k in temp if k not in L]
                temp = pd.unique(temp)
                # Fill the prediction (assumed: the original never wrote temp
                # into P in this branch, leaving P all zeros).
                P[L.shape[0]:] = temp[:(10 - L.shape[0])]

            """ Set prediction """
            pred[line_id] = P
            actual.append(obj.get('item_bought', 0))
            if len(actual) > 10000 and debug:
                break

    """ Now we calculate NDCG and save our prediction DataFrame. """
    pred = np.reshape(np.asarray(list(pred.values())), (-1, 10))
    if mode == 'test':
        OUT_PATH = path.join(SUBMISSIONS_DIR, 'submission.csv')
        out_df = pd.DataFrame(data=pred, index=range(pred.shape[0]),
                              columns=range(pred.shape[1]))
        out_df.to_csv(OUT_PATH, index=False, header=False)
    else:
        print(pred)
        actual = np.asarray(actual)
        res = ndcg(pred, actual)
        print("Number of objects: {}".format(pred.shape[0]))
        print(COEFFS)
        print("NDCG: {}".format(res))
        return -res
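# --- Hedged sketch of the evaluation metric -----------------------------------
# The `ndcg` called above is defined elsewhere in the repo; this sketch shows
# one plausible NDCG@10 matching the 12/1/0 relevance scheme of `score` above
# (12 for the bought item, 1 for items in its domain). The IDCG idealization
# (exact hit first, same-domain items after) is an assumption.
def ndcg_at_10_sketch(pred_rows, bought_items, dct_domain):
    discounts = 1.0 / np.log2(np.arange(2, 12))  # positions 1..10
    idcg = 12 * discounts[0] + np.sum(discounts[1:])
    scores = []
    for row, bought in zip(pred_rows, bought_items):
        rel = np.array([12 if k == bought
                        else 1 if dct_domain.get(k) == dct_domain.get(bought)
                        else 0 for k in row])
        scores.append(np.sum(rel * discounts) / idcg)
    return float(np.mean(scores))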
def create_item_graph(mode='train'):
    """
    Creates a graph whose vertices correspond to items. For each purchase, an
    edge is added from each viewed item to the one that was bought. Edges may
    be repeated.
    """
    """ Fetch data """
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()
    df['item_id'] = df.index
    dct_title = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    """ JSON """
    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True
    DATA_PATH = path.join(DATA_DIR,
                          'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    """ Create graph vertices """
    g = ig.Graph()
    counter, f_map_func, r_map_func = get_mappings()
    for k in dct_title.keys():
        g.add_vertex(value=k, deg=dct_ratio_item_b[k], domain_id=dct_domain[k],
                     price=dct_price[k], cat='item_id')

    """ Vertex categories: ['item_id','domain_id','category_id','product_id'] """
    for k in pd.unique(df['domain_id']):
        g.add_vertex(value=k, cat='domain_id')
    for k in pd.unique(df['category_id']):
        g.add_vertex(value=k, cat='category_id')
    for k in pd.unique(df['product_id']):
        g.add_vertex(value=k, cat='product_id')

    """ Create edges """
    E1 = []
    E2 = []
    with jsonlines.open(DATA_PATH) as reader:
        for line_i, obj in enumerate(reader):
            if check(line_i):
                print(line_i)
                L = [h['event_info'] for h in obj['user_history']
                     if h['event_type'] == 'view']
                L = pd.unique(L)
                for k in L:
                    E1.append(k)
                    E2.append(obj['item_bought'])

    E1 = f_map_func['item_id'](E1)
    E2 = f_map_func['item_id'](E2)
    E = list(zip(E1, E2))
    g.add_edges(E)
    #g = g.as_undirected()
    g.write_pickle(fname=path.join(DATA_DIR, 'graph_domain_id.pkl'))
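# --- Hedged variant -----------------------------------------------------------
# create_item_graph deliberately keeps repeated edges; if a weighted graph is
# preferred instead (as in create_graph_domain above), the same value_counts
# trick collapses duplicates. `E1`/`E2` are the already-mapped endpoint lists.
def add_weighted_edges(g, E1, E2):
    counts = pd.Series(list(zip(E1, E2))).value_counts()
    g.add_edges(list(counts.index))
    g.es["weight"] = counts.values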
def get_lgb_data(avoid_overfit=True):
    """
    Gets all the features necessary to train the LGB ranker, arranging them
    into matrices.

    Args:
        avoid_overfit (bool): If ``True``, avoid overfitting by decreasing the
            item/domain/category bought/searched counts for the elements from
            the history of a given purchase. Default is ``True``.

    Returns:
        Tuple of size 3:
            X (NDArray[float].shape[N,D]): Features.
            Y (NDArray[float].shape[N,1]): Labels.
            M (NDArray[int].shape[N]): Line id of the purchase each row came
                from (useful for grouping rows and for a train/validation split).
    """
    from input.create_ratio import load_language_df
    mode = 'train'
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()
    dct_condition = df['condition'].to_dict()
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()
    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()
    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s, dct_ratio_cat_b, dct_ratio_cat = (dct_ratio_cat['searched'].to_dict(),
                                                       dct_ratio_cat['bought'].to_dict(),
                                                       dct_ratio_cat['rat'].to_dict())

    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s, dct_ratio_dom_b, dct_ratio_dom = (dct_ratio_dom['searched'].to_dict(),
                                                       dct_ratio_dom['bought'].to_dict(),
                                                       dct_ratio_dom['rat'].to_dict())

    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        dct_domain_df[dom] = df2.sort_values(['item_bought'], ascending=False)
    for cat, df2 in df.groupby('category_id'):
        dct_cat_df[cat] = df2.sort_values(['item_bought'], ascending=False)

    """ RNN stuff """
    from input.rnn_item_ranker import read_predictions
    rnn_pred = read_predictions(mode)
    #assert rnn_pred.shape[0] == TRAIN_LINES

    DATA_PATH = path.join(DATA_DIR,
                          'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        ids = (-rank).argsort(kind='mergesort')
        return L[ids], rank[ids]

    X = []
    Y = []
    M = []
    with jsonlines.open(DATA_PATH) as reader:
        for line_id, obj in enumerate(reader):
            print(line_id)
            L = [h['event_info'] for h in obj['user_history'] if h['event_type'] == 'view']
            S = [h['event_info'] for h in obj['user_history'] if h['event_type'] == 'search']
            L_k = pd.unique(L[::-1])[::-1]

            """ Overfitting avoidance: temporarily remove this purchase from
            the global counts so the features do not leak the label. """
            if avoid_overfit:
                if line_id <= 330530:
                    target_item = obj['item_bought']
                    target_dom = dct_domain[obj['item_bought']]
                    target_cat = dct_cat[obj['item_bought']]
                    for this_item in L_k:
                        """ Bought """
                        if this_item == target_item:
                            assert dct_ratio_item_b[this_item] > 0
                            dct_ratio_item_b[this_item] -= 1
                        """ Searched """
                        dct_ratio_item_s[this_item] -= 1
                        assert dct_ratio_item_s[this_item] >= 0
                        """ Ratio """
                        dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (dct_ratio_item_s[this_item] + 1)
                    for this_dom in pd.unique([dct_domain[k] for k in L_k]):
                        if not isinstance(this_dom, str):
                            continue
                        if this_dom == target_dom:
                            assert dct_ratio_dom_b[this_dom] > 0
                            dct_ratio_dom_b[this_dom] -= 1
                        dct_ratio_dom_s[this_dom] -= 1
                        assert dct_ratio_dom_s[this_dom] >= 0
                        dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (dct_ratio_dom_s[this_dom] + 1)
                    for this_cat in pd.unique([dct_cat[k] for k in L_k]):
                        if this_cat == target_cat:
                            assert dct_ratio_cat_b[this_cat] > 0
                            dct_ratio_cat_b[this_cat] -= 1
                        dct_ratio_cat_s[this_cat] -= 1
                        assert dct_ratio_cat_s[this_cat] >= 0
                        dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (dct_ratio_cat_s[this_cat] + 1)

            """ Calculate ranks """
            # NOTE: the original indexed rnn_pred with a stale `i` (always 0);
            # line_id is what final_prediction uses.
            dct_rnn = dict([(int(x), y) for x, y in
                            zip(rnn_pred.iloc[line_id, 0:10], rnn_pred.iloc[line_id, -10:])])
            if len(L_k) <= 10:
                rank_ratio_rnn = pd.Series([dct_rnn.get(k, 0) for k in L_k]).rank(method="average").to_numpy()
            else:
                rank_ratio_rnn = pd.Series([1.0 for k in L_k]).rank(method="average").to_numpy()

            rank_ratio_dom = pd.Series([dct_ratio_dom[dct_domain[k]] for k in L_k]).rank(method="average").to_numpy()
            rank_ratio_cat = pd.Series([dct_ratio_cat[dct_cat[k]] for k in L_k]).rank(method="average").to_numpy()
            rank_ratio_item = pd.Series([dct_ratio_item_p[k] for k in L_k]).rank(method="average").to_numpy()
            rank_freq = pd.Series(L, index=range(len(L))).value_counts(sort=False)
            rank_freq = rank_freq.rank(method="average").to_dict()
            rank_freq = np.array([rank_freq[k] for k in L_k])
            rank_latest = np.arange(len(L_k))
            rank_price = pd.Series([-dct_price[k] for k in L_k]).rank(method="average").to_numpy()

            x = []
            x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
            x.append(rank_ratio_dom)
            x.append(rank_ratio_cat)
            x.append(rank_price)
            # NOTE: slot 4 here is the domain ratio, while final_prediction
            # feeds dct_ratio_item_b in the same slot; the two feature layouts
            # should probably match.
            x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
            x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
            x.append([dct_ratio_item_b[k] for k in L_k])
            x.append([dct_ratio_item_s[k] for k in L_k])
            x.append([dct_ratio_item_r[k] for k in L_k])
            x.append(list(rank_latest / len(L_k)))
            x.append([-dct_price[k] for k in L_k])
            x.append([-dct_condition[k] for k in L_k])
            x.append([-dct_lan_en[k] for k in L_k])
            x.append([-dct_lan_es[k] for k in L_k])
            x.append([-dct_lan_pt[k] for k in L_k])

            """ Overfitting avoidance - pt 2: restore the counts """
            # (Also wrapped in `avoid_overfit`; the original restored the
            # counts unconditionally, which would corrupt them when
            # avoid_overfit=False.)
            if avoid_overfit and line_id <= 330530:
                target_item = obj['item_bought']
                target_dom = dct_domain[obj['item_bought']]
                target_cat = dct_cat[obj['item_bought']]
                for this_item in L_k:
                    if this_item == target_item:
                        dct_ratio_item_b[this_item] += 1
                    dct_ratio_item_s[this_item] += 1
                    dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (dct_ratio_item_s[this_item] + 1)
                for this_dom in pd.unique([dct_domain[k] for k in L_k]):
                    if not isinstance(this_dom, str):
                        continue
                    if this_dom == target_dom:
                        dct_ratio_dom_b[this_dom] += 1
                    dct_ratio_dom_s[this_dom] += 1
                    dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (dct_ratio_dom_s[this_dom] + 1)
                for this_cat in pd.unique([dct_cat[k] for k in L_k]):
                    if this_cat == target_cat:
                        dct_ratio_cat_b[this_cat] += 1
                    dct_ratio_cat_s[this_cat] += 1
                    dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (dct_ratio_cat_s[this_cat] + 1)

            if len(L_k) == 0:
                continue
            x = np.transpose(np.reshape(np.array(x), (-1, len(L_k))))

            def score(k):
                # Graded relevance: 2 = exact item, 1 = same domain, 0 = other.
                if k == obj['item_bought']:
                    return 2
                elif dct_domain[k] == dct_domain[obj['item_bought']]:
                    return 1
                else:
                    return 0

            y = np.array([score(k) for k in L_k])[:, None]
            if np.sum(y) >= 0:
                X.append(x)
                Y.append(y)
                M.append(np.array([line_id] * len(L_k)))

    X = np.concatenate(X, axis=0)
    Y = np.concatenate(Y, axis=0)
    M = np.concatenate(M)
    return X, Y, M
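# --- Hedged training sketch ----------------------------------------------------
# How the matrices returned above might feed the LGB model that
# final_prediction loads from DATA_DIR/model/lgb.pkl. The actual training
# script lives elsewhere in the repo; the estimator type and hyperparameters
# are placeholders, and the 330530 boundary mirrors the ~80% train split used
# throughout this module.
def train_lgb_sketch():
    import lightgbm as lgb
    import joblib
    X, Y, M = get_lgb_data(avoid_overfit=True)
    train_mask = M <= 330530  # first ~80% of purchase lines
    model = lgb.LGBMRegressor(n_estimators=200)
    model.fit(X[train_mask], Y[train_mask].ravel())
    print("validation score:", model.score(X[~train_mask], Y[~train_mask].ravel()))
    joblib.dump(model, path.join(DATA_DIR, 'model', 'lgb.pkl'))
    return model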
def meli_iterator(mode='train', batch_size=BATCH_SIZE, full=False):
    from input.read_input import get_sentence_model, get_emb
    from input.create_ratio import load_language_df
    TRAIN_LINES = 413163
    TEST_LINES = 177070
    df = read_item_data()
    dct_condition = df['condition'].to_dict()
    df2 = load_language_df()
    dct_lan_pt = df2['score_pt'].to_dict()
    dct_lan_en = df2['score_en'].to_dict()
    dct_lan_es = df2['score_es'].to_dict()
    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    from input.create_ratio import get_ratio
    dct_ratio_dom = get_ratio(which='domain_id')
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_p = ratio_df['popularity'].to_dict()

    ratio_df = get_ratio(which='item_id', full=True, alternate=False)
    dct_ratio_item_b = ratio_df['bought'].to_dict()
    dct_ratio_item_s = ratio_df['searched'].to_dict()
    dct_ratio_item_r = ratio_df['rat'].to_dict()
    df['item_bought'] = [dct_ratio_item_b[k] for k in df.index]

    dct_ratio_cat = get_ratio(which='category_id', full=True)
    dct_ratio_cat_s, dct_ratio_cat_b, dct_ratio_cat = (dct_ratio_cat['searched'].to_dict(),
                                                       dct_ratio_cat['bought'].to_dict(),
                                                       dct_ratio_cat['rat'].to_dict())

    dct_ratio_dom = get_ratio(which='domain_id', full=True)
    dct_ratio_dom_s, dct_ratio_dom_b, dct_ratio_dom = (dct_ratio_dom['searched'].to_dict(),
                                                       dct_ratio_dom['bought'].to_dict(),
                                                       dct_ratio_dom['rat'].to_dict())

    dct_ratio_item = get_ratio(which='item_id')

    dct_domain_df = {}
    dct_cat_df = {}
    for dom, df2 in df.groupby('domain_id'):
        dct_domain_df[dom] = df2.sort_values(['item_bought'], ascending=False)
    for cat, df2 in df.groupby('category_id'):
        dct_cat_df[cat] = df2.sort_values(['item_bought'], ascending=False)
    del df
    del df2

    def _begin_overfit_avoid(L_k):
        # Temporarily remove the current purchase from the global counts so
        # the features do not leak the label. `obj` is the enclosing loop
        # variable at call time.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            """ Bought """
            if this_item == target_item:
                assert dct_ratio_item_b[this_item] > 0
                dct_ratio_item_b[this_item] -= 1
            """ Searched """
            dct_ratio_item_s[this_item] -= 1
            assert dct_ratio_item_s[this_item] >= 0
            """ Ratio """
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            if this_dom == target_dom:
                assert dct_ratio_dom_b[this_dom] > 0
                dct_ratio_dom_b[this_dom] -= 1
            dct_ratio_dom_s[this_dom] -= 1
            assert dct_ratio_dom_s[this_dom] >= 0
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            if this_cat == target_cat:
                assert dct_ratio_cat_b[this_cat] > 0
                dct_ratio_cat_b[this_cat] -= 1
            dct_ratio_cat_s[this_cat] -= 1
            assert dct_ratio_cat_s[this_cat] >= 0
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (dct_ratio_cat_s[this_cat] + 1)

    def _end_overfit_avoid(L_k):
        # Mirror of _begin_overfit_avoid: restore the counts.
        if not mode == 'train':
            return
        target_item = obj['item_bought']
        target_dom = dct_domain[obj['item_bought']]
        target_cat = dct_cat[obj['item_bought']]
        for this_item in L_k:
            if this_item == target_item:
                dct_ratio_item_b[this_item] += 1
            dct_ratio_item_s[this_item] += 1
            dct_ratio_item_r[this_item] = dct_ratio_item_b[this_item] / (dct_ratio_item_s[this_item] + 1)
        for this_dom in pd.unique([dct_domain[k] for k in L_k]):
            if not isinstance(this_dom, str):
                continue
            if this_dom == target_dom:
                dct_ratio_dom_b[this_dom] += 1
            dct_ratio_dom_s[this_dom] += 1
            dct_ratio_dom[this_dom] = dct_ratio_dom_b[this_dom] / (dct_ratio_dom_s[this_dom] + 1)
        for this_cat in pd.unique([dct_cat[k] for k in L_k]):
            if this_cat == target_cat:
                dct_ratio_cat_b[this_cat] += 1
            dct_ratio_cat_s[this_cat] += 1
            dct_ratio_cat[this_cat] = dct_ratio_cat_b[this_cat] / (dct_ratio_cat_s[this_cat] + 1)

    if mode == 'train':
        check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
    elif mode == 'val':
        check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
    else:
        check = lambda x: True
    DATA_PATH = path.join(DATA_DIR,
                          'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')

    def rank_to_order(L, rank):
        assert rank.shape[0] == L.shape[0]
        return L[(-rank).argsort(kind='mergesort')]

    X = []
    Y = []
    MASK = []
    LKS = []
    ACTUAL = []
    while True:
        with jsonlines.open(DATA_PATH) as reader:
            print("Start!!!")
            for line_id, obj in enumerate(reader):
                if not check(line_id):
                    continue
                L = []
                timestamps = []
                if mode == 'test':
                    obj['item_bought'] = -999
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(h['event_info'])
                        timestamps.append(pd.Timestamp(h['event_timestamp']))

                def divide_time(d):
                    # Split a timedelta into (days, hours, minutes), with
                    # hours scaled by 1/24 and minutes by 1/60.
                    d = pd.Timedelta(d).total_seconds()
                    MINUTE_M = 60
                    HOUR_M = MINUTE_M * 60
                    DAY_M = HOUR_M * 24
                    div = [1, 24, 60]
                    res = [0, 0, 0]
                    for i, M in enumerate([DAY_M, HOUR_M, MINUTE_M]):
                        res[i] = np.floor(d / M)
                        d -= M * res[i]
                        res[i] /= div[i]
                        #res[i] -= 0.5
                    return tuple(res)

                if not full and len(L) < 2:
                    continue

                """ Create attributes """
                if len(L) == 0:
                    attrs = np.zeros((1, (CANDIDATES + 1) + ATTR_SIZE + EMB_SIZE))
                    targets = np.zeros((1, (CANDIDATES + 1)))
                    targets[0, -1] = 0
                    L_k = []
                else:
                    delta = [timestamps[-1] - timestamps[i] for i in range(0, len(timestamps))]
                    """ We'll use the latest delta """
                    L = L[::-1]
                    u, unique_id = np.unique(np.array(L), return_index=True)
                    deltas = np.array([divide_time(d) for d in delta])
                    deltas = deltas[unique_id][:SEQ_LEN]
                    L_k = np.array(L)[unique_id][:CANDIDATES]
                    _begin_overfit_avoid(L_k)
                    """ The initial rank_freq calculation needs the whole of L """
                    rank_freq = pd.Series(L, index=range(len(L))).value_counts(sort=False, normalize=True)
                    rank_freq = rank_freq.rank(method="average").to_dict()
                    L = np.array(L)[unique_id][:SEQ_LEN]

                    """ Calculate ranks """
                    condition = np.array([1.0 if dct_condition[k] == 'new' else 0.0 for k in L])[:, None]
                    price = np.log(np.array([1 + np.abs(fix_na(dct_price[k])) for k in L])[:, None])
                    rank_freq = np.array([rank_freq[k] for k in L])[:, None]
                    rank_ratio_dom = pd.Series([dct_ratio_dom[dct_domain[k]] for k in L_k]).rank(method="average").to_numpy()
                    rank_ratio_cat = pd.Series([dct_ratio_cat[dct_cat[k]] for k in L_k]).rank(method="average").to_numpy()
                    rank_ratio_item = pd.Series([dct_ratio_item_r[k] for k in L_k]).rank(method="average").to_numpy()
                    rank_latest = (1.0 - np.arange(len(L)) / len(L))

                    x = []
                    x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                    x.append(rank_ratio_dom)
                    x.append(rank_ratio_cat)
                    x.append(rank_ratio_item)
                    x.append([dct_ratio_dom[dct_domain[k]] for k in L_k])
                    x.append([dct_ratio_cat[dct_cat[k]] for k in L_k])
                    x.append([dct_ratio_item_b[k] for k in L_k])
                    x.append([dct_ratio_item_s[k] for k in L_k])
                    x.append([dct_ratio_item_r[k] for k in L_k])
                    x.append(list(rank_latest / len(L_k)))
                    x.append([-dct_price[k] for k in L_k])
                    x.append([-dct_condition[k] for k in L_k])
                    x.append([-dct_lan_en[k] for k in L_k])
                    x.append([-dct_lan_es[k] for k in L_k])
                    x.append([-dct_lan_pt[k] for k in L_k])

                    # Position of each element of L among the candidates
                    # (CANDIDATES if it is not a candidate), one-hot encoded.
                    assert all([k in L for k in L_k])
                    ids = [np.where(L_k == l)[0][0] if l in L_k else CANDIDATES for l in L]
                    ids_onehot = np.zeros((len(L), (CANDIDATES + 1)))
                    ids_onehot[np.arange(len(L)), ids] = 1

                    """ Concatenate the numeric attributes (plus embeddings) """
                    attr_list = [ids_onehot, deltas, condition, price, rank_freq] + \
                                [np.array(_x)[:, None] for _x in x]
                    if USE_EMB:
                        emb = predict_model(get_sentence_model(),
                                            query_list=[dct[k] for k in L_k],
                                            return_emb=True)
                        emb = np.reshape(emb[:, 0:(EMB_SIZE // 512), :], (emb.shape[0], EMB_SIZE))
                        attr_list.append(emb)
                    attrs = np.concatenate(attr_list, axis=1)

                    """ Create targets """
                    if mode == 'test':
                        targets = np.zeros((1, (CANDIDATES + 1)))
                    else:
                        _b1 = np.array(list(L_k == obj['item_bought']))
                        _b2 = np.array(list([str(dct_domain[k]) for k in L_k])) == dct_domain[obj['item_bought']]
                        targets = _b1.astype(np.float32) * 1.0  #+ _b2.astype(np.float32)*0.0
                        if np.sum(targets) == 0:
                            # The bought item is not among the candidates; the
                            # last slot acts as a "none of the above" class.
                            targets = np.zeros((1, (CANDIDATES + 1)))
                            targets[0, -1] = 1
                            if not full:
                                _end_overfit_avoid(L_k)
                                continue
                        else:
                            targets = np.array(targets) / np.sum(targets)
                            targets = np.concatenate([targets[None, :],
                                                      np.zeros((1, CANDIDATES + 1 - len(L_k)))],
                                                     axis=1)

                """ Pad the attribute sequence to SEQ_LEN and add it """
                if attrs.shape[0] < SEQ_LEN:
                    attrs = np.concatenate([np.zeros((SEQ_LEN - attrs.shape[0], attrs.shape[1])),
                                            attrs], axis=0)
                attrs = attrs[-SEQ_LEN:, :]
                attrs = attrs.astype(np.float32)
                _end_overfit_avoid(L_k)

                X.append(attrs[None, :])
                Y.append(targets)
                mask = np.concatenate([np.ones((len(L_k))),
                                       np.zeros((CANDIDATES + 1) - len(L_k))]).astype(np.float32)[None, :]
                MASK.append(mask)
                LKS.append(np.concatenate([L_k, -1 * np.ones(((CANDIDATES + 1) - len(L_k),))])[None, :])
                ACTUAL.append(np.array([obj['item_bought']])[None, :])

                if len(X) == batch_size:
                    X = np.concatenate(X, axis=0)
                    Y = np.concatenate(Y, axis=0)
                    MASK = np.concatenate(MASK, axis=0)
                    LKS = np.concatenate(np.array(LKS).astype(np.int32), axis=0)
                    ACTUAL = np.concatenate(np.array(ACTUAL).astype(np.int32), axis=0)
                    yield (X, MASK, LKS, ACTUAL), Y
                    X = []
                    Y = []
                    MASK = []
                    LKS = []
                    ACTUAL = []
        if full:
            # After the first full pass, stop filtering lines so every
            # remaining example is included on subsequent passes.
            check = (lambda i: True)
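# --- Hedged usage sketch --------------------------------------------------------
# Pull a single batch from the iterator above and inspect the shapes. This
# assumes the module-level constants (BATCH_SIZE, SEQ_LEN, CANDIDATES,
# ATTR_SIZE, EMB_SIZE) are configured as in the rest of the repo.
def peek_one_batch():
    (x_batch, mask, cand_ids, bought), y_batch = next(meli_iterator(mode='val'))
    print(x_batch.shape)   # (BATCH_SIZE, SEQ_LEN, n_features): padded view sequences
    print(mask.shape)      # (BATCH_SIZE, CANDIDATES + 1): 1 where a candidate exists
    print(cand_ids.shape)  # (BATCH_SIZE, CANDIDATES + 1): candidate item ids, -1 = padding
    print(bought.shape)    # (BATCH_SIZE, 1): ground-truth bought item ids
    print(y_batch.shape)   # (BATCH_SIZE, CANDIDATES + 1): target distribution (last slot = "none")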
def fit_RNN():
    import tensorflow as tf
    from tensorflow import keras
    import tf_geometric as tfg

    """ Fetch data """
    df = read_item_data()
    NUM_ITEMS = read_item_data().shape[0]
    NUM_FEATURES = 1
    counter, f_map_func, r_map_func = get_mappings()
    NUM_DOMS = pd.unique(df['domain_id']).shape[0]

    """ Load graph """
    G = ig.Graph.Read_Pickle(path.join(DATA_DIR, 'graph_item_to_item.pkl'))
    #weights = np.log(1 + np.array(G.es["weight"]))
    weights = np.array(G.es["weight"])
    indices = np.transpose(np.array([np.array(e.tuple) for e in G.es]))

    """ Create sparse matrix W """
    from scipy.sparse import coo_matrix
    import scipy.sparse
    row = indices[0, :]
    col = indices[1, :]
    W = coo_matrix((weights, (row, col)), shape=(NUM_ITEMS, NUM_ITEMS))

    """ Normalize rows """
    #W = deg_matrix(W, pwr=-1) @ W
    W = W.transpose()
    W = scipy.sparse.csr_matrix(W)
    assert scipy.sparse.issparse(W)

    @tf.function
    def smooth_labels(labels, factor=0.001):
        labels = tf.cast(labels, tf.float32)
        labels *= (1 - factor)
        labels += (factor / tf.cast(tf.shape(labels)[1], tf.float32))
        return labels

    @tf.function
    def compute_loss(labels, logits):
        # Same smoothed cross-entropy variant as in train_neural_domain_prediction.
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        losses = -tf.reduce_sum(logits * tf.math.log(labels), axis=1)
        return tf.reduce_mean(losses)

    @tf.function
    def evaluate(labels, logits):
        logits = tf.reshape(logits, (-1, NUM_ITEMS))
        labels = tf.reshape(labels, (-1, NUM_ITEMS))
        logits = smooth_labels(logits)
        labels = smooth_labels(labels)
        acc = tf.metrics.categorical_accuracy(labels, logits)
        return tf.reduce_mean(acc)

    """ Read data """
    from input.create_ratio import get_ratio
    ratio_df = get_ratio(which='item_id', full=True)
    ratio_df['popularity'] = 100.0 * ratio_df['bought'] + ratio_df['searched']
    dct_ratio_item_b = ratio_df['popularity'].to_dict()

    dct = df['title'].to_dict()
    dct_domain = df['domain_id'].to_dict()
    dct_cat = df['category_id'].to_dict()
    dct_price = df['price'].to_dict()

    """ Ratio stuff """
    category_df = get_ratio(which='category_id', full=True)
    domain_df = get_ratio(which='domain_id', full=True)
    feat_1, feat_2, feat_3 = (domain_df['searched'].to_dict(),
                              domain_df['bought'].to_dict(),
                              domain_df['rat'].to_dict())
    feat_1, feat_2, feat_3 = [[d[dct_domain[k]] for k in df.index]
                              for d in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = (category_df['searched'].to_dict(),
                                    category_df['bought'].to_dict(),
                                    category_df['rat'].to_dict())
    feat_1_1, feat_2_1, feat_3_1 = [[d[dct_cat[k]] for k in df.index]
                                    for d in [feat_1_1, feat_2_1, feat_3_1]]

    def standardize(x):
        # Min-max scaling to [0, 1].
        return (x - np.min(x)) / (np.max(x) + 1e-06 - np.min(x))

    feat_1, feat_2, feat_3 = [standardize(x) for x in [feat_1, feat_2, feat_3]]
    feat_1_1, feat_2_1, feat_3_1 = [standardize(x) for x in [feat_1_1, feat_2_1, feat_3_1]]

    del df
    del domain_df
    del category_df
    del G

    from nn.domain_string_identifier import load_model
    domain_prediction_model = load_model()

    def my_generator(mode='train'):
        if mode == 'train':
            check = lambda x: x <= np.round(413163 * 0.8).astype(np.int32)
        elif mode == 'val':
            check = lambda x: x > np.round(413163 * 0.8).astype(np.int32)
        else:
            check = lambda x: True
        DATA_PATH = path.join(DATA_DIR,
                              'test_dataset.jl' if mode == 'test' else 'train_dataset.jl')
        print("Reading....")
        X = np.zeros((NUM_ITEMS, 10)).astype(np.float32)
        with jsonlines.open(DATA_PATH) as reader:
            for line_i, obj in enumerate(reader):
                if not check(line_i):
                    continue
                L, S, C, IDS = [], [], [], []
                for h in obj['user_history']:
                    if h['event_type'] == 'view':
                        L.append(dct_domain[h['event_info']])
                        C.append(dct_cat[h['event_info']])
                        IDS.append(h['event_info'])
                    elif h['event_type'] == 'search':
                        S.append(h['event_info'])
                # NOTE: L holds domain ids at this point, so this membership
                # test never fires; the intent was probably `in IDS`.
                if obj['item_bought'] in L:
                    continue
                L = f_map_func['domain_id'](L)
                C = f_map_func['category_id'](C)
                IDS_map = f_map_func['item_id'](IDS)

                Y = np.zeros((NUM_ITEMS, 1)).astype(np.float32)
                # Most of the feature construction is disabled; kept verbatim
                # for reference:
                """
                X[:,0] = feat_1
                X[:,1] = feat_2
                X[:,2] = feat_3
                X[:,6] = feat_1_1
                X[:,7] = feat_2_1
                X[:,8] = feat_3_1
                #if len(S) > 0:
                #    X[:,8] = np.mean(predict_model(domain_prediction_model,S,return_numeric=True),axis=0)
                """
                target_id = f_map_func['item_id']([obj['item_bought']])[0]
                if not mode == 'test':
                    Y[target_id, 0] = 1.0
                """
                for i,k in enumerate(IDS_map):
                    X[k,3] += 1
                    X[k,4] += dct_ratio_item_b[IDS[i]]/len(C)
                    X[k,5] = dct_price[IDS[i]]
                #W[target_id,:] = (np.clip(np.array(W[target_id,:].todense())-1,a_min=0.0,a_max=None))
                X[:,9] = np.reshape(np.asarray(W @ X[:,3]),(-1,))
                X[:,9] = X[:,8] * X[:,2]
                #X[:,:8] = 0
                for i in range(10):
                    X[:,i] = (X[:,i] - np.min(X[:,i])) / (1e-06+ np.max(X[:,i]) - np.min(X[:,i]))
                """
                # NOTE: kept as in the original: the label written above is
                # zeroed again before the yield, which looks like leftover
                # experimentation rather than intended behavior.
                if not mode == 'test':
                    Y[target_id, 0] = 0.0
                #X = X - 0.5
                yield X, Y

    """ Optimize """
    BS = 2

    def batch_generator(mode, loop=True, batch_size=BS):
        BATCH_X = []
        BATCH_Y = []
        i = 0
        while True:
            for x, y in my_generator(mode):
                BATCH_X.append(x[None, :, :])
                BATCH_Y.append(y[None, :, :])
                i += 1
                if i % batch_size == 0:
                    yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                    BATCH_X = []
                    BATCH_Y = []
                    i = 0
            if loop == False:
                # Flush the (possibly partial) final batch.
                yield np.concatenate(BATCH_X, axis=0), np.concatenate(BATCH_Y, axis=0)
                break

    """ Define train_model """
    import tensorflow.keras as keras
    import tensorflow.keras.layers as layers
    inp_x = keras.Input((NUM_ITEMS, 10))
    x = layers.Dense(32, activation='relu')(inp_x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(1)(x)
    x = layers.Flatten()(x)
    x = layers.Softmax(axis=-1)(x)
    train_model = keras.Model(inputs=[inp_x], outputs=[x])
    print(train_model.summary())

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.5 * 1e-2, decay_steps=1000, decay_rate=0.9)  # defined but unused below
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.2 * 1e-2)
    train_model.compile(optimizer=optimizer, loss=compute_loss, metrics=[evaluate])

    from input.read_input import TRAIN_LINES
    train_model.fit_generator(batch_generator('train', True),
                              steps_per_epoch=TRAIN_LINES // BS,
                              epochs=1)
    ITEM_PATH = path.join(DATA_DIR, 'train_model', 'item_classifier.h5')
    train_model.save_weights(ITEM_PATH)

    def predict(mode):
        PREDS = []
        CONFS = []
        NUM_SELECT = 10
        batch_size = 1
        for batch_id, batch in enumerate(batch_generator(mode, batch_size=batch_size, loop=False)):
            x = batch[0]
            print("Predicting {} - Batch {}".format(mode, batch_id))
            pred = train_model.predict_on_batch(x)
            if batch_id == 0:
                print(pred)
            PREDS.append(tf.argsort(pred, axis=-1)[:, -NUM_SELECT:])
            CONFS.append(tf.sort(pred, axis=-1)[:, -NUM_SELECT:])
        PREDS = np.concatenate(PREDS, axis=0)
        CONFS = np.concatenate(CONFS, axis=0)
        #PREDS = np.concatenate([PREDS, CONFS], axis=1)
        cols = ['pred_{}'.format(k) for k in range(NUM_SELECT)]
        fname = os.path.join(DATA_DIR, 'item_pred_{}.csv'.format(mode))
        pd.DataFrame(PREDS, index=range(PREDS.shape[0]), columns=cols).to_csv(fname)

    predict('train')
    predict('val')
    predict('test')
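# --- Hedged sketch --------------------------------------------------------------
# Plain-scipy equivalent of the commented-out `deg_matrix(W, pwr=-1) @ W` row
# normalization that appears in both fit_RNN and train_neural_domain_prediction
# (deg_matrix itself is defined elsewhere in the repo).
def row_normalize(W):
    import scipy.sparse
    deg = np.asarray(W.sum(axis=1)).ravel()
    deg[deg == 0] = 1.0  # leave all-zero rows untouched instead of dividing by zero
    return scipy.sparse.diags(1.0 / deg) @ W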