def load_data():
    """Load the training csv, tokenize it, map tokens to ids, and attach
    hand-crafted features.

    Returns:
        The fully prepared DataFrame (features included, not yet split
        into X/y).
    """
    csv_path = config.origin_csv
    print('load data')
    # Pipeline: tokenize -> token ids -> append engineered features.
    frame = read_cut(csv_path)
    frame = data_2id(frame)
    # NOTE(review): other code in this file uses config.train_feats;
    # confirm train_featdires is the intended attribute here.
    frame = add_hum_feats(frame, config.train_featdires)
    return frame
def make_w2v(path, cut_char_level):
    """Train (or load) a Word2Vec model and build (vocab, embedding matrix).

    The returned embedding matrix has two extra rows: row 0 is a zero
    padding vector, and the last row is a random unknown-word vector.
    Word indices in the returned vocab are shifted by +1 accordingly.

    Args:
        path: where to save/load the gensim model; trains from scratch
            if the file does not exist.
        cut_char_level: True -> use char-level columns ('q_cut'/'a_cut'),
            False -> word-level columns ('q_cut_word'/'a_cut_word').

    Returns:
        (vocab, embed_weights) where vocab maps token -> row index and
        embed_weights is a float ndarray of shape (n_words + 2, dim).
    """
    if not os.path.exists(path):
        # Original author's note (translated): "I removed CutWord here myself."
        #data = pd.read_csv('../'+config.origin_csv)
        data = read_cut('../' + config.origin_csv)
        if cut_char_level:
            content = list(data['q_cut']) + list(data['a_cut'])
        else:
            content = list(data['q_cut_word']) + list(data['a_cut_word'])
        model = Word2Vec(
            content,
            # NOTE(review): 'size' is the gensim<4 kwarg (renamed
            # 'vector_size' in gensim 4) — this file targets gensim<4.
            size=config.w2v_vec_dim,
            window=5,
            min_count=5,
        )
        model.save(path)
    else:
        model = Word2Vec.load(path)
    # wv.syn0 / wv.vocab are deprecated gensim<4 accessors for the raw
    # vector matrix and the token->Vocab mapping.
    weights = model.wv.syn0
    # Shift every index by +1 so row 0 stays free for the padding vector.
    vocab = dict([(k, v.index + 1) for k, v in model.wv.vocab.items()])
    # Unknown-word token gets the last row (n_words + 1).
    vocab['<-UNKNOW->'] = len(vocab) + 1
    embed_weights = np.zeros(shape=(weights.shape[0] + 2, weights.shape[1]))
    # Rows 1..n_words hold the trained vectors.
    embed_weights[1:weights.shape[0] + 1] = weights
    unk_vec = np.random.random(size=weights.shape[1]) * 0.5
    # Multiplied by 0: the padding vector is all zeros.
    pading_vec = np.random.random(size=weights.shape[1]) * 0
    # Center the random UNK vector around zero.
    embed_weights[weights.shape[0] + 1] = unk_vec - unk_vec.mean()
    embed_weights[0] = pading_vec
    return vocab, embed_weights
def save_my_w2v(path):
    """Build a vocabulary from the tokenized corpus at *path*, map it onto
    the pre-trained zhihu word vectors, and persist vocab + weight matrix
    to the locations named in config.
    """
    corpus = read_cut(path)
    word_index = read_words(corpus)
    # topn=0 means "read every vector in the file".
    vectors, iw, wi, dim = read_vectors('data/pre_w2v/sgns.zhihu.word', 0)
    weight_matrix = load_pre_train_embeddings(word_index, vectors)
    np.save(config.word_embed_vocab, word_index)
    np.save(config.word_embed_weight, weight_matrix)
def load_data():
    """Load the training csv, tokenize it, map tokens to ids, attach
    engineered features, and split into model inputs and labels.

    Returns:
        (x_train, y_train) as produced by get_X_Y_from_df, honoring
        config.data_augment.
    """
    path = config.origin_csv
    print('load data')
    data = read_cut(path)  # cut words
    data = data_2id(data)  # tokens -> ids
    data = add_hum_feats(data, config.train_feats)  # generate & attach features
    x_train, y_train = get_X_Y_from_df(data, config.data_augment)
    # Fix: removed leftover debug `print(len(x_train[2]))` — it polluted
    # stdout and raised IndexError whenever fewer than three input arrays
    # were produced.
    return x_train, y_train
def submit_inteface(in_path, out_path, model_name, cv=False):
    """Predict labels for the test file at *in_path* with *model_name* and
    write one 1-based label per row (tab-separated, no header) to *out_path*.

    Args:
        in_path: path of the raw test csv.
        out_path: destination for the submission file.
        model_name: key understood by get_model().
        cv: if True use the model's cross-validation predictor,
            otherwise a single-model prediction.
    """
    frame = read_cut(in_path, config.test_data_cut_hdf)
    frame = data2id(frame)
    # Test rows carry no gold label; fill so downstream code can run.
    frame.label = frame.label.fillna(0)
    features, labels = get_X_Y_from_df(frame)
    batch = [features, labels]
    model = get_model(model_name)
    print('load model and predict')
    if cv:
        raw_pred = model.make_test_cv_data(batch)
    else:
        raw_pred = model.single_predict(batch)
    probs = np.squeeze(raw_pred)
    # Class ids are 1-based, argmax is 0-based.
    frame['label'] = np.argmax(probs, axis=1) + 1
    frame[['label']].to_csv(out_path, index=False, header=None, sep='\t')
def train(cv, model_name):
    """Train *model_name* on the shuffled training data.

    Args:
        cv: if True run 5-fold cross-validation training, otherwise a
            single train/dev split.
        model_name: key understood by get_model().
    """
    frame = read_cut(config.origin_csv, config.train_data_cut_hdf)
    frame = data2id(frame)
    # Deterministic full shuffle before any split.
    frame = frame.sample(frac=1, random_state=18)
    model = get_model(model_name)
    if not cv:
        train_part, dev_part = train_test(frame)
        x_tr, y_tr = get_X_Y_from_df(train_part)
        x_dev, y_dev = get_X_Y_from_df(dev_part)
        model.single_train([x_tr, y_tr, x_dev, y_dev])
    else:
        x_tr, y_tr = get_X_Y_from_df(frame)
        kfolds = 5
        model.make_train_cv_data([x_tr, y_tr], kfolds)
# NOTE(review): this flattened chunk mixes bare statements and defs. The
# statements directly below reference a pre-existing `data` and look like the
# tail of a save_feats0-style helper whose `def` line is outside this view —
# confirm their original indentation before relying on this layout.
data = magic1(data)
feats0_path = 'data/cache/feats/feats0_train.csv'
# Frequency-based "magic" feature columns produced by magic1().
feats0 = [
    'q1_freq', 'q2_freq', 'freq_mean', 'freq_cross', 'q1_freq_sq',
    'q2_freq_sq'
]
data[['id'] + feats0].to_csv(feats0_path, index=False)


def merge_feats():
    """Join the cached feats0 and feats1 training feature csvs on `id`."""
    df0 = pd.read_csv('data/cache/feats/feats0_train.csv')
    df1 = pd.read_csv('data/cache/feats/feats1_train.csv')
    df = pd.merge(df0, df1, on='id')
    return df


def merge_test(data):
    """Generate the full feature set for a test DataFrame and return only
    the columns listed in config.feats."""
    data = feats1_gen(data)
    data = extract_features(data)
    data = magic1(data)
    return data[config.feats]


if __name__ == '__main__':
    path = config.origin_csv
    data = read_cut(path)  # cut word
    #data = data_2id(data)  # 2id (disabled)
    save_feats0(data)
    save_feats1(data)