def save_smat(features, ft_pt):
    """
    save features to disk in SMAT format
    :param features: the matrix of features
    :param ft_pt: features file path
    :return: none
    """
    (row_num, col_num) = features.shape
    data = features.data
    indice = features.indices
    indptr = features.indptr
    f = open(ft_pt, 'w')
    f.write("%d %d\n" % (row_num, col_num))
    ind_indptr = 1
    begin_line = True
    for ind_data in range(len(data)):
        while ind_data == indptr[ind_indptr]:
            f.write('\n')
            begin_line = True
            ind_indptr += 1
        if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
            continue
        if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
            f.write(' ')
        f.write("%d:%s" % (indice[ind_data], data[ind_data]))
        begin_line = False
    while ind_indptr < len(indptr):
        f.write("\n")
        ind_indptr += 1
    LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
    f.close()
def load_all(feature_pt, feature_names, rawset_name, will_save=False):
    index_begin = 0
    features = None
    for index in reversed(range(1, len(feature_names))):
        f_names_s = '|'.join(feature_names[0:index + 1]) + '|' + rawset_name
        f_names_md5 = hashlib.md5(f_names_s).hexdigest()
        if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
            index_begin = index
            features = Feature.load('%s/md5_%s.smat' % (feature_pt, f_names_md5))
            break
    LogUtil.log('INFO', 'load %s features [%s, %s)' % (rawset_name, feature_names[0], feature_names[index_begin]))
    if 1 > index_begin:
        features = Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
    for index in range(index_begin + 1, len(feature_names)):
        features = Feature.merge_col(features,
                                     Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[index], rawset_name)))
    features = features.tocsr()
    if will_save and (index_begin < len(feature_names) - 1):
        f_names_s = '|'.join(feature_names) + '|' + rawset_name
        f_names_md5 = hashlib.md5(f_names_s).hexdigest()
        Feature.save_npz(features, '%s/md5_%s.smat' % (feature_pt, f_names_md5))
    return features
def load_smat(ft_fp):
    """
    load features from disk, the format:
    row_num col_num
    f1_index:f1_value f2_index:f2_value ...
    """
    data = []
    indice = []
    indptr = [0]
    f = open(ft_fp)
    [row_num, col_num] = [int(num) for num in f.readline().strip().split()]
    for line in f:
        line = line.strip()
        subs = line.split()
        for sub in subs:
            [f_index, f_value] = sub.split(":")
            f_index = int(f_index)
            f_value = float(f_value)
            data.append(f_value)
            indice.append(f_index)
        indptr.append(len(data))
    f.close()
    features = csr_matrix((data, indice, indptr), shape=(row_num, col_num), dtype=float)
    LogUtil.log("INFO", "load smat feature file done (%s)" % ft_fp)
    return features
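# Illustrative sketch (not part of the pipeline): the SMAT text format written by
# save_smat and read by load_smat is a header line "row_num col_num" followed by one
# line per matrix row of space-separated "col_index:value" entries, e.g. for a 2x4 matrix:
#
#   2 4
#   0:1.5 3:2.0
#   2:0.25
#
# A minimal round trip, assuming scipy is available and the helpers are exposed as
# static methods of the Feature class (the path below is made up for the example):
#
#   from scipy.sparse import csr_matrix
#   m = csr_matrix([[1.5, 0., 0., 2.0], [0., 0., 0.25, 0.]])
#   Feature.save_smat(m, '/tmp/example.smat')
#   m2 = Feature.load_smat('/tmp/example.smat')
#   assert (m != m2).nnz == 0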
def load_npz(ft_fp):
    loader = np.load('%s.npz' % ft_fp)
    features = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
    LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
    return features
def init_powerful_word_dside(pword, thresh_num, thresh_rate):
    pword_dside = []
    pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword)
    pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
    pword_dside.extend(map(lambda x: x[0], filter(lambda x: x[1][6] >= thresh_rate, pword_sort)))
    LogUtil.log('INFO', 'Double side power words(%d): %s' % (len(pword_dside), str(pword_dside)))
    return pword_dside
def __generate_index(self, row_num):
    # use a separate list per fold; `[list()] * n` would alias the same list n times
    index_all = [[] for _ in range(self.cv_num)]
    for i in range(row_num):
        index_all[int(random.random() * self.cv_num)].append(i)
    for i in range(self.cv_num):
        LogUtil.log('INFO', 'generate cv index, size(part%d)=%d' % (i, len(index_all[i])))
    index_pt = self.config.get('DEFAULT', 'index_pt')
    for i in range(self.cv_num):
        fold_id = i
        # train
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, self.config.get('MODEL', 'offline_rawset_name'))
        DataUtil.save_vector(fp, list(), 'w')
        for j in range(self.cv_num - 2):
            part_id = (i + j) % self.cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, self.config.get('MODEL', 'offline_rawset_name'))
        part_id = (fold_id + self.cv_num - 2) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
            index_pt, self.cv_tag, self.cv_num, fold_id, self.config.get('MODEL', 'offline_rawset_name'))
        part_id = (fold_id + self.cv_num - 1) % self.cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
def extract(self, data_set_name, part_num=1, part_id=0):
    """
    Extract the feature from original data set
    :param data_set_name: name of data set
    :param part_num: number of partitions of data
    :param part_id: partition ID which will be extracted
    :return:
    """
    # load data set from disk
    data = pd.read_csv('%s/%s.csv' % (self.config.get('DEFAULT', 'source_pt'), data_set_name)).fillna(value="")
    begin_id = int(1. * len(data) / part_num * part_id)
    end_id = int(1. * len(data) / part_num * (part_id + 1))
    # set feature file path
    feature_pt = self.config.get('DEFAULT', 'feature_pt')
    if 1 == part_num:
        self.data_feature_fp = '%s/%s.%s.smat' % (feature_pt, self.feature_name, data_set_name)
    else:
        self.data_feature_fp = '%s/%s.%s.smat.%03d_%03d' % (feature_pt, self.feature_name, data_set_name, part_num, part_id)
    feature_file = open(self.data_feature_fp, 'w')
    feature_file.write('%d %d\n' % (end_id - begin_id, int(self.get_feature_num())))
    # extract feature
    for index, row in data[begin_id:end_id].iterrows():
        feature = self.extract_row(row)
        Feature.save_feature(feature, feature_file)
    feature_file.close()
    LogUtil.log('INFO', 'save features (%s, %s, %d, %d) done' % (self.feature_name, data_set_name, part_num, part_id))
def init_powerful_word_oside(pword, thresh_num, thresh_rate):
    pword_oside = []
    pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword)
    pword_oside.extend(map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate, pword)))
    LogUtil.log('INFO', 'One side power words(%d): %s' % (len(pword_oside), str(pword_oside)))
    return pword_oside
def __unlock(self):
    lock_name = self.config.get('MODEL', 'lock_name')
    lock_pt = self.config.get('MODEL', 'lock_pt')
    lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
    if isfile(lock_fp):
        os.remove(lock_fp)
        LogUtil.log('INFO', 'delete lock, lock_name=%s' % lock_name)
    else:
        LogUtil.log('WARNING', 'missing lock, lock_name=%s' % lock_name)
def get_labels(df):
    """
    Get labels of data set
    :param df: original data set
    :return: label list of data set
    """
    labels = df['is_duplicate'].tolist()
    LogUtil.log("INFO", "num(1)=%d, num(0)=%d" % (sum(labels), len(labels) - sum(labels)))
    return labels
def generate_powerful_word(data, subset_indexs):
    """
    Compute the influence ("power") of each word in the data. Each word maps to:
    [0. number of question pairs containing the word,
     1. ratio of pairs containing the word,
     2. ratio of correctly labeled pairs,
     3. ratio of one-side pairs (word in only one question),
     4. correct ratio among one-side pairs,
     5. ratio of two-side pairs (word in both questions),
     6. correct ratio among two-side pairs]
    """
    words_power = {}
    train_subset_data = data.iloc[subset_indexs, :]
    for index, row in train_subset_data.iterrows():
        label = int(row['is_duplicate'])
        q1_words = str(row['question1']).lower().split()
        q2_words = str(row['question2']).lower().split()
        all_words = set(q1_words + q2_words)
        q1_words = set(q1_words)
        q2_words = set(q2_words)
        for word in all_words:
            if word not in words_power:
                words_power[word] = [0. for i in range(7)]
            # count pairs containing the word
            words_power[word][0] += 1.
            words_power[word][1] += 1.
            if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)):
                # count one-side pairs
                words_power[word][3] += 1.
                if 0 == label:
                    # count correct pairs
                    words_power[word][2] += 1.
                    # count correct one-side pairs
                    words_power[word][4] += 1.
            if (word in q1_words) and (word in q2_words):
                # count two-side pairs
                words_power[word][5] += 1.
                if 1 == label:
                    # count correct pairs
                    words_power[word][2] += 1.
                    # count correct two-side pairs
                    words_power[word][6] += 1.
    for word in words_power:
        # ratio of pairs containing the word
        words_power[word][1] /= len(subset_indexs)
        # ratio of correct pairs
        words_power[word][2] /= words_power[word][0]
        # correct ratio among one-side pairs
        if words_power[word][3] > 1e-6:
            words_power[word][4] /= words_power[word][3]
        # ratio of one-side pairs
        words_power[word][3] /= words_power[word][0]
        # correct ratio among two-side pairs
        if words_power[word][5] > 1e-6:
            words_power[word][6] /= words_power[word][5]
        # ratio of two-side pairs
        words_power[word][5] /= words_power[word][0]
    sorted_words_power = sorted(words_power.iteritems(), key=lambda d: d[1][0], reverse=True)
    LogUtil.log("INFO", "power words calculation done, len(words_power)=%d" % len(sorted_words_power))
    return sorted_words_power
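# Hedged usage sketch: the sorted list returned by generate_powerful_word feeds the two
# selectors above. Each element is (word, stats), where stats[0] is the pair count,
# stats[4] the one-side correct ratio and stats[6] the two-side correct ratio, so a call
# like the one below keeps only frequent, highly predictive words. The threshold values
# are illustrative, not taken from the project configuration:
#
#   words_power = generate_powerful_word(train_data, train_subset_indexs)
#   pword_dside = init_powerful_word_dside(words_power, 500, 0.9)
#   pword_oside = init_powerful_word_oside(words_power, 500, 0.9)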
def merge_col(features_1, features_2):
    """
    merge two feature matrices that were split by column
    :param features_1: the first part of features
    :param features_2: the second part of features
    :return: feature matrix
    """
    features = hstack([features_1, features_2])
    (row_num, col_num) = features.shape
    LogUtil.log("INFO", "merge col done, shape=(%d,%d)" % (row_num, col_num))
    return features
def extract(self):
    version = self.config.get('INFO', 'version')
    # cv_num is used as an integer (range, %d formatting), so read it with getint
    cv_num = self.config.getint('INFO', 'cv_num')
    offline_rawset_name = self.config.get('MODEL', 'offline_rawset_name')
    index_fp = self.config.get('DIRECTORY', 'feature_pt')
    feature_name = '%s_%s' % (self.__class__.__name__, version)
    # load prediction of offline tests
    offline_test_pred_all_fp = '%s/pred/cv_n%d_test.%s.pred' % (
        self.config.get('DIRECTORY', 'out_pt'), cv_num, offline_rawset_name)
    offline_test_pred_all_origin = PostProcessor.read_result_list(offline_test_pred_all_fp)
    offline_test_pred_all = [0] * len(offline_test_pred_all_origin)
    # load index of offline tests
    offline_test_index_all = list()
    for fold_id in range(cv_num):
        offline_test_indexs_fp = '%s/cv_n%d_f%d_test.%s.index' % (index_fp, cv_num, fold_id, offline_rawset_name)
        offline_test_indexs = Feature.load_index(offline_test_indexs_fp)
        offline_test_index_all.extend(offline_test_indexs)
    for index in range(len(offline_test_pred_all)):
        offline_test_pred_all[offline_test_index_all[index]] = offline_test_pred_all_origin[index]
    # load prediction of online data set
    online_preds = list()
    for fold_id in range(cv_num):
        online_pred_fp = '%s/cv_n%d_f%d_online.%s.pred' % (
            self.config.get('DIRECTORY', 'pred_pt'), cv_num, fold_id,
            self.config.get('MODEL', 'online_test_rawset_name'))
        online_pred_one = PostProcessor.read_result_list(online_pred_fp)
        online_preds.append(online_pred_one)
    # sample for online prediction
    online_pred = []
    for i in range(len(online_preds[0])):
        cv_id = int(random.random() * cv_num)
        online_pred.append(online_preds[cv_id][i])
    offline_pred = [[fv] for fv in offline_test_pred_all]
    online_pred = [[fv] for fv in online_pred]
    # directory of features
    feature_pt = self.config.get('DIRECTORY', 'feature_pt')
    train_feature_fp = '%s/%s.train.smat' % (feature_pt, feature_name)
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    train_features = sparse.csr_matrix(np.array(offline_pred))
    Feature.save_smat(train_features, train_feature_fp)
    LogUtil.log('INFO', 'save train features (%s) done' % feature_name)
    test_features = sparse.csr_matrix(np.array(online_pred))
    Feature.save_smat(test_features, test_feature_fp)
    LogUtil.log('INFO', 'save test features (%s) done' % feature_name)
def stat_dul_question(df):
    """
    Report statistics on duplicated questions
    :param df: original data set
    :return: none
    """
    questions = df['question1'].tolist() + df['question2'].tolist()
    len_questions = len(questions)
    len_uniq_questions = len(set(questions))
    LogUtil.log("INFO", "len(questions)=%d, len(unique_questions)=%d, rate=%f" % (
        len_questions, len_uniq_questions, 1.0 * len_uniq_questions / len_questions))
def init_tfidf(self):
    train_data = pd.read_csv('%s/train.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]
    test_data = pd.read_csv('%s/test.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    tfidf_txt = pd.Series(train_data['question1'].tolist() + train_data['question2'].tolist() +
                          test_data['question1'].tolist() + test_data['question2'].tolist()).astype(str)
    tfidf.fit_transform(tfidf_txt)
    LogUtil.log("INFO", "init tfidf done")
    return tfidf
def __lock(self):
    lock_name = self.config.get('MODEL', 'lock_name')
    lock_time = self.config.getint('MODEL', 'lock_time')
    lock_pt = self.config.get('MODEL', 'lock_pt')
    if '' != lock_name:
        lock_fp = '%s/%s.lock' % (lock_pt, lock_name)
        while isfile(lock_fp):
            LogUtil.log('INFO', 'model is running, lock_name=%s, waiting %d ...' % (lock_name, lock_time))
            time.sleep(lock_time)
        f = open(lock_fp, 'w')
        f.close()
        LogUtil.log('INFO', 'generate lock, lock_name=%s' % lock_name)
def generate_graph_clique(G):
    n2clique = {}
    cliques = []
    for clique in nx.find_cliques(G):
        for n in clique:
            if n not in n2clique:
                n2clique[n] = []
            n2clique[n].append(len(cliques))
        cliques.append(clique)
    LogUtil.log('INFO', 'init graph cliques done, len(cliques)=%d' % len(cliques))
    return n2clique, cliques
def generate_graph_cc(G):
    n2cc = {}
    ccs = []
    for cc in nx.connected_components(G):
        for n in cc:
            if n in n2cc:
                LogUtil.log('WARNING', '%d already in n2cc(=%d)' % (n, n2cc[n]))
            n2cc[n] = len(ccs)
        ccs.append(cc)
    LogUtil.log('INFO', 'init graph cc done, len(ccs)=%d' % len(ccs))
    return n2cc, ccs
def generate_idf(data_fp):
    # read the question corpus as a DataFrame (iterrows/len below require pandas, not csv.reader)
    data = pd.read_csv(data_fp)
    idf = {}
    for index, row in data.iterrows():
        question = str(row['question'])
        words = question.lower().strip().split() if WordEmbedding.to_lower else question.strip().split()
        for word in words:
            idf[word] = idf.get(word, 0) + 1
    num_docs = len(data)
    for word in idf:
        idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
    LogUtil.log("INFO", "IDF calculation done, len(idf)=%d" % len(idf))
    return idf
def get_qid2question(df):
    """
    Get map(qid, question)
    :param df: original data set
    :return: map(qid, question)
    """
    qid2question = {}
    qids = df['qid1'].tolist() + df['qid2'].tolist()
    questions = df['question1'].tolist() + df['question2'].tolist()
    for ind in range(len(qids)):
        qid2question[qids[ind]] = questions[ind]
    LogUtil.log("INFO", "len(qids)=%d, len(unique_qids)=%d" % (len(qids), len(qid2question)))
    return qid2question
def save_npz(features, ft_fp):
    """
    save features to disk in binary (npz) format
    :param features: the matrix of features
    :param ft_fp: features file path
    :return: none
    """
    np.savez(ft_fp, data=features.data, indices=features.indices, indptr=features.indptr, shape=features.shape)
    LogUtil.log('INFO', 'save npz feature file done (%s)' % ft_fp)
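# Hedged round-trip sketch for the binary format: save_npz writes data/indices/indptr/shape
# with np.savez (which appends ".npz" to the path), and load_npz rebuilds the csr_matrix
# from that file. The path below is made up for the example:
#
#   Feature.save_npz(features, '/tmp/example_features.smat')
#   features_again = Feature.load_npz('/tmp/example_features.smat')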
def merge_file(feature_pt, feature_name, data_set_name, part_num):
    features = None
    for part_id in range(part_num):
        features_part_fp = '%s/%s.%s.smat.%03d_%03d' % (feature_pt, feature_name, data_set_name, part_num, part_id)
        features_part = Feature.load(features_part_fp)
        if features is None:
            features = features_part
        else:
            features = Feature.merge_row(features, features_part)
    features_fp = '%s/%s.%s.smat' % (feature_pt, feature_name, data_set_name)
    Feature.save_smat(features, features_fp)
    LogUtil.log('INFO', 'merge features (%s, %s, %d) done' % (feature_name, data_set_name, part_num))
def init_tfidf(self):
    train_data = pd.read_csv('%s/train.csv' % self.config.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    test_data = pd.read_csv('%s/test.csv' % self.config.get('DEFAULT', 'origin_pt')).fillna(value="")  # [:100]
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    tfidf_txt = pd.Series(train_data['question1'].tolist() + train_data['question2'].tolist() +
                          test_data['question1'].tolist() + test_data['question2'].tolist()).astype(str)
    tfidf.fit_transform(tfidf_txt)
    LogUtil.log("INFO", "init tfidf done")
    return tfidf
def generate_cv_subset_index(cf, argv):
    """
    Generate index files used for 5-fold cross validation
    :param cf: configuration file
    :param argv: parameter list
    :return: none
    """
    tag = argv[0]
    cv_num = 5
    cv_rawset_name = 'train_with_swap'
    train_data_size = 404290
    index_all = []
    for i in range(cv_num):
        index_all.append([])
    for i in range(train_data_size):
        index_all[int(random.random() * cv_num)].append(i)
    for i in range(cv_num):
        LogUtil.log('INFO', 'size(part%d)=%d' % (i, len(index_all[i])))
    index_fp = cf.get('DEFAULT', 'feature_index_pt')
    for i in range(cv_num):
        fold_id = i
        # train
        fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        DataUtil.save_vector(fp, list(), 'w')  # truncate the file before appending train indices
        for j in range(cv_num - 2):
            part_id = (i + j) % cv_num
            DataUtil.save_vector(fp, index_all[part_id], 'a')
        for j in range(cv_num - 2):
            part_id = (i + j) % cv_num
            DataUtil.save_vector(fp, [index + train_data_size for index in index_all[part_id]], 'a')
        # valid
        fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 2) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
        # test
        fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (index_fp, tag, cv_num, fold_id, cv_rawset_name)
        part_id = (fold_id + cv_num - 1) % cv_num
        DataUtil.save_vector(fp, index_all[part_id], 'w')
def __init_out_dir(self):
    # generate output tag
    self.out_tag = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(time.time()))
    # record the output tag in the config (option name 'out_tag' assumed; set() needs section, option, value)
    self.config.set('DIRECTORY', 'out_tag', str(self.out_tag))
    # generate output directory
    out_pt = self.config.get('DIRECTORY', 'out_pt')
    out_pt_exists = os.path.exists(out_pt)
    if out_pt_exists:
        LogUtil.log("ERROR", 'out path (%s) already exists' % out_pt)
        return
    else:
        os.mkdir(out_pt)
        os.mkdir(self.config.get('DIRECTORY', 'pred_pt'))
        os.mkdir(self.config.get('DIRECTORY', 'model_pt'))
        os.mkdir(self.config.get('DIRECTORY', 'fault_pt'))
        os.mkdir(self.config.get('DIRECTORY', 'conf_pt'))
        os.mkdir(self.config.get('DIRECTORY', 'score_pt'))
        LogUtil.log('INFO', 'out path (%s) created' % out_pt)
    # save config
    self.config.write(open(self.config.get('DIRECTORY', 'conf_pt') + 'featwheel.conf', 'w'))
def balance_index(indexs, labels, positive_rate):
    """
    balance indexes to adjust the positive rate
    :param indexs: index vector used to sample the raw data set
    :param labels: label vector of raw data set
    :param positive_rate: target positive rate
    :return: index vector after balancing
    """
    if positive_rate < 1e-6 or positive_rate > 1. - 1e-6:
        return indexs
    pos_indexs = [index for index in indexs if labels[index] == 1.]
    neg_indexs = [index for index in indexs if labels[index] == 0.]
    origin_rate = 1.0 * len(pos_indexs) / len(indexs)
    LogUtil.log("INFO", "original: len(pos)=%d, len(neg)=%d, rate=%.2f%%" % (
        len(pos_indexs), len(neg_indexs), 100.0 * origin_rate))
    if origin_rate < positive_rate:
        pos_indexs, neg_indexs = neg_indexs, pos_indexs
        origin_rate = 1.0 - origin_rate
        positive_rate = 1.0 - positive_rate
        LogUtil.log("INFO", "increase positive instances ...")
    else:
        LogUtil.log("INFO", "increase negative instances ...")
    k = (1. - positive_rate) * origin_rate / positive_rate / (1 - origin_rate)
    LogUtil.log("INFO", "k=%.4f" % k)
    balance_indexs = pos_indexs
    while k > 1e-6:
        if k > 1. - 1e-6:
            balance_indexs.extend(neg_indexs)
        else:
            balance_indexs.extend(random.sample(neg_indexs, int(k * len(neg_indexs))))
        k -= 1.
    pos_indexs = [index for index in balance_indexs if labels[index] == 1.]
    neg_indexs = [index for index in balance_indexs if labels[index] == 0.]
    balanced_rate = 1.0 * len(pos_indexs) / len(balance_indexs)
    LogUtil.log("INFO", "balanced: len(pos)=%d, len(neg)=%d, rate=%.2f%%" % (
        len(pos_indexs), len(neg_indexs), 100.0 * balanced_rate))
    return balance_indexs
def init_idf(data):
    idf = {}
    q_set = set()
    for index, row in data.iterrows():
        q1 = str(row['question1'])
        q2 = str(row['question2'])
        if q1 not in q_set:
            q_set.add(q1)
            words = q1.lower().split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
        if q2 not in q_set:
            q_set.add(q2)
            words = q2.lower().split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
    num_docs = len(data)
    for word in idf:
        idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
    LogUtil.log("INFO", "idf calculation done, len(idf)=%d" % len(idf))
    return idf
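# Worked example of the IDF formula used above (numbers are illustrative): with
# num_docs = 8 rows and a word counted in 3 distinct questions, the weight is
# log(8 / (3 + 1)) / log(2) = log2(2) = 1.0; rarer words receive larger weights.
#
#   import math
#   assert abs(math.log(8 / (3 + 1.)) / math.log(2.) - 1.0) < 1e-9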
def run_offline(self):
    LogUtil.log('INFO', 'cv_tag(%s)' % self.cv_tag)
    # load feature matrix
    offline_features = Feature.load_all(self.config.get('DIRECTORY', 'feature_pt'),
                                        self.config.get('FEATURE', 'feature_selected').split(),
                                        self.config.get('MODEL', 'offline_rawset_name'),
                                        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector('%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                                                           self.config.get('MODEL', 'offline_rawset_name')), True)
    # generate index file
    if '' == self.cv_tag:
        self.cv_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])
    # cross validation
    offline_valid_preds_all = [0.] * offline_features.shape[0]
    offline_test_preds_all = [0.] * offline_features.shape[0]
    for fold_id in range(self.cv_num):
        LogUtil.log('INFO', 'cross validation fold_id(%d) begin' % fold_id)
        # generate training data set
        offline_train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
        offline_train_indexs_fp = '%s/cv_tag%s_n%d_f%d_train.%s.index' % (
            self.config.get('DIRECTORY', 'index_pt'), self.cv_tag, self.cv_num, fold_id,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_train_indexs = DataUtil.load_vector(offline_train_indexs_fp, 'int')
        offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
            CrossValidation.__generate_data(offline_train_indexs, offline_labels, offline_features, offline_train_pos_rate)
        LogUtil.log('INFO', 'offline train data generation done')
        # generate validation data set
        offline_valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
        offline_valid_indexs_fp = '%s/cv_tag%s_n%d_f%d_valid.%s.index' % (
            self.config.get('DIRECTORY', 'index_pt'), self.cv_tag, self.cv_num, fold_id,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_valid_indexs = DataUtil.load_vector(offline_valid_indexs_fp, 'int')
        offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
            CrossValidation.__generate_data(offline_valid_indexs, offline_labels, offline_features, offline_valid_pos_rate)
        LogUtil.log('INFO', 'offline valid data generation done')
        # generate test data set
        offline_test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))
        offline_test_indexs_fp = '%s/cv_tag%s_n%d_f%d_test.%s.index' % (
            self.config.get('DIRECTORY', 'index_pt'), self.cv_tag, self.cv_num, fold_id,
            self.config.get('MODEL', 'offline_rawset_name'))
        offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp, 'int')
        offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
            CrossValidation.__generate_data(offline_test_indexs, offline_labels, offline_features, offline_test_pos_rate)
        LogUtil.log('INFO', 'offline test data generation done')
        model = Model.new(self.config.get('MODEL', 'model_name'), self.config)
        model_fp = self.config.get('DIRECTORY', 'model_pt') + '/cv_n%d_f%d.%s.model' % \
            (self.cv_num, fold_id, self.config.get('MODEL', 'model_name'))
        model.save(model_fp)
        offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
            offline_train_features, offline_train_labels,
            offline_valid_features, offline_valid_labels,
            offline_test_features, offline_test_labels)
        offline_train_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                 offline_train_labels, offline_train_preds)
        offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                 offline_valid_labels, offline_valid_preds)
        offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                                offline_test_labels, offline_test_preds)
        score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
        score_file = open(score_fp, 'a')
        score_file.write('fold:%d\ttrain:%s\tvalid:%s\ttest:%s\n' % (
            fold_id, offline_train_score, offline_valid_score, offline_test_score))
        score_file.close()
        # merge prediction results
        for index in range(len(offline_valid_balanced_indexs)):
            offline_valid_preds_all[offline_valid_balanced_indexs[index]] = offline_valid_preds[index]
        for index in range(len(offline_test_balanced_indexs)):
            offline_test_preds_all[offline_test_balanced_indexs[index]] = offline_test_preds[index]
        LogUtil.log('INFO', 'cross validation fold_id(%d) done' % fold_id)
    # save prediction results
    offline_valid_preds_all_fp = '%s/cv_n%d_valid.%s.pred' % (
        self.config.get('DIRECTORY', 'pred_pt'), self.cv_num, self.config.get('MODEL', 'offline_rawset_name'))
    DataUtil.save_vector(offline_valid_preds_all_fp, offline_valid_preds_all, 'w')
    offline_test_preds_all_fp = '%s/cv_n%d_test.%s.pred' % (
        self.config.get('DIRECTORY', 'pred_pt'), self.cv_num, self.config.get('MODEL', 'offline_rawset_name'))
    DataUtil.save_vector(offline_test_preds_all_fp, offline_test_preds_all, 'w')
    # evaluate
    offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                             offline_labels, offline_valid_preds_all)
    offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                            offline_labels, offline_test_preds_all)
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    score_file = open(score_fp, 'a')
    score_file.write('cross_validation\tvalid:%s\ttest:%s\n' % (offline_valid_score, offline_test_score))
    score_file.close()
def sample_col(features, indexs):
    features_sampled = features[:, indexs]
    (row_num, col_num) = features_sampled.shape
    LogUtil.log("INFO", "col sample done, shape=(%d,%d)" % (row_num, col_num))
    return features_sampled
def run_offline(self):
    # load feature matrix
    offline_features = Feature.load_all(self.config.get('DIRECTORY', 'feature_pt'),
                                        self.config.get('FEATURE', 'feature_selected').split(),
                                        self.config.get('MODEL', 'offline_rawset_name'),
                                        self.config.get('FEATURE', 'will_save'))
    # load labels
    offline_labels = DataUtil.load_vector('%s/%s.label' % (self.config.get('DIRECTORY', 'label_pt'),
                                                           self.config.get('MODEL', 'offline_rawset_name')), True)
    # generate index file
    if '' == self.se_tag:
        self.se_tag = self.out_tag
        self.__generate_index(offline_features.shape[0])
    index_pt = self.config.get('DIRECTORY', 'index_pt')
    # generate training data set
    offline_train_pos_rate = float(self.config.get('MODEL', 'train_pos_rate'))
    offline_train_indexs_fp = '%s/se_tag%s_train.%s.index' % (
        index_pt, self.se_tag, self.config.get('MODEL', 'offline_rawset_name'))
    offline_train_indexs = DataUtil.load_vector(offline_train_indexs_fp, 'int')
    offline_train_features, offline_train_labels, offline_train_balanced_indexs = \
        SingleExec.__generate_data(offline_train_indexs, offline_labels, offline_features, offline_train_pos_rate)
    LogUtil.log('INFO', 'offline train data generation done')
    # generate validation data set
    offline_valid_pos_rate = float(self.config.get('MODEL', 'valid_pos_rate'))
    offline_valid_indexs_fp = '%s/se_tag%s_valid.%s.index' % (
        index_pt, self.se_tag, self.config.get('MODEL', 'offline_rawset_name'))
    offline_valid_indexs = DataUtil.load_vector(offline_valid_indexs_fp, 'int')
    offline_valid_features, offline_valid_labels, offline_valid_balanced_indexs = \
        SingleExec.__generate_data(offline_valid_indexs, offline_labels, offline_features, offline_valid_pos_rate)
    LogUtil.log('INFO', 'offline valid data generation done')
    # generate test data set
    offline_test_pos_rate = float(self.config.get('MODEL', 'test_pos_rate'))
    offline_test_indexs_fp = '%s/se_tag%s_test.%s.index' % (
        index_pt, self.se_tag, self.config.get('MODEL', 'offline_rawset_name'))
    offline_test_indexs = DataUtil.load_vector(offline_test_indexs_fp, 'int')
    offline_test_features, offline_test_labels, offline_test_balanced_indexs = \
        SingleExec.__generate_data(offline_test_indexs, offline_labels, offline_features, offline_test_pos_rate)
    LogUtil.log('INFO', 'offline test data generation done')
    model = Model.new(self.config.get('MODEL', 'model_name'), self.config)
    model_fp = self.config.get('DIRECTORY', 'model_pt') + '/se.%s.model' % self.config.get('MODEL', 'model_name')
    model.save(model_fp)
    offline_train_preds, offline_valid_preds, offline_test_preds = model.fit(
        offline_train_features, offline_train_labels,
        offline_valid_features, offline_valid_labels,
        offline_test_features, offline_test_labels)
    offline_train_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                             offline_train_labels, offline_train_preds)
    offline_valid_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                             offline_valid_labels, offline_valid_preds)
    offline_test_score = Evaluator.evaluate(self.config.get('MODEL', 'evaluator_name'),
                                            offline_test_labels, offline_test_preds)
    score_fp = '%s/%s.score' % (self.config.get('DIRECTORY', 'score_pt'), 'cv')
    score_file = open(score_fp, 'a')
    score_file.write('single_exec\ttrain:%s\tvalid:%s\ttest:%s\n' % (
        offline_train_score, offline_valid_score, offline_test_score))
    score_file.close()
    # save prediction results
    offline_valid_preds_fp = '%s/se_valid.%s.pred' % (
        self.config.get('DIRECTORY', 'pred_pt'), self.config.get('MODEL', 'offline_rawset_name'))
    DataUtil.save_vector(offline_valid_preds_fp, offline_valid_preds, 'w')
    offline_test_preds_fp = '%s/se_test.%s.pred' % (
        self.config.get('DIRECTORY', 'pred_pt'), self.config.get('MODEL', 'offline_rawset_name'))
    DataUtil.save_vector(offline_test_preds_fp, offline_test_preds, 'w')
def sample_row(features, indexs):
    features_sampled = features[indexs, :]
    (row_num, col_num) = features_sampled.shape
    LogUtil.log("INFO", "row sample done, shape=(%d,%d)" % (row_num, col_num))
    return features_sampled
def generate_pagerank(G, alpha, max_iter):
    pr = nx.pagerank(G, alpha=alpha, max_iter=max_iter)
    LogUtil.log('INFO', 'Graph cal pagerank done')
    return pr
def generate_graph(config, weight_feature_name, weight_feature_id, reverse):
    q2id = {}
    e2weight = {}
    G = nx.Graph()
    train_wfs_fs = None
    test_wfs_fs = None
    if weight_feature_name is not None:
        train_wfs_fs = Feature.load('%s/%s.train.smat' % (
            config.get('DIRECTORY', 'feature_question_pair_pt'), weight_feature_name)).toarray()
        test_wfs_fs = Feature.load('%s/%s.test.smat' % (
            config.get('DIRECTORY', 'feature_question_pair_pt'), weight_feature_name)).toarray()
        if 'True' == reverse:
            LogUtil.log('INFO', 'will reverse')
            for index in range(len(train_wfs_fs)):
                train_wfs_fs[index][weight_feature_id] = 1. - train_wfs_fs[index][weight_feature_id]
            for index in range(len(test_wfs_fs)):
                test_wfs_fs[index][weight_feature_id] = 1. - test_wfs_fs[index][weight_feature_id]
    fin = csv.reader(open('%s/train.csv' % config.get('DIRECTORY', 'origin_pt')))
    fin.next()
    index = 0
    for p in fin:
        q1 = str(p[3]).strip()
        q2 = str(p[4]).strip()
        weight = 0 if train_wfs_fs is None else train_wfs_fs[index][weight_feature_id]
        if q1 not in q2id:
            q2id[q1] = len(q2id)
        if q2 not in q2id:
            q2id[q2] = len(q2id)
        G.add_edge(q2id[q1], q2id[q2], weight=weight)
        e2weight[(q2id[q1], q2id[q2])] = weight
        e2weight[(q2id[q2], q2id[q1])] = weight
        index += 1
    fin = csv.reader(open('%s/test.csv' % config.get('DIRECTORY', 'origin_pt')))
    fin.next()
    index = 0
    for p in fin:
        q1 = str(p[1]).strip()
        q2 = str(p[2]).strip()
        weight = 0 if test_wfs_fs is None else test_wfs_fs[index][weight_feature_id]
        if q1 not in q2id:
            q2id[q1] = len(q2id)
        if q2 not in q2id:
            q2id[q2] = len(q2id)
        G.add_edge(q2id[q1], q2id[q2], weight=weight)
        e2weight[(q2id[q1], q2id[q2])] = weight
        e2weight[(q2id[q2], q2id[q1])] = weight
        index += 1
    LogUtil.log('INFO', 'Graph constructed.')
    return q2id, e2weight, G
def generate_hits(G, max_iter):
    hits_h, hits_a = nx.hits(G, max_iter=max_iter)
    LogUtil.log('INFO', 'Graph cal hits done')
    return hits_h, hits_a
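# Hedged end-to-end sketch of the graph utilities: build the question graph from the
# train/test CSVs, then derive per-node statistics from it. The argument values below
# (no weight feature, alpha=0.85, max_iter=100) are illustrative rather than taken from
# the project configuration; passing None as weight_feature_name builds an unweighted graph.
#
#   q2id, e2weight, G = generate_graph(config, None, 0, 'False')
#   n2clique, cliques = generate_graph_clique(G)
#   n2cc, ccs = generate_graph_cc(G)
#   pr = generate_pagerank(G, alpha=0.85, max_iter=100)
#   hits_h, hits_a = generate_hits(G, max_iter=100)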