# Module-level imports assumed by the methods below; readNewFile, loadDict,
# sentence2doc, WriteToSubmission and general_config are this project's own
# utilities and are assumed to be importable from its modules.
import os

import joblib  # older scikit-learn versions: from sklearn.externals import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV


def evaluate(self, validFile=None):
    if validFile is None:
        trainFile = general_config.training_file
    else:
        trainFile = validFile
    predicted_train = None
    id_train = None
    for i in range(self.num_models):
        model = self.models[i]
        model_name = self.models_name[i]
        save_dir_tmp = self.saveDir + "/" + model_name
        res = None
        # Average this model's predictions over its num_cv cross-validation folds.
        for j in range(self.num_cv):
            save_dir = save_dir_tmp + "/" + str(j)
            if model_name == "TextCNN":
                save_dir += "/nonstatic"
            save_dir += "/train_valid"
            vocab2intPath = (self.dataDir + "/train" + str(j) + ".txt").replace(
                ".txt", "_v2i.json")
            resPath = save_dir + "/train_predicted.csv"
            if os.path.exists(resPath):
                # Reuse cached predictions instead of re-running the model.
                res_ = {}
                res_tmp = pd.read_csv(filepath_or_buffer=resPath)
                for id_, label in zip(res_tmp["id"].values, res_tmp["label"].values):
                    res_[id_] = label
            else:
                res_ = model.predict(testFile=trainFile,
                                     vocab2intPath=vocab2intPath,
                                     load_path=save_dir,
                                     resPath=resPath)
            res_ = [[key, value] for (key, value) in res_.items()]
            res_ = pd.DataFrame(res_, columns=["id", "label"])
            res_ = res_.sort_values(by="id", axis=0, ascending=True)
            # Every fold must predict on the same ids, in the same order.
            if j == 0:
                id_train = res_["id"].values
            else:
                assert np.allclose(id_train, res_["id"].values)
            if res is None:
                res = res_["label"].values
            else:
                res += res_["label"].values
        res = res / self.num_cv
        # Column-stack each model's fold-averaged predictions into the
        # (num_examples, num_models) meta-feature matrix.
        if predicted_train is None:
            predicted_train = res.reshape((-1, 1))
        else:
            predicted_train = np.concatenate(
                [predicted_train, res.reshape((-1, 1))], axis=-1)
    assert predicted_train.shape[1] == self.num_models
    ids, _, label = readNewFile(trainFile)
    assert np.allclose(np.array(ids), np.array(id_train)), "Inconsistent indices!"
    self.classifier = joblib.load(self.saveDir + "/lr.pkl")
    predicted_ = self.classifier.predict(predicted_train)
    train_accuracy = np.mean(np.equal(np.array(label).reshape((-1,)),
                                      np.array(predicted_).reshape((-1,))))
    self.logger.info("Accuracy: %s" % train_accuracy)
    return train_accuracy
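
# A minimal, self-contained sketch (dummy data; every name below is
# illustrative, not part of the pipeline) of the shape logic in evaluate():
# each model's num_cv fold predictions are averaged, and the per-model
# averages are column-stacked into the (num_examples, num_models) matrix
# that the saved logistic-regression combiner consumes.
def _sketch_meta_feature_stacking():
    import numpy as np
    num_examples, num_models, num_cv = 4, 2, 5
    rng = np.random.RandomState(0)
    meta_features = None
    for _ in range(num_models):
        fold_sum = np.zeros(num_examples)
        for _ in range(num_cv):
            fold_sum += rng.randint(0, 2, size=num_examples)  # one fold's labels
        avg = (fold_sum / num_cv).reshape((-1, 1))  # this model's averaged column
        meta_features = avg if meta_features is None else np.concatenate(
            [meta_features, avg], axis=-1)
    assert meta_features.shape == (num_examples, num_models)
    return meta_features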
def train_2(self):
    predicted_train = None
    id_train = None
    for i in range(self.num_models):
        model = self.models[i]
        model_name = self.models_name[i]
        save_dir_tmp = self.saveDir + "/" + model_name
        res = {}
        # Collect out-of-fold predictions: each fold's model predicts on the
        # validation split it never saw during training, so together the folds
        # cover the whole training set exactly once.
        for j in range(self.num_cv):
            save_dir = save_dir_tmp + "/" + str(j)
            if model_name == "TextCNN":
                save_dir += "/nonstatic"
            save_dir += "/train_valid"
            testFile = self.dataDir + "/valid" + str(j) + ".txt"
            vocab2intPath = testFile.replace("valid", "train").replace(
                ".txt", "_v2i.json")
            resPath = save_dir + "/valid_predicted.csv"
            if os.path.exists(resPath):
                # Reuse cached predictions instead of re-running the model.
                res_ = {}
                res_tmp = pd.read_csv(filepath_or_buffer=resPath)
                for id_, label in zip(res_tmp["id"].values, res_tmp["label"].values):
                    res_[id_] = label
            else:
                res_ = model.test(testFile=testFile,
                                  vocab2intPath=vocab2intPath,
                                  load_path=save_dir,
                                  resPath=resPath)
            res.update(res_)
        res = [[key, value] for (key, value) in res.items()]
        tmp = pd.DataFrame(res, columns=["id", "label"])
        tmp = tmp.sort_values(by="id", axis=0, ascending=True)
        id_train = np.reshape(tmp["id"].values, newshape=(-1,))
        if predicted_train is None:
            predicted_train = tmp["label"].values.reshape((-1, 1))
        else:
            predicted_train = np.concatenate(
                [predicted_train, tmp["label"].values.reshape((-1, 1))], axis=-1)
    assert predicted_train.shape[1] == self.num_models
    ids, _, label = readNewFile(file=general_config.training_file)
    assert np.allclose(np.array(ids), np.array(id_train)), "Inconsistent indices!"
    # C is the inverse of the regularization strength and must be a positive
    # float. Like in support vector machines, smaller values specify stronger
    # regularization.
    parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    self.classifier = GridSearchCV(self.classifier, parameters,
                                   cv=self.num_cv, refit=True)
    self.classifier.fit(predicted_train, np.array(label))
    self.logger.info(self.classifier.cv_results_)
    self.logger.info(self.classifier.get_params())
    save_path = self.saveDir + "/lr.pkl"
    joblib.dump(self.classifier, save_path)
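
# A minimal sketch (synthetic data; only the sklearn calls mirror train_2)
# of the grid search run above: GridSearchCV tries each inverse-regularization
# strength C with k-fold cross-validation and refits the best model on all
# of the data.
def _sketch_meta_grid_search():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)                    # stand-in for predicted_train
    y = (X.mean(axis=1) > 0.5).astype(int)  # stand-in for the true labels
    search = GridSearchCV(LogisticRegression(),
                          {"C": [0.001, 0.01, 0.1, 1, 10, 100]},
                          cv=5, refit=True)
    search.fit(X, y)
    return search.best_params_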
def train(self, trainPath=general_config.data_dir + "/training_label_new.txt"):
    indices, sentences, labels = readNewFile(
        file=trainPath, vocab2intPath=general_config.global_static_v2i_path)
    # Represent each sentence as the mean of its word embeddings.
    sentences_ = []
    for sentence in sentences:
        sentences_.append(self.embeddings[sentence].mean(axis=0))
    self.model.fit(X=sentences_, y=labels)
    self.logger.info(self.model.get_params())
    self.logger.info("Training Accuracy: %s" %
                     self.model.score(X=sentences_, y=labels))
    save_path = self.save_dir + "/model.pkl"
    joblib.dump(self.model, save_path)
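
# A tiny sketch (toy numbers only) of the feature construction above: a
# sentence, stored as a list of word ids, is mapped to the element-wise mean
# of its word vectors, so every sentence becomes one fixed-size dense vector.
def _sketch_mean_pooling():
    import numpy as np
    embeddings = np.random.RandomState(0).rand(10, 4)  # toy 10-word, 4-dim table
    sentence = [2, 5, 7]                               # word ids
    return embeddings[sentence].mean(axis=0)           # feature of shape (4,)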
def test(self, testPath=general_config.data_dir + "/testing_data_new.txt"):
    indices, sentences, labels = readNewFile(
        file=testPath, vocab2intPath=general_config.global_static_v2i_path)
    sentences_ = []
    for sentence in sentences:
        sentences_.append(self.embeddings[sentence].mean(axis=0))
    self.model = joblib.load(self.save_dir + "/model.pkl")
    predicted = self.model.predict(sentences_)
    res = np.concatenate([np.array(indices).reshape((-1, 1)),
                          np.array(predicted).reshape((-1, 1))], axis=1)
    WriteToSubmission(res,
                      fileName=self.save_dir.replace("checkpoints", "results")
                      + "/predicted.csv")
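
# A small sketch (toy values) of the two-column submission matrix assembled
# above before it is written out: ids in the first column, predicted labels
# in the second.
def _sketch_submission_matrix():
    import numpy as np
    indices = [0, 1, 2]
    predicted = [1, 0, 1]
    res = np.concatenate([np.array(indices).reshape((-1, 1)),
                          np.array(predicted).reshape((-1, 1))], axis=1)
    assert res.shape == (3, 2)  # one (id, label) row per example
    return res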
def __init__(self, loadPath, vocab2intPath, sent_len_cut=None):
    indices, sentences, labels = readNewFile(file=loadPath,
                                             vocab2intPath=vocab2intPath)
    num_words = [len(sentence) for sentence in sentences]
    # Optionally cap the recorded sentence length at sent_len_cut tokens.
    if isinstance(sent_len_cut, int):
        num_words_ = [min(len(sentence), sent_len_cut) for sentence in sentences]
    else:
        num_words_ = num_words[:]
    self.df = pd.DataFrame({"id": indices,
                            "sentence": sentences,
                            "label": labels,
                            "sentence_length": num_words,
                            "sentence_length_": num_words_})
    self.total_size = len(self.df)
    self.cursor = 0
    self.loop = 0
    self.max_len = general_config.max_seq_len
    self.shuffle()
def __init__(self, loadPath, vocab2intPath, num_buckets=5):
    indices, sentences, labels = readNewFile(file=loadPath,
                                             vocab2intPath=vocab2intPath)
    v2i = loadDict(vocab2intPath)
    # Split every sentence into a document of sub-sentences and record the
    # per-document and per-sentence lengths.
    docs = []
    num_sentences = []
    num_words = []
    num_words_flat = []
    for sentence in sentences:
        doc = sentence2doc(sentence, v2i)
        docs.append(doc)
        num_sentences.append(len(doc))
        num_words_ = [len(_) for _ in doc]
        num_words.append(num_words_)
        num_words_flat.extend(num_words_)
    self.df = pd.DataFrame({"id": indices,
                            "doc": docs,
                            "label": labels,
                            "doc_length": num_sentences,
                            "sentence_length": num_words})
    # Sort by document length and split into num_buckets equal-sized buckets;
    # the division remainder is folded into the last bucket. (.iloc replaces
    # the deprecated .ix, and the concatenation result must be assigned back:
    # DataFrame.append is not in-place, so the remainder rows were previously
    # dropped.)
    df = self.df.sort_values("doc_length").reset_index(drop=True)
    self.total_size = len(df)
    part_size = self.total_size // num_buckets
    self.dfs = []
    for i in range(num_buckets):
        self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
    self.dfs[num_buckets - 1] = pd.concat(
        [self.dfs[num_buckets - 1], df.iloc[num_buckets * part_size:]])
    self.num_buckets = num_buckets
    self.cursor = np.array([0] * num_buckets)
    self.p_list = [1 / self.num_buckets] * self.num_buckets
    self.loop = 0
    self.shuffle()
def train(self, trainPath=general_config.data_dir + "/training_label_new.txt",
          num_cv=5):
    indices, sentences, labels = readNewFile(
        file=trainPath, vocab2intPath=general_config.global_static_v2i_path)
    sentences_ = []
    for sentence in sentences:
        sentences_.append(self.embeddings[sentence].mean(axis=0))
    # C is the inverse of the regularization strength; smaller values specify
    # stronger regularization.
    parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
    self.model = GridSearchCV(self.model, parameters, cv=num_cv, refit=True)
    self.model.fit(X=sentences_, y=labels)
    self.logger.info(self.model.cv_results_)
    self.logger.info(self.model.get_params())
    self.logger.info("Training Accuracy: %s" %
                     self.model.score(X=sentences_, y=labels))
    save_path = self.save_dir + "/model.pkl"
    joblib.dump(self.model, save_path)
def __init__(self, loadPath, vocab2intPath, num_buckets=5):
    indices, sentences, labels = readNewFile(file=loadPath,
                                             vocab2intPath=vocab2intPath)
    num_words = [len(sentence) for sentence in sentences]
    self.df = pd.DataFrame({"id": indices,
                            "sentence": sentences,
                            "label": labels,
                            "sentence_length": num_words})
    # Sort by sentence length and split into num_buckets equal-sized buckets;
    # as above, the remainder is folded into the last bucket via an assigned
    # concatenation rather than a discarded DataFrame.append result.
    df = self.df.sort_values("sentence_length").reset_index(drop=True)
    self.total_size = len(df)
    part_size = self.total_size // num_buckets
    self.dfs = []
    for i in range(num_buckets):
        self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
    self.dfs[num_buckets - 1] = pd.concat(
        [self.dfs[num_buckets - 1], df.iloc[num_buckets * part_size:]])
    self.num_buckets = num_buckets
    self.cursor = np.array([0] * num_buckets)
    self.p_list = [1 / self.num_buckets] * self.num_buckets
    self.loop = 0
    self.max_len = general_config.max_seq_len
    self.shuffle()
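
# A minimal sketch (toy data) of the length bucketing used by both iterators
# above: sort by length, cut into num_buckets equal parts, and fold the
# division remainder into the last bucket so no example is dropped.
def _sketch_length_bucketing(num_buckets=5):
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    df = pd.DataFrame({"sentence_length": rng.randint(1, 50, size=23)})
    df = df.sort_values("sentence_length").reset_index(drop=True)
    part_size = len(df) // num_buckets
    dfs = [df.iloc[i * part_size:(i + 1) * part_size] for i in range(num_buckets)]
    dfs[-1] = pd.concat([dfs[-1], df.iloc[num_buckets * part_size:]])
    assert sum(len(d) for d in dfs) == len(df)  # every row lands in a bucket
    return dfs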