Example #1
 def evaluate(self, validFile=None):
     # Evaluate the stacked ensemble: average each base model's
     # cross-validation predictions, then feed the per-model columns
     # to the saved logistic-regression meta-classifier.
     if validFile is None:
         trainFile = general_config.training_file
     else:
         trainFile = validFile
     predicted_train = None
     id_train = None
     for i in range(self.num_models):
         model = self.models[i]
         model_name = self.models_name[i]
         save_dir_tmp = self.saveDir + "/" + model_name
         res = None
         # Inner loop variable renamed from `i` to `cv`; the original
         # shadowed the outer loop variable.
         for cv in range(self.num_cv):
             save_dir = save_dir_tmp + "/" + str(cv)
             if model_name == "TextCNN":
                 save_dir += "/nonstatic"
             save_dir += "/train_valid"
             vocab2intPath = (self.dataDir + "/train" + str(cv) +
                              ".txt").replace(".txt", "_v2i.json")
             resPath = save_dir + "/train_predicted.csv"
             if os.path.exists(resPath):
                 # Reuse cached predictions instead of re-running the model.
                 res_ = {}
                 res_tmp = pd.read_csv(filepath_or_buffer=resPath)
                 for id_, label in zip(res_tmp["id"].values,
                                       res_tmp["label"].values):
                     res_[id_] = label
             else:
                 res_ = model.predict(testFile=trainFile,
                                      vocab2intPath=vocab2intPath,
                                      load_path=save_dir,
                                      resPath=resPath)
             res_ = [[key, value] for (key, value) in res_.items()]
             res_ = pd.DataFrame(res_, columns=["id", "label"])
             res_ = res_.sort_values(by="id", axis=0, ascending=True)
             if cv == 0:
                 id_train = res_["id"].values
             else:
                 assert np.allclose(id_train, res_["id"].values)
             # Explicit None check replaces the original bare `except:`.
             if res is None:
                 res = res_["label"].values
             else:
                 res += res_["label"].values
         # Average this model's predictions over the CV folds.
         res = res / self.num_cv
         # Stack each model's averaged predictions as one feature column.
         if predicted_train is None:
             predicted_train = res.reshape((-1, 1))
         else:
             predicted_train = np.concatenate(
                 [predicted_train, res.reshape((-1, 1))], axis=-1)
     assert predicted_train.shape[1] == self.num_models
     ids, _, label = readNewFile(trainFile)
     assert np.allclose(np.array(ids),
                        np.array(id_train)), "Inconsistent indices!"
     self.classifier = joblib.load(self.saveDir + "/lr.pkl")
     predicted_ = self.classifier.predict(predicted_train)
     train_accuracy = np.mean(
         np.equal(
             np.array(label).reshape((-1, )),
             np.array(predicted_).reshape((-1, ))))
     self.logger.info("Accuracy: %s" % train_accuracy)
     return train_accuracy
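The shape bookkeeping in evaluate reduces to the following standalone sketch with toy random data; all sizes and names here are made up for illustration:

 import numpy as np

 num_cv, num_models, n_samples = 3, 2, 4
 predicted_train = None
 for m in range(num_models):
     # Average one model's predictions over its CV folds.
     fold_preds = [np.random.rand(n_samples) for _ in range(num_cv)]
     res = sum(fold_preds) / num_cv
     col = res.reshape((-1, 1))
     # Stack the averaged predictions as one feature column per model.
     if predicted_train is None:
         predicted_train = col
     else:
         predicted_train = np.concatenate([predicted_train, col], axis=-1)

 assert predicted_train.shape == (n_samples, num_models)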
Example #2
 def train_2(self):
     # Collect each base model's out-of-fold validation predictions,
     # stack them column-wise, and fit the logistic-regression
     # meta-classifier with a grid search over C.
     predicted_train = None
     id_train = None
     for i in range(self.num_models):
         model = self.models[i]
         model_name = self.models_name[i]
         save_dir_tmp = self.saveDir + "/" + model_name
         res = {}
         # Inner loop variable renamed from `i` to `cv`; the original
         # shadowed the outer loop variable.
         for cv in range(self.num_cv):
             save_dir = save_dir_tmp + "/" + str(cv)
             if model_name == "TextCNN":
                 save_dir += "/nonstatic"
             save_dir += "/train_valid"
             testFile = self.dataDir + "/valid" + str(cv) + ".txt"
             vocab2intPath = testFile.replace("valid", "train").replace(
                 ".txt", "_v2i.json")
             resPath = save_dir + "/valid_predicted.csv"
             if os.path.exists(resPath):
                 # Reuse cached predictions instead of re-running the model.
                 res_ = {}
                 res_tmp = pd.read_csv(filepath_or_buffer=resPath)
                 for id_, label in zip(res_tmp["id"].values,
                                       res_tmp["label"].values):
                     res_[id_] = label
             else:
                 res_ = model.test(testFile=testFile,
                                   vocab2intPath=vocab2intPath,
                                   load_path=save_dir,
                                   resPath=resPath)
             res.update(res_)
         res = [[key, value] for (key, value) in res.items()]
         tmp = pd.DataFrame(res, columns=["id", "label"])
         tmp = tmp.sort_values(by="id", axis=0, ascending=True)
         id_train = np.reshape(tmp["id"].values, newshape=(-1, ))
         # Explicit None check replaces the original bare `except:`.
         if predicted_train is None:
             predicted_train = tmp["label"].values.reshape((-1, 1))
         else:
             predicted_train = np.concatenate(
                 [predicted_train, tmp["label"].values.reshape((-1, 1))],
                 axis=-1)
     assert predicted_train.shape[1] == self.num_models
     ids, _, label = readNewFile(file=general_config.training_file)
     assert np.allclose(np.array(ids),
                        np.array(id_train)), "Inconsistent indices!"
     # C is the inverse of regularization strength; it must be a positive
     # float. As in support vector machines, smaller values specify
     # stronger regularization.
     parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
     self.classifier = GridSearchCV(self.classifier,
                                    parameters,
                                    cv=self.num_cv,
                                    refit=True)
     self.classifier.fit(predicted_train, np.array(label))
     self.logger.info(self.classifier.cv_results_)
     self.logger.info(self.classifier.get_params())
     save_path = self.saveDir + "/lr.pkl"
     joblib.dump(self.classifier, save_path)
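The estimator wrapped by GridSearchCV here is saved as "lr.pkl", which suggests a scikit-learn LogisticRegression, though the snippet does not show its construction. A minimal sketch of the same search under that assumption, with toy data:

 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV

 X = np.random.rand(20, 3)       # stacked base-model predictions
 y = np.array([0, 1] * 10)       # toy binary labels
 search = GridSearchCV(LogisticRegression(),
                       {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
                       cv=5,
                       refit=True)
 search.fit(X, y)
 print(search.best_params_)      # the C value that scored best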
Example #3
 def train(self,
           trainPath=general_config.data_dir + "/training_label_new.txt"):
     indices, sentences, labels = readNewFile(
         file=trainPath,
         vocab2intPath=general_config.global_static_v2i_path)
     # Represent each sentence as the mean of its word embeddings.
     sentences_ = []
     for sentence in sentences:
         sentences_.append(self.embeddings[sentence].mean(axis=0))
     self.model.fit(X=sentences_, y=labels)
     self.logger.info(self.model.get_params())
     self.logger.info("Training Accuracy: %s" %
                      self.model.score(X=sentences_, y=labels))
     save_path = self.save_dir + "/model.pkl"
     joblib.dump(self.model, save_path)
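The pooling step relies on NumPy fancy indexing: if self.embeddings is a (vocab_size, dim) array and sentence is a list of integer word ids, embeddings[sentence] selects one row per id. A tiny sketch of just that step, with made-up sizes:

 import numpy as np

 embeddings = np.random.rand(1000, 50)   # (vocab_size, embedding_dim)
 sentence = [3, 17, 256]                 # integer word ids for one sentence
 feature = embeddings[sentence].mean(axis=0)
 assert feature.shape == (50,)           # one fixed-size vector per sentence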
Example #4
 def test(self, testPath=general_config.data_dir + "/testing_data_new.txt"):
     indices, sentences, labels = readNewFile(
         file=testPath, vocab2intPath=general_config.global_static_v2i_path)
     # Represent each sentence as the mean of its word embeddings.
     sentences_ = []
     for sentence in sentences:
         sentences_.append(self.embeddings[sentence].mean(axis=0))
     self.model = joblib.load(self.save_dir + "/model.pkl")
     predicted = self.model.predict(sentences_)
     # Pair each sentence id with its predicted label, one row apiece.
     res = np.concatenate([np.array(indices).reshape((-1, 1)),
                           np.array(predicted).reshape((-1, 1))],
                          axis=1)
     WriteToSubmission(res,
                       fileName=self.save_dir.replace(
                           "checkpoints", "results") + "/predicted.csv")
Example #5
 def __init__(self, loadPath, vocab2intPath, sent_len_cut=None):
     indices, sentences, labels = readNewFile(file=loadPath,
                                              vocab2intPath=vocab2intPath)
     num_words = [len(sentence) for sentence in sentences]
     if isinstance(sent_len_cut, int):
         # Optionally cap the effective sentence length at sent_len_cut.
         num_words_ = [
             min(len(sentence), sent_len_cut) for sentence in sentences
         ]
     else:
         num_words_ = num_words[:]
     self.df = pd.DataFrame({
         "id": indices,
         "sentence": sentences,
         "label": labels,
         "sentence_length": num_words,
         "sentence_length_": num_words_
     })
     self.total_size = len(self.df)
     self.cursor = 0
     self.loop = 0
     self.max_len = general_config.max_seq_len
     self.shuffle()
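The constructor leaves shuffle (and presumably a batch-yielding method) to the rest of the class, which is not shown. A minimal sketch of what those methods might look like given the cursor and loop fields; the method name next_batch and the batching details are assumptions, not the project's actual code:

 def shuffle(self):
     # Re-order the rows and restart the cursor for a new epoch.
     self.df = self.df.sample(frac=1).reset_index(drop=True)
     self.cursor = 0

 def next_batch(self, batch_size):
     # Wrap around (and reshuffle) once the epoch is exhausted.
     if self.cursor + batch_size > self.total_size:
         self.loop += 1
         self.shuffle()
     batch = self.df.iloc[self.cursor:self.cursor + batch_size]
     self.cursor += batch_size
     return batch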
Example #6
 def __init__(self, loadPath, vocab2intPath, num_buckets=5):
     indices, sentences, labels = readNewFile(file=loadPath,
                                              vocab2intPath=vocab2intPath)
     v2i = loadDict(vocab2intPath)
     docs = []
     num_sentences = []
     num_words = []
     num_words_flat = []
     for sentence in sentences:
         doc = sentence2doc(sentence, v2i)
         docs.append(doc)
         num_sentences.append(len(doc))
         num_words_ = [len(_) for _ in doc]
         num_words.append(num_words_)
         num_words_flat.extend(num_words_)
     self.df = pd.DataFrame({
         "id": indices,
         "doc": docs,
         "label": labels,
         "doc_length": num_sentences,
         "sentence_length": num_words
     })
     # Sort by document length and split into roughly equal buckets so
     # that each batch draws documents of similar length.
     df = self.df.sort_values("doc_length").reset_index(drop=True)
     self.total_size = len(df)
     part_size = self.total_size // num_buckets
     self.dfs = []
     for i in range(num_buckets):
         # `.iloc` replaces the deprecated `.ix` indexer.
         self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
     # The last bucket also absorbs the remainder rows; the result must
     # be assigned back (the original discarded DataFrame.append's
     # return value, so the remainder was silently dropped).
     self.dfs[num_buckets - 1] = pd.concat(
         [self.dfs[num_buckets - 1],
          df.iloc[num_buckets * part_size:self.total_size]])
     self.num_buckets = num_buckets
     self.cursor = np.array([0] * num_buckets)
     self.p_list = [1 / self.num_buckets] * self.num_buckets
     self.loop = 0
     self.shuffle()
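The per-bucket cursor array and p_list suggest that batches are drawn by first sampling a bucket, then slicing it. A sketch of such a method, consistent with those fields but assumed rather than taken from the project:

 def next_batch(self, batch_size):
     # Pick a bucket according to p_list, then read the next slice of it.
     bucket = np.random.choice(self.num_buckets, p=self.p_list)
     df = self.dfs[bucket]
     if self.cursor[bucket] + batch_size > len(df):
         self.loop += 1
         self.cursor[bucket] = 0
     batch = df.iloc[self.cursor[bucket]:self.cursor[bucket] + batch_size]
     self.cursor[bucket] += batch_size
     return batch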
Example #7
 def train(self,
           trainPath=general_config.data_dir + "/training_label_new.txt",
           num_cv=5):
     indices, sentences, labels = readNewFile(
         file=trainPath,
         vocab2intPath=general_config.global_static_v2i_path)
     # Represent each sentence as the mean of its word embeddings.
     sentences_ = []
     for sentence in sentences:
         sentences_.append(self.embeddings[sentence].mean(axis=0))
     # C is the inverse of regularization strength; smaller values mean
     # stronger regularization.
     parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
     self.model = GridSearchCV(self.model,
                               parameters,
                               cv=num_cv,
                               refit=True)
     self.model.fit(X=sentences_, y=labels)
     self.logger.info(self.model.cv_results_)
     self.logger.info(self.model.get_params())
     self.logger.info("Training Accuracy: %s" %
                      self.model.score(X=sentences_, y=labels))
     save_path = self.save_dir + "/model.pkl"
     joblib.dump(self.model, save_path)
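Because refit=True keeps the best estimator inside the fitted GridSearchCV object, the saved pickle can be restored and used directly. A short usage sketch; the path and feature dimension are illustrative:

 import joblib
 import numpy as np

 model = joblib.load("checkpoints/model.pkl")   # path is illustrative
 features = np.random.rand(2, 50)               # two mean-pooled sentence vectors
 print(model.predict(features))                 # delegates to the refit best estimator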
Example #8
 def __init__(self, loadPath, vocab2intPath, num_buckets=5):
     indices, sentences, labels = readNewFile(file=loadPath,
                                              vocab2intPath=vocab2intPath)
     num_words = [len(sentence) for sentence in sentences]
     self.df = pd.DataFrame({
         "id": indices,
         "sentence": sentences,
         "label": labels,
         "sentence_length": num_words
     })
     # Sort by sentence length and split into roughly equal buckets so
     # that each batch draws sentences of similar length.
     df = self.df.sort_values("sentence_length").reset_index(drop=True)
     self.total_size = len(df)
     part_size = self.total_size // num_buckets
     self.dfs = []
     for i in range(num_buckets):
         # `.iloc` replaces the deprecated `.ix` indexer.
         self.dfs.append(df.iloc[i * part_size:(i + 1) * part_size])
     # The last bucket also absorbs the remainder rows; the result must
     # be assigned back (the original discarded DataFrame.append's
     # return value, so the remainder was silently dropped).
     self.dfs[num_buckets - 1] = pd.concat(
         [self.dfs[num_buckets - 1],
          df.iloc[num_buckets * part_size:self.total_size]])
     self.num_buckets = num_buckets
     self.cursor = np.array([0] * num_buckets)
     self.p_list = [1 / self.num_buckets] * self.num_buckets
     self.loop = 0
     self.max_len = general_config.max_seq_len
     self.shuffle()
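When total_size is not a multiple of num_buckets, the integer division leaves a remainder that only the last bucket receives. A tiny worked example of the resulting bucket sizes:

 total_size, num_buckets = 103, 5
 part_size = total_size // num_buckets              # 20
 sizes = [part_size] * num_buckets                  # buckets 0..3 keep 20 rows
 sizes[-1] += total_size - num_buckets * part_size  # bucket 4 ends up with 23
 assert sum(sizes) == total_size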