def make_tfidf_feature_100_holdout(row_body_path, row_stance_path,
                                   row_test_body_path, row_test_stance_path,
                                   head_save_path, body_save_path,
                                   stance_save_path, model_save=True):
    """Build and pickle the 100-feature TF-IDF head/body/stance features,
    skipping any artifact that has already been saved."""
    if not os.path.exists(head_save_path) or not os.path.exists(body_save_path) \
            or not os.path.exists(stance_save_path):
        dataset = Dataset(row_body_path, row_stance_path)
        head, body, stance = dataset.read_combine()
        fe = Feature_enginnering(head, body, stance)
        # Saved artifacts: "tfidf_label_one_hot_train.pkl",
        # 'tfidf_body_feature_train.pkl', 'tfidf_head_feature_train.pkl'
        fe.get_tfidf_vocab_100_holdout(row_test_body_path, row_test_stance_path)
        if not os.path.exists(head_save_path):
            fe.tfidf_train_head(head_save_path, model_save=model_save)
        if not os.path.exists(body_save_path):
            fe.tfidf_train_body(body_save_path, model_save=model_save)
        if not os.path.exists(stance_save_path):
            fe.tfidf_stance_save(stance_save_path, model_save=model_save)
        print('train_idf_100 feature saved!')
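# Example invocation (a sketch; the paths below are hypothetical and should
# point at the actual body/stance CSVs and the desired pickle locations):
#
#   make_tfidf_feature_100_holdout('../data/train_bodies.csv',
#                                  '../data/train_stances.csv',
#                                  '../data/test_bodies.csv',
#                                  '../data/test_stances.csv',
#                                  '../pickled_data/tfidf_head_feature_train.pkl',
#                                  '../pickled_data/tfidf_body_feature_train.pkl',
#                                  '../pickled_data/tfidf_label_one_hot_train.pkl')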
def get_tfidf_vocab_5000_holdout(self, test_body, test_stance):
    """Build (or load) the shared train/test vocabulary used for TF-IDF vectors.

    :return: TF-IDF vocab dict for the training features
    """
    # Reuse a previously pickled vocabulary and skip the expensive fit.
    if os.path.exists('../pickled_model/tfidf_holdout_vocab.pkl'):
        self.vocab = load_model('../pickled_model/tfidf_holdout_vocab.pkl')
        print('vocab loaded!')
        return self.vocab

    test_dataset = Dataset(test_body, test_stance)
    t_h, t_b = test_dataset.read_tfidf_data()
    test_h = list(t_h)
    test_b = list(t_b)

    # Fit on the train body+headline pairs plus the held-out test texts so
    # the vocabulary covers both splits.
    train_data = [b + " " + h for b, h in zip(self.body, self.head)]
    train_data.extend(test_b)
    train_data.extend(test_h)
    model = TfidfVectorizer(max_features=5000, ngram_range=(1, 1),
                            stop_words='english', norm='l2', use_idf=False)
    model.fit(train_data)
    self.vocab = model.vocabulary_
    save_model('../pickled_model/tfidf_holdout_vocab.pkl', model.vocabulary_)
    return self.vocab
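# Minimal standalone sketch of the shared-vocabulary trick used above, with
# toy strings standing in for the real Dataset output (all names here are
# illustrative): one TfidfVectorizer is fit over train and held-out text, and
# its vocabulary_ is reused so both splits map into the same feature space.
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#
#   train_texts = ["police find mass graves", "hundreds of bodies found"]
#   test_texts = ["graves found near town"]
#   vect = TfidfVectorizer(max_features=100, stop_words='english',
#                          norm='l2', use_idf=False)
#   vect.fit(train_texts + test_texts)
#   shared_vocab = vect.vocabulary_  # word -> column index
#   train_X = TfidfVectorizer(vocabulary=shared_vocab, norm='l2',
#                             use_idf=False).fit_transform(train_texts)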
def single_flat_LSTM_50d_100(body_path, stance_path, mode):
    GloVe_vectors = load_embedding_pandas(param_dict['GLOVE_ZIP_FILE'],
                                          param_dict['GLOVE_FILE'], type="w2v")
    print(GloVe_vectors[:5])  # sanity check on the loaded embeddings

    d_set = Dataset(body_path, stance_path)
    head, body, one_hot_label = d_set.read_combine()

    # Build the embedding lookup over every headline and body text.
    texts = head.tolist()
    texts.extend(body.tolist())
    vocab = create_embedding_lookup_pandas(texts, param_dict["MAX_NB_WORDS"],
                                           param_dict["EMBEDDING_DIM"],
                                           GloVe_vectors,
                                           param_dict["EMBEDDING_FILE"],
                                           param_dict["VOCAB_FILE"],
                                           init_zeros=False, add_unknown=True,
                                           rdm_emb_init=True,
                                           tokenizer=nltk.word_tokenize)
    del GloVe_vectors

    # Concatenate each headline with its body and map the result to
    # fixed-size index sequences.
    concatenated = [h + ". " + b for h, b in zip(head, body)]
    sequences = text_to_sequences_fixed_size(concatenated, vocab,
                                             param_dict["MAX_SEQ_LENGTH"],
                                             save_full_text=False,
                                             take_full_claim=True)

    if mode == 'train':
        with open(FEATURES_DIR + PARAM_DICT_FILENAME, 'wb') as f:
            pickle.dump(param_dict, f, pickle.HIGHEST_PROTOCOL)
        print("Saved PARAM_DICT as " + FEATURES_DIR + PARAM_DICT_FILENAME)

    return sequences
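# Example use (a sketch; the CSV names are hypothetical): build the padded
# index sequences for the training split, which also dumps param_dict so the
# same settings can be restored when featurizing the test split.
#
#   train_seqs = single_flat_LSTM_50d_100('../data/train_bodies.csv',
#                                         '../data/train_stances.csv',
#                                         mode='train')
#   test_seqs = single_flat_LSTM_50d_100('../data/test_bodies.csv',
#                                        '../data/test_stances.csv',
#                                        mode='test')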
def get_tfidf_vocab_100_holdout(self, test_body, test_stance):
    """Build the 100-feature shared train/test vocabulary for TF-IDF vectors.

    :return: TF-IDF vocab dict for the training features
    """
    test_dataset = Dataset(test_body, test_stance)
    t_h, t_b = test_dataset.read_tfidf_data()
    test_h = list(t_h)
    test_b = list(t_b)

    # Same scheme as get_tfidf_vocab_5000_holdout, capped at 100 features.
    train_data = [b + " " + h for b, h in zip(self.body, self.head)]
    train_data.extend(test_b)
    train_data.extend(test_h)
    model = TfidfVectorizer(max_features=100, ngram_range=(1, 1),
                            stop_words='english', norm='l2', use_idf=False)
    model.fit(train_data)
    self.vocab = model.vocabulary_
    return self.vocab