def get_tfidf_vocab_5000_holdout(self, test_body, test_stance):
        """
        Return the vocabulary (at most 5000 terms) used to build TF-IDF vectors.

        If a pickled vocabulary already exists at
        ``../pickled_model/tfidf_holdout_vocab.pkl`` it is loaded and returned
        without touching the datasets. Otherwise a vectorizer is fitted on the
        concatenated ``body + " " + head`` strings of this instance plus both
        sequences returned by ``Dataset(test_body, test_stance).read_tfidf_data()``,
        and the resulting vocabulary is pickled for later calls.

        :param test_body: forwarded to ``Dataset`` as its first argument
        :param test_stance: forwarded to ``Dataset`` as its second argument
        :return: the vocabulary, which is also stored on ``self.vocab``
        """
        vocab_path = '../pickled_model/tfidf_holdout_vocab.pkl'

        # Check the cache first: fitting the vectorizer is wasted work when the
        # pickled vocabulary is going to be used anyway.
        if os.path.exists(vocab_path):
            self.vocab = load_model(vocab_path)
            print('vocab loaded!')
            return self.vocab

        test_dataset = Dataset(test_body, test_stance)
        t_h, t_b = test_dataset.read_tfidf_data()

        train_data = [b + " " + h for b, h in zip(self.body, self.head)]
        train_data.extend(t_b)
        train_data.extend(t_h)

        # Only the learned vocabulary is needed, so fit() is enough; the
        # transformed matrix was previously computed and thrown away.
        model = TfidfVectorizer(max_features=5000,
                                ngram_range=(1, 1),
                                stop_words='english',
                                norm='l2',
                                use_idf=False)
        model.fit(train_data)

        self.vocab = model.vocabulary_
        save_model(vocab_path, self.vocab)
        return self.vocab
 def tfidf_stance_save(self, filename, model_save=False):
     """
     Return ``self.stance``, or persist it when ``model_save`` is true.

     :param filename: file name appended to ``self.save_path`` when saving
     :param model_save: if true, save ``self.stance`` and return nothing;
                        otherwise return ``self.stance`` without saving
     :return: ``self.stance`` when ``model_save`` is false, else ``None``
     """
     if not model_save:
         return self.stance

     target = self.save_path + "/" + filename
     print('tfidf_stance_one_hot saving......')
     save_model(target, self.stance)
     print('feature saving finished!')
     print('saved path : ', target)
    def tfidf_train_head(self, filename, model_save=False):
        """
        Build the TF-IDF matrix for ``self.head`` using the vocabulary in ``self.vocab``.

        :param filename: file name appended to ``self.save_path`` when saving
        :param model_save: if true, save the matrix and return nothing;
                           otherwise return the matrix without saving
        :return: the fitted TF-IDF matrix when ``model_save`` is false, else ``None``
        """
        vectorizer = TfidfVectorizer(vocabulary=self.vocab,
                                     use_idf=True,
                                     norm="l2",
                                     stop_words='english')
        head_features = vectorizer.fit_transform(self.head)

        if not model_save:
            return head_features

        target = self.save_path + "/" + filename
        print('tfidf_head_feature saving......')
        save_model(target, head_features)
        print('feature saving finished!')
        print('saved path : ', target)
    def tfidf_cos_save(self, head_path, body_path, filename, model_save=False):
        """
        Compute the row-wise cosine similarity between two stored feature matrices.

        Both matrices are loaded with ``load_model`` and densified via
        ``toarray()``; rows are paired positionally with ``zip``.

        :param head_path: path of the stored head feature matrix
        :param body_path: path of the stored body feature matrix
        :param filename: file name appended to ``self.save_path`` when saving
        :param model_save: if true, save the similarities and return nothing;
                           otherwise return them without saving
        :return: array with one similarity entry per row pair when
                 ``model_save`` is false, else ``None``
        """
        head_rows = load_model(head_path).toarray()
        body_rows = load_model(body_path).toarray()

        # Each row is reshaped to (1, n_features) so cosine_similarity sees a
        # single-sample matrix; [0] keeps the length-1 result row.
        similarities = np.array([
            cosine_similarity(head_row.reshape(1, -1),
                              body_row.reshape(1, -1))[0]
            for head_row, body_row in zip(head_rows, body_rows)
        ])

        if not model_save:
            return similarities

        target = self.save_path + "/" + filename
        print('tfidf_cos saving......')
        save_model(target, similarities)
        print('feature saving finished!')
        print('saved path : ', target)
    def tfidf_train_body(self, filename, model_save=False):
        """
        Build the TF-IDF matrix for ``self.body`` using the vocabulary in ``self.vocab``.

        :param filename: file name appended to ``self.save_path`` when saving
        :param model_save: if true, save the matrix and return nothing;
                           otherwise return the matrix without saving
        :return: the fitted TF-IDF matrix when ``model_save`` is false, else ``None``
        """
        vectorizer = TfidfVectorizer(vocabulary=self.vocab,
                                     use_idf=True,
                                     norm="l2",
                                     stop_words='english')
        body_features = vectorizer.fit_transform(self.body)

        if not model_save:
            return body_features

        target = self.save_path + "/" + filename
        print('tfidf_body_feature saving......')
        save_model(target, body_features)
        print('feature saving finished!')
        print('saved path : ', target)
# Example no. 6
# 0
def make_NMF_300_feature(row_body_path,
                         row_stance_path,
                         head_tfidf_pkl,
                         body_tfidf_pkl,
                         label_path,
                         save_nmf_model_path,
                         save_head_path,
                         save_body_path,
                         cos_dist=False):
    """
    Build 300-component NMF features for heads and bodies, caching every stage on disk.

    Stages (each is skipped when its output file(s) already exist):
      1. TF-IDF matrices and labels, produced by ``make_tfidf_feature_5000``.
      2. An NMF model fitted on the head and body TF-IDF rows stacked vertically.
      3. The NMF-transformed head and body matrices.

    :param row_body_path: forwarded to ``make_tfidf_feature_5000``
    :param row_stance_path: forwarded to ``make_tfidf_feature_5000``
    :param head_tfidf_pkl: path of the stored head TF-IDF matrix
    :param body_tfidf_pkl: path of the stored body TF-IDF matrix
    :param label_path: path of the stored labels (only checked for existence here)
    :param save_nmf_model_path: path of the stored NMF model
    :param save_head_path: path of the stored NMF head features
    :param save_body_path: path of the stored NMF body features
    :param cos_dist: if true, append one column holding the per-row cosine
                     distance between the head and body NMF vectors
    :return: head and body NMF features concatenated column-wise, plus the
             cosine-distance column when ``cos_dist`` is true
    """
    tfidf_outputs = (head_tfidf_pkl, body_tfidf_pkl, label_path)
    if not all(os.path.exists(path) for path in tfidf_outputs):
        make_tfidf_feature_5000(row_body_path,
                                row_stance_path,
                                head_tfidf_pkl,
                                body_tfidf_pkl,
                                label_path,
                                model_save=True)

    X_tfidf_body = load_model(body_tfidf_pkl)
    X_tfidf_head = load_model(head_tfidf_pkl)

    if not os.path.exists(save_nmf_model_path):
        stacked = np.concatenate(
            (X_tfidf_head.toarray(), X_tfidf_body.toarray()), axis=0)
        print('fit NMF topic model')
        started = time()
        # NOTE(review): newer scikit-learn releases replaced `alpha` with
        # `alpha_W` / `alpha_H` -- confirm against the pinned version.
        fitted = NMF(n_components=300, random_state=1, alpha=.1)
        fitted.fit(stacked)
        print('done in {}'.format(time() - started))
        save_model(save_nmf_model_path, fitted)

    # Always go through the stored model so fresh and cached runs share a path.
    topic_model = load_model(save_nmf_model_path)

    features_cached = (os.path.exists(save_head_path)
                       and os.path.exists(save_body_path))
    if not features_cached:
        # Transform both matrices before writing either file.
        head_transformed = topic_model.transform(X_tfidf_head)
        body_transformed = topic_model.transform(X_tfidf_body)
        save_model(save_head_path, head_transformed)
        print('saved model {}'.format(save_head_path))
        save_model(save_body_path, body_transformed)
        print('saved model {}'.format(save_body_path))

    head_topics = load_model(save_head_path)
    body_topics = load_model(save_body_path)

    if not cos_dist:
        return np.concatenate((head_topics, body_topics), axis=1)

    # One cosine distance per row pair; each entry is a length-1 list so the
    # stacked result forms a single extra column.
    distances = np.array([
        cosine_distances(
            np.array(head_row).reshape((1, -1)),
            np.array(body_topics[i]).reshape((1, -1)),
        ).flatten().tolist()
        for i, head_row in enumerate(head_topics)
    ])
    combined = np.concatenate((head_topics, body_topics), axis=1)
    return np.concatenate((combined, distances), axis=1)