Exemplo n.º 1
0
    def get_per_sample_tfidf(self, texts, field):
        """
        Compute (or load from cache) tf-idf scores treating each sample
        as one document.

        Input:
            texts: list of dataset keys, e.g. ["train", "test"]
            field: name of the field holding the raw text
        Side effects:
            Fills self.sample_tfidf: {text: tfidf_list}; persists the
            per-text tf-idf lists and self.global_idf_dic to disk.
        """
        # Already computed during this run; nothing to do.
        if self.sample_tfidf is not None:
            return

        self.sample_tfidf = {}
        self.get_per_sample_words_count(texts, field)

        name = "{}/global_idf_dic.p".format(self.flags.data_path)
        self.global_idf_dic = load_pickle(self.global_idf_dic, name, {})

        for text in texts:
            name = "{}/sample_tfidf_{}.p".format(self.flags.data_path, text)
            # Each text keeps its own idf sub-dictionary in the global dict.
            if text not in self.global_idf_dic:
                self.global_idf_dic[text] = {}
            if os.path.exists(name):
                # Cached on disk from a previous run.
                with open(name, 'rb') as f:  # was leaked: open() never closed
                    self.sample_tfidf[text] = pickle.load(f)
            else:
                tf_list = tf(self.sample_words_count[text], 0)
                idf_list = idf(tf_list, self.global_idf_dic[text], 0)
                tfidf_list = tf_idf(tf_list, idf_list, 0)
                with open(name, 'wb') as f:  # was leaked: open() never closed
                    pickle.dump(tfidf_list, f)
                self.sample_tfidf[text] = tfidf_list
            print("\n{} sample tfidf done".format(text))

        name = "{}/global_idf_dic.p".format(self.flags.data_path)
        save_pickle(self.global_idf_dic, name)
Exemplo n.º 2
0
    def get_global_words_count(self, texts, fields=None, silent=0):
        """
        Build self.words_count: {"train": Counter, "test": Counter} and
        self.global_word_count, the Counter summed over all texts.

        Input:
            texts: list of dataset keys, e.g. ["train", "test"]
            fields: data fields to harvest words from; defaults to ["Text"]
            silent: when 0, print per-text vocabulary statistics
        """
        if self.words_count is not None:
            return
        # None sentinel instead of a shared mutable default argument.
        if fields is None:
            fields = ["Text"]

        self.words_count = {}
        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        self.stem_dic = load_pickle(self.stem_dic, name, {})

        for text in texts:
            name = "{}/{}_total_count_{}.p".format(self.flags.data_path,
                                                   self.name, text)
            if os.path.exists(name):
                with open(name, 'rb') as f:  # was leaked: open() never closed
                    self.words_count[text] = pickle.load(f)
            else:
                print("gen", name)
                word_list = []
                # Fills word_list in place with words of the entire dataset
                # (global container, i.e. not per sample).
                df_global_word_container(self.data[text], fields, word_list)

                word_list = rm_stop_words(word_list)
                word_list = rm_punctuation(word_list)
                word_list = stem(word_list, self.stem_dic)
                word_count = Counter(word_list)
                with open(name, 'wb') as f:  # was leaked: open() never closed
                    pickle.dump(word_count, f)
                self.words_count[text] = word_count

            if silent == 0:
                print("\nnumber of different words in {}:".format(text),
                      len(self.words_count[text]))
                k = 10
                print("Top {} most common words in {}".format(k, text),
                      self.words_count[text].most_common(k))

        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        save_pickle(self.stem_dic, name)

        # sum() uses Counter.__add__, matching the original repeated "+"
        # accumulation (non-positive counts are dropped by Counter addition).
        self.global_word_count = sum(self.words_count.values(), Counter())
Exemplo n.º 3
0
 def select_top_k_tfidf_words(self, texts, k=10, slack=8):
     """
     Collect the union of the top (k + slack) tf-idf words (longer than
     2 characters) over every sample in the given texts; cached on disk.

     Input:
         texts: dataset keys whose per-sample tf-idf is scanned
         k, slack: keep the k + slack highest-scoring words per sample
     Returns:
         set of selected words
     """
     name = "{}/top{}-{}_tfidf_words.p".format(self.flags.data_path,k,slack)
     selected = load_pickle(None,name,set())
     if selected:
         return selected
     # NOTE(review): tf-idf is always built for training_text/test_text
     # regardless of the `texts` argument — confirm this is intended.
     self.get_per_sample_tfidf(['training_text','test_text'],"Text")
     for text in texts:
         data = self.sample_tfidf[text]
         for c,tfidf in enumerate(data):
             topk = sort_value(tfidf)[:k+slack]
             # Drop very short tokens (length <= 2).
             topk = {i for i in topk if len(i)>2}
             selected = selected.union(topk)
             if c>0 and c%1000 == 0:
                 print("{} documents done, sample {}, num {}".format(c,topk,len(selected)))
     print(len(selected))
     # `name` is unchanged since the load above; reuse it for the save.
     save_pickle(selected,name)
     return selected
Exemplo n.º 4
0
    def select_top_k_words(self, texts, field, mode="count", k=10, slack=8):
        """
        Union of the top (k + slack) key words (longer than 2 characters)
        per sample, scored by `mode` ("count", "tfidf" or "tf"); the
        result is cached on disk and returned as a set.
        """
        name = "{}/{}_top{}-{}_{}_words.p".format(self.flags.data_path,
                                                  self.name, k, slack, mode)
        selected = load_pickle(None, name, set())
        if len(selected):
            return selected
        print("gen", name)

        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        self.stem_dic = load_pickle(self.stem_dic, name, {})
        assert len(self.stem_dic)

        # Make sure the per-sample statistics for the chosen mode exist.
        if mode == "count":
            self.get_per_sample_words_count(texts, field)
        elif mode in ("tfidf", "tf"):
            # Both tf and tfidf come out of the same preprocessing pass.
            self.get_per_sample_tfidf(texts, field)
        else:
            print("unknown mode", mode)
            assert 0

        # Pick the per-sample data source once, outside the text loop.
        if mode == "count":
            source = self.sample_words_count
        elif mode == "tfidf":
            source = self.sample_tfidf
        elif mode == "tf":
            source = self.sample_tf

        limit = k + slack
        for text in texts:
            for idx, word_scores in enumerate(source[text]):
                best = sort_value(word_scores)[:limit]
                picked = {w for w in best if len(w) > 2}
                selected = selected | picked
                if idx > 0 and idx % 1000 == 0:
                    print(
                        "{} documents done, mode {}, sample {}, num {}".format(
                            idx, mode, picked, len(selected)))
        print("num of selected {} key words:".format(mode), len(selected))
        name = "{}/{}_top{}-{}_{}_words.p".format(self.flags.data_path,
                                                  self.name, k, slack, mode)
        save_pickle(selected, name)
        return selected
Exemplo n.º 5
0
    def get_split(self):
        """
        Load the cached stratified CV split, building and persisting it on
        first use. Fills self.split with one (train_index, test_index)
        pair per fold.
        """
        if self.split is not None:
            return
        path = "{}/split.p".format(self.flags.data_path)
        folds = load_pickle(None, path, [])

        if len(folds) == 0:
            from sklearn.model_selection import StratifiedKFold
            # Labels come from the training variants; classes are 1-based
            # in the data, shift to 0-based.
            labels = self.data["training_variants"]['Class'] - 1
            sample_ids = np.arange(labels.shape[0])
            # Fixed seed so the shuffle is reproducible across runs.
            splitter = StratifiedKFold(n_splits=self.flags.folds,
                                       shuffle=True,
                                       random_state=99)
            folds = list(splitter.split(sample_ids, labels))
            save_pickle(folds, path)
            print("new shuffle")
        self.split = folds
Exemplo n.º 6
0
    def get_per_sample_tfidf(self, texts, field, silent=0):
        """
        Compute (or load from cache) tf-idf scores; each sample is one
        document.

        Input:
            texts: list of dataset keys, e.g. ["train", "test"]
            field: name of the field holding the raw text
            silent: when 0, print a progress line per text
        Side effects:
            Fills self.sample_tfidf: {text: tfidf_list}; builds and
            persists self.global_idf_dic and per-text tf-idf pickles.
        """
        # Already computed during this run; nothing to do.
        if self.sample_tfidf is not None:
            return

        self.sample_tfidf = {}
        # Trailing 1 appears to be a silent flag for the tf step — confirm
        # against get_per_sample_tf's signature.
        self.get_per_sample_tf(texts, field, 1)

        name = "{}/{}_global_idf_dic.p".format(self.flags.data_path, self.name)
        self.global_idf_dic = load_pickle(self.global_idf_dic, name, {})
        if len(self.global_idf_dic) == 0:
            print("gen", name)
            # Build document-frequency stats over all non-noise texts so
            # every text shares one idf dictionary.
            all_tf_list = []
            for text in texts:
                if text not in self.noise_texts:
                    all_tf_list.extend(self.sample_tf[text])
            # idf() presumably fills self.global_idf_dic in place (the dict
            # is saved right after) — verify against its definition.
            idf(all_tf_list, self.global_idf_dic, 0)
            save_pickle(self.global_idf_dic, name)

        for text in texts:
            name = "{}/{}_sample_tfidf_{}.p".format(self.flags.data_path,
                                                    self.name, text)
            if os.path.exists(name):
                # Cached on disk from a previous run.
                self.sample_tfidf[text] = pickle.load(open(name, 'rb'))
            else:
                print("gen", name)
                tf_list = self.sample_tf[text]
                idf_list = self.get_idf_list(tf_list)
                tfidf_list = tf_idf(tf_list, idf_list, 0)
                pickle.dump(tfidf_list, open(name, 'wb'))
                self.sample_tfidf[text] = tfidf_list
            if silent == 0:
                print("\n{} sample tfidf done".format(text))