Example #1
    def get_clean_doc(self, texts, field, selected_words):
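        """
        Build self.clean_doc: one cleaned, stemmed word list per sample for
        each split in texts, keeping only words in selected_words.
        """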
        if self.clean_doc is not None:
            return

        self.clean_doc = {}

        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        self.stem_dic = load_pickle(self.stem_dic, name, {})
        assert len(self.stem_dic)

        for text in texts:
            name = "{}/{}_clean_doc_{}.p".format(self.flags.data_path,
                                                 self.name, text)
            if os.path.exists(name):
                self.clean_doc[text] = pickle.load(open(name, 'rb'))
            else:
                print("gen", name)
                word_lists = []  # list of lists; one list of words per sample
                # fills word_lists in place
                df_per_sample_word_lists(self.data[text], field, word_lists)
                clean_lists = []
                for c, word_list in enumerate(word_lists):
                    word_list = rm_stop_words(word_list)
                    word_list = rm_punctuation(word_list)
                    word_list = stem(word_list, self.stem_dic)
                    word_list = [
                        word for word in word_list if word in selected_words
                    ]
                    clean_lists.append(word_list)
                    if c % 1000 == 0:
                        print("{} docs cleaned {}".format(c, word_list[:10]))
                pickle.dump(clean_lists, open(name, 'wb'))
                self.clean_doc[text] = clean_lists
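
The load_pickle helper used at the top of this method is not shown on this page. Below is a minimal sketch of the behavior the call site appears to assume (keep an already-loaded object, otherwise unpickle it from disk, otherwise return the given default); the signature is inferred from the call above, not taken from the original project:

    import os
    import pickle

    def load_pickle(obj, path, default):
        # Keep the object if it has already been loaded.
        if obj is not None:
            return obj
        # Otherwise restore it from disk, falling back to the default.
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)
        return default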
Example #2
    def get_per_sample_words_count(self, texts, field, silent=0):
        """
        Each sample is a document.
        Input:
            texts: ["train","text"]
        """
        if self.sample_words_count is not None:
            return

        self.sample_words_count = {}
        self.get_global_words_count(texts, [field], 1)

        for text in texts:
            name = "{}/{}_sample_count_{}.p".format(self.flags.data_path,
                                                    self.name, text)
            if os.path.exists(name):
                self.sample_words_count[text] = pickle.load(open(name, 'rb'))
            else:
                print("gen", name)
                word_lists = []  # list of lists; one list of words per sample
                # fills word_lists in place
                df_per_sample_word_lists(self.data[text], field, word_lists)
                word_counts = []
                for word_list in word_lists:
                    word_list = rm_stop_words(word_list)
                    word_list = rm_punctuation(word_list)
                    word_list = stem(word_list, self.stem_dic)
                    word_counts.append(Counter(word_list))

                pickle.dump(word_counts, open(name, 'wb'))
                self.sample_words_count[text] = word_counts
            if silent == 0:
                print("\n{} sample words count done".format(text))
Example #3
    def get_global_words_count(self, texts, fields=["Text"], silent=0):
        """
        build self.words_count: {"train":Counter, "test":Counter}
        Input:
            texts: ["train","text"]
        """
        if self.words_count is not None:
            return

        self.words_count = {}
        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        self.stem_dic = load_pickle(self.stem_dic, name, {})

        for text in texts:
            name = "{}/{}_total_count_{}.p".format(self.flags.data_path,
                                                   self.name, text)
            if os.path.exists(name):
                self.words_count[text] = pickle.load(open(name, 'rb'))
            else:
                print("gen", name)
                word_list = []  # one flat word list for the whole split, not per sample
                # fills word_list in place
                df_global_word_container(self.data[text], fields, word_list)

                word_list = rm_stop_words(word_list)
                word_list = rm_punctuation(word_list)
                word_list = stem(word_list, self.stem_dic)
                word_count = Counter(word_list)
                pickle.dump(word_count, open(name, 'wb'))
                self.words_count[text] = word_count

            if silent == 0:
                print("\nnumber of different words in {}:".format(text),
                      len(self.words_count[text]))
                k = 10
                print("Top {} most common words in {}".format(k, text),
                      self.words_count[text].most_common(k))

        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        save_pickle(self.stem_dic, name)

        # merge the per-split counters into one global counter
        self.global_word_count = Counter()
        for counter in self.words_count.values():
            self.global_word_count += counter
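
Taken together, the three methods cache their results and rely on get_global_words_count having built self.stem_dic first. A plausible call order is sketched below; tp and the min-count threshold used to pick selected_words are illustrative, not from the original code:

    tp.get_global_words_count(["train", "test"], fields=["Text"])
    tp.get_per_sample_words_count(["train", "test"], "Text")
    selected_words = {w for w, n in tp.global_word_count.items() if n >= 5}
    tp.get_clean_doc(["train", "test"], "Text", selected_words)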