Пример #1
0
    def __init__(self, file_name, args):
        """Load the corpus file and build the vocabulary-related state.

        Args:
            file_name: Name of the corpus file, resolved relative to
                ``args.dir``.
            args: Configuration object. This method reads ``args.dir``
                (directory containing the corpus) and ``args.using_hs``
                (truthy to build the Huffman tree codes/paths).

        Raises:
            OSError: If the corpus file cannot be opened.
        """
        self.args = args
        # Queue holding all generated word pairs.
        self.word_pair_catch = deque()
        # Sampling table (filled by init_sample_table()).
        self.sample_table = []
        # Number of words left after dropping those rarer than the minimum
        # count -- presumably updated by get_words(); confirm there.
        self.sentence_length = 0
        # Number of sentences.
        self.sentence_count = 0
        # word --> id
        self.word2id = {}
        # id --> word
        self.id2word = {}
        # Word frequencies.
        self.word_frequency = {}
        # Number of distinct words after removing low-frequency ones.
        self.word_count = 0

        # Read every line up front; the context manager guarantees the file
        # handle is closed instead of being leaked until garbage collection.
        corpus_path = os.path.join(self.args.dir, file_name)
        with open(corpus_path, encoding='utf-8') as corpus_file:
            self.input_file = corpus_file.readlines()

        self.get_words()
        self.init_sample_table()

        # The Huffman attributes are only created when hierarchical softmax
        # is requested via args.using_hs.
        if args.using_hs:
            tree = HuffmanTree(self.word_frequency)
            self.huffman_positive, self.huffman_negative = (
                tree.get_huffman_code_and_path())

        print('Word Count: %d' % len(self.word2id))
        print('Sentence Length: %d' % (self.sentence_length))
        print('Sentence count: %d' % (self.sentence_count))
Пример #2
0
 def __init__(self, file_name, min_count):
     """Build the vocabulary, sampling table and Huffman codes for a corpus.

     Args:
         file_name: Forwarded unchanged to ``get_words``.
         min_count: Forwarded unchanged to ``get_words``.

     NOTE(review): ``word_frequency``, ``word2id`` and ``sentence_length``
     are read here but never assigned in this method; presumably
     ``get_words`` populates them -- confirm against its definition.
     """
     self.get_words(file_name, min_count)
     print(" ")

     # Empty containers, created before the sampling table is initialised.
     self.cbow_count = list()
     self.word_pair_catch = deque()
     self.cbow_word_pair_catch = deque()

     self.init_sample_table()

     # Derive the Huffman codes/paths from the collected word frequencies.
     huffman = HuffmanTree(self.word_frequency)
     print("tree ", huffman)
     codes_and_paths = huffman.get_huffman_code_and_path()
     self.huffman_positive, self.huffman_negative = codes_and_paths

     vocabulary_size = len(self.word2id)
     print('Word Count: %d' % vocabulary_size)
     print('Sentence Length: %d' % self.sentence_length)