def __init__(self, file_name, args):
    """Read the corpus file and prepare the vocabulary-related state.

    The constructor resets all bookkeeping attributes, loads every line of
    the input file into memory, then delegates to ``self.get_words()`` and
    ``self.init_sample_table()``. When ``args.using_hs`` is truthy it also
    builds a ``HuffmanTree`` from ``self.word_frequency`` and stores the
    two values returned by ``get_huffman_code_and_path()``.

    Args:
        file_name: Name of the corpus file, resolved relative to
            ``args.dir``.
        args: Configuration object; this method reads its ``dir`` and
            ``using_hs`` attributes.
    """
    self.args = args
    # Queue holding all word pairs.
    self.word_pair_catch = deque()
    # Sampling table.
    self.sample_table = []
    # Number of words left after dropping those rarer than the minimum count.
    self.sentence_length = 0
    # Number of sentences.
    self.sentence_count = 0
    # word -> id
    self.word2id = {}
    # id -> word
    self.id2word = {}
    # Word frequencies.
    self.word_frequency = {}
    # Vocabulary size after de-duplication and low-frequency filtering.
    self.word_count = 0

    # Read inside a context manager so the file handle is closed promptly
    # instead of being left to the garbage collector.
    corpus_path = os.path.join(self.args.dir, file_name)
    with open(corpus_path, encoding='utf-8') as corpus_file:
        self.input_file = corpus_file.readlines()

    # NOTE(review): presumably get_words() fills the counters and mappings
    # initialised above from self.input_file -- confirm against its body.
    self.get_words()
    self.init_sample_table()

    if args.using_hs:
        tree = HuffmanTree(self.word_frequency)
        self.huffman_positive, self.huffman_negative = (
            tree.get_huffman_code_and_path()
        )

    print('Word Count: %d' % len(self.word2id))
    print('Sentence Length: %d' % (self.sentence_length))
    print('Sentence count: %d' % (self.sentence_count))
def __init__(self, file_name, min_count):
    """Build the vocabulary, pair queues, sample table and Huffman codes.

    Delegates to ``self.get_words(file_name, min_count)`` first, then
    creates the empty CBOW/pair containers, calls
    ``self.init_sample_table()``, and finally builds a ``HuffmanTree``
    from ``self.word_frequency``.

    Args:
        file_name: Passed straight through to ``self.get_words``.
        min_count: Passed straight through to ``self.get_words``.
    """
    # NOTE(review): word_frequency, word2id and sentence_length are read
    # below but never assigned here, so get_words() is expected to set
    # them -- confirm against its body.
    self.get_words(file_name, min_count)
    print(" ")

    # Empty containers for the pair caches.
    self.cbow_count = []
    self.word_pair_catch = deque()
    self.cbow_word_pair_catch = deque()

    self.init_sample_table()

    huffman_tree = HuffmanTree(self.word_frequency)
    print("tree ", huffman_tree)
    code_and_path = huffman_tree.get_huffman_code_and_path()
    self.huffman_positive, self.huffman_negative = code_and_path

    vocabulary_size = len(self.word2id)
    print('Word Count: %d' % vocabulary_size)
    print('Sentence Length: %d' % self.sentence_length)