class Char2Vec(Singleton):
    """Lookup from characters to rows of a pre-computed embedding array.

    On construction the array stored at ``char2vec_path`` is loaded with
    ``np.load``; if ``check_uptodate`` reports that file as stale it is first
    regenerated by ``_gen_char2vec()``. Rows are addressed by the integer id
    that ``CharDict.char2int`` returns for a character.
    """

    def __init__(self):
        if not check_uptodate(char2vec_path):
            _gen_char2vec()
        self.embedding = np.load(char2vec_path)
        self.char_dict = CharDict()

    def get_embedding(self):
        """Return the whole embedding array as loaded from disk."""
        return self.embedding

    def get_vect(self, ch):
        """Return the embedding row for the single character ``ch``.

        NOTE(review): if ``char2int`` returns a negative id for an unknown
        character, NumPy indexing would silently pick a row counted from the
        end of the array -- confirm callers only pass known characters.
        """
        return self.embedding[self.char_dict.char2int(ch)]

    def get_vects(self, text):
        """Return the embedding rows for every character of ``text``, stacked.

        Args:
            text: a sized iterable of characters (e.g. a string).

        Returns:
            An array with one row per character, in order. For empty ``text``
            an empty array of shape ``(0, CHAR_VEC_DIM)`` is returned.
        """
        if len(text) == 0:
            # np.stack cannot stack zero arrays, so build the empty result
            # explicitly with the expected second dimension.
            return np.reshape(np.array([[]]), [0, CHAR_VEC_DIM])
        # np.stack requires a real sequence; on Python 3 ``map`` yields a lazy
        # iterator, which NumPy rejects, so materialise the rows in a list.
        return np.stack([self.get_vect(ch) for ch in text])
def _gen_poems():
    """Filter the raw corpora into ``poems_path``, one poem per output line.

    For every file named in ``_corpus_list`` (under ``raw_dir``) the first
    line is skipped, and for each remaining line the last whitespace-separated
    field is split with ``split_sentences``. A poem is kept only when
    ``CharDict.char2int`` returns a non-negative id for every character of
    every sentence; kept poems are written as their sentences joined by single
    spaces. Progress messages are printed to stdout.
    """
    print("Parsing poems ...")
    char_dict = CharDict()
    # NOTE(review): files are opened with the platform default encoding, as
    # before -- confirm that matches the encoding of the corpora.
    with open(poems_path, 'w') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r') as fin:
                # Skip the first line of each corpus; the default keeps an
                # empty file from raising StopIteration.
                next(fin, None)
                # Stream the file instead of loading it all via readlines().
                for line in fin:
                    fields = line.split()
                    if not fields:
                        # A blank line has no last field to parse; previously
                        # this raised IndexError.
                        continue
                    sentences = split_sentences(fields[-1])
                    all_char_in_dict = all(
                        char_dict.char2int(ch) >= 0
                        for sentence in sentences
                        for ch in sentence
                    )
                    if all_char_in_dict:
                        fout.write(' '.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)
def process(in_path, out_path):
    """Copy the usable entries of a text embedding file to ``out_path``.

    The first line of ``in_path`` is a header whose first field is read as the
    total number of lines; it is used only for progress reporting and is not
    written to the output. Every following line is copied verbatim when its
    first whitespace-separated field (the word) is at most three characters
    long and ``CharDict.char2int`` returns a non-negative id for each of its
    characters. Blank lines are skipped.

    Args:
        in_path: path of the embedding file to read.
        out_path: path of the filtered file to write (overwritten).

    Raises:
        IndexError: if the input file has no header fields.
        ValueError: if the header's first field is not an integer.
    """
    max_word_len = 3       # longer words are dropped
    report_every = 80000   # print progress after this many kept lines

    # ``with`` guarantees both files are closed even if parsing raises.
    with open(in_path, 'r') as f_in, open(out_path, 'w') as f_out:
        header = f_in.readline().split()
        num_of_lines = int(header[0])
        char_dict = CharDict()
        count = 0
        for line in f_in:
            data = line.split()
            if not data:
                # Blank line: no word to inspect (previously an IndexError).
                continue
            word = data[0]
            if any(char_dict.char2int(c) < 0 for c in word):
                continue
            if len(word) > max_word_len:
                continue
            f_out.write(line)
            count += 1
            if count % report_every == 0:
                # '\r' with end='' redraws the progress on the same line.
                print('\r {c} / {t} {p}%'.format(c=count, t=num_of_lines,
                                                 p=int(count * 100 / num_of_lines)),
                      end='')