# Train a CNN relation-classification model over word and position embeddings.
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD

from eval.EvalModel import eval_mulclass
from rep.RepresentationLayer import RepresentationLayer
from util.FileUtils import readFile

# Hyperparameters.
max_len = 120          # maximum sentence length in tokens
word_vec_dim = 50      # dimensionality of the pre-trained word vectors
position_vec_dim = 10  # dimensionality of the position embeddings
epoch_size = 100       # number of training epochs

input_file = '../../data/final_corpus.txt'
word_vec_file = '/home/wang/PythonProjects/data/zhwiki_2017_03.sg_50d.word2vec'
output_file = '/home/wang/PythonProjects/data/CNN.model'

# Load the corpus and build the representation layer from the word2vec table.
lines = readFile(input_file)
rep = RepresentationLayer(wordvec_file=word_vec_file,
                          frequency=200000,
                          max_sent_len=max_len)

# Model inputs: token indices plus the relative distances to the two entities.
word = Input(shape=(max_len,), dtype='int32', name='word')
distance_e1 = Input(shape=(max_len,), dtype='int32', name='distance_e1')
distance_e2 = Input(shape=(max_len,), dtype='int32', name='distance_e2')

# Word embedding initialised from the pre-trained word2vec table.
word_emb = Embedding(rep.vec_table.shape[0], rep.vec_table.shape[1],
                     weights=[rep.vec_table], mask_zero=False,
                     input_length=max_len)
# Position embedding; relative distances fall in [-max_len, max_len].
position_emb = Embedding(max_len * 2 + 1, position_vec_dim,
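                         # The source breaks off inside this call; everything from here on
                         # is a minimal hypothetical sketch of a standard CNN relation
                         # classifier (Keras 2 functional API assumed), not the original
                         # model definition. Layer sizes below are assumptions.
                         input_length=max_len)

from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, concatenate
from keras.models import Model

# Embed the tokens and both position channels, then concatenate along the feature axis.
emb = concatenate([word_emb(word),
                   position_emb(distance_e1),
                   position_emb(distance_e2)])

# Convolution + max-over-time pooling, followed by a softmax classifier.
conv = Conv1D(filters=200, kernel_size=3, activation='relu')(emb)  # filter count/width assumed
pooled = Dropout(0.5)(GlobalMaxPooling1D()(conv))
output = Dense(2, activation='softmax', name='output')(pooled)     # binary relation assumed

model = Model(inputs=[word, distance_e1, distance_e2], outputs=output)
model.compile(loss='categorical_crossentropy',
              optimizer=SGD(lr=0.01),
              metrics=['accuracy'])

# Training would convert `lines` into index/distance arrays via `rep`, call
# model.fit(..., epochs=epoch_size), evaluate with eval_mulclass and save the
# result to output_file; that part depends on RepresentationLayer's API and is
# left out of this sketch.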
# Preprocess the GAD corpus: replace the two bracketed entities with marker
# tokens, prepend the label and relation type, and word-segment each line with
# jieba. The marker constants E1_B/E1_E/E2_B/E2_E and the helpers getLabel,
# getType and getContent are assumed to be defined elsewhere in the project.
import jieba

from util.FileUtils import readFile


def replaceWithE(content):
    """Replace the two {...} entity spans with the markers E1_B/E1_E and E2_B/E2_E."""
    index1 = content.find('{')
    index2 = content.find('}')
    index3 = content.rfind('{')
    index4 = content.rfind('}')
    newContent = content[:index1] + E1_B + content[index1 + 1:index2] + \
        E1_E + content[index2 + 1:index3] + E2_B + content[index3 + 1:index4] + \
        E2_E + content[index4 + 1:]
    return newContent


input_file = '../../data/GAD1-1000.txt'
input_file_eng = '../../data/GAD1-1000_lab.txt'
output_file = '../../data/newCropusSegment.txt'
dict_file = '../../data/dic.txt'

lines = readFile(input_file)
lab_lines = readFile(input_file_eng)
jieba.load_userdict(dict_file)  # custom dictionary so the entity markers survive segmentation

newLines = []
for i in range(len(lines)):
    label = getLabel(lab_lines[i])    # gold label from the annotation file
    type = getType(lines[i])          # relation-type field of the example
    content = getContent(lines[i])    # raw sentence text
    newContent = replaceWithE(content)
    newLine = label + type + newContent
    newLines.append(' '.join(list(jieba.cut(newLine, cut_all=False))))
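# --- assumed final step: nothing above writes the segmented corpus out, so here
# is a minimal sketch of persisting newLines to output_file; the exact output
# format is an assumption ---
import codecs

with codecs.open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(newLines) + '\n')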