示例#1
0
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD

from eval.EvalModel import eval_mulclass
from rep.RepresentationLayer import RepresentationLayer
from util.FileUtils import readFile

# Hyper-parameters and file locations for this script.
# max_len is passed to RepresentationLayer as max_sent_len and is also the
# length of every Input / Embedding sequence defined below.
max_len = 120
# NOTE(review): word_vec_dim is not referenced in the visible code; presumably
# it mirrors the 50d word2vec file named below -- confirm.
word_vec_dim = 50
# Second positional argument of the position Embedding.
position_vec_dim = 10
# NOTE(review): epoch_size is not referenced in the visible code; presumably
# the number of training epochs -- confirm.
epoch_size = 100
input_file = '../../data/final_corpus.txt'
word_vec_file = '/home/wang/PythonProjects/data/zhwiki_2017_03.sg_50d.word2vec'
# NOTE(review): output_file is not referenced in the visible code; presumably
# where the trained model is saved -- confirm.
output_file = '/home/wang/PythonProjects/data/CNN.model'

# Load input_file via readFile and construct the RepresentationLayer from the
# word-vector file.
# NOTE(review): the meaning of frequency=200000 is defined by
# RepresentationLayer, which is not visible here -- confirm.
lines = readFile(input_file)
rep = RepresentationLayer(wordvec_file=word_vec_file,
                          frequency=200000,
                          max_sent_len=max_len)

# Three int32 model inputs, each a sequence of length max_len.
# NOTE(review): `Input` is not imported in the visible lines -- confirm it is
# imported elsewhere in the file.
# NOTE(review): presumably `word` carries token indices and distance_e1 /
# distance_e2 carry per-token offsets to the two entities -- confirm.
word = Input(shape=(max_len, ), dtype='int32', name='word')
distance_e1 = Input(shape=(max_len, ), dtype='int32', name='distance_e1')
distance_e2 = Input(shape=(max_len, ), dtype='int32', name='distance_e2')

# Word embedding sized from rep.vec_table (rows x columns) and initialised
# with that table as its weights; index 0 is not masked (mask_zero=False).
word_emb = Embedding(rep.vec_table.shape[0],
                     rep.vec_table.shape[1],
                     weights=[rep.vec_table],
                     mask_zero=False,
                     input_length=max_len)
position_emb = Embedding(max_len * 2 + 1,
                         position_vec_dim,
示例#2
0
def replaceWithE(content):
    """Replace the braces around two marked spans with entity markers.

    The first ``{``/``}`` pair (located with ``str.find``) is replaced by
    ``E1_B``/``E1_E`` and the last ``{``/``}`` pair (located with
    ``str.rfind``) by ``E2_B``/``E2_E``. The marker constants are expected
    to be defined at module level.

    Args:
        content: Text containing two brace-delimited spans.

    Returns:
        The text with the four braces swapped for the entity markers. If
        ``content`` does not contain two distinct, correctly ordered brace
        pairs, it is returned unchanged.
    """
    first_open = content.find('{')
    first_close = content.find('}')
    last_open = content.rfind('{')
    last_close = content.rfind('}')

    # Without two ordered pairs the indices are -1 or overlap, and slicing
    # on them would silently duplicate or drop text (e.g. a single "{a}"
    # would be emitted twice). Leave such input untouched instead.
    if not -1 < first_open < first_close < last_open < last_close:
        return content

    return (
        content[:first_open]
        + E1_B + content[first_open + 1:first_close] + E1_E
        + content[first_close + 1:last_open]
        + E2_B + content[last_open + 1:last_close] + E2_E
        + content[last_close + 1:]
    )

# File locations used by this preprocessing script.
input_file = '../../data/GAD1-1000.txt'
input_file_eng = '../../data/GAD1-1000_lab.txt'
# NOTE(review): output_file is not referenced in the visible code; presumably
# the segmented lines are written there later -- confirm.
output_file = '../../data/newCropusSegment.txt'
dict_file = '../../data/dic.txt'

# Load both inputs; the loop below reads lab_lines[i] alongside lines[i].
lines = readFile(input_file)
lab_lines = readFile(input_file_eng)

# Register the user dictionary with jieba before any text is segmented.
jieba.load_userdict(dict_file)

# Build one space-separated, jieba-segmented string per input line.
newLines = []
for i, line in enumerate(lines):
    # lab_lines is indexed in step with lines; a shorter lab_lines still
    # raises IndexError here.
    label = getLabel(lab_lines[i])
    # Named line_type rather than `type` so the builtin type() is not
    # shadowed for the remainder of the script.
    line_type = getType(line)
    content = getContent(line)
    newContent = replaceWithE(content)

    newLine = label + line_type + newContent

    # str.join consumes the iterator returned by jieba.cut directly, so no
    # intermediate list is needed.
    newLines.append(' '.join(jieba.cut(newLine, cut_all=False)))