Example #1
def wordVectorTrainModel(trainDataPath, modelPath, vocabularyPath,
                         word2VecOrgPath):
    begintime = time()
    sentences = GetSentence(trainDataPath)
    # parameter configuration for training word vectors on the data
    model = gensim.models.Word2Vec(sentences,
                                   size=100,
                                   window=5,
                                   min_count=1,
                                   workers=4)
    loginformation("", "info", "The process is saving the model.")
    try:
        model.save(modelPath)
        model.wv.save_word2vec_format(word2VecOrgPath,
                                      vocabularyPath,
                                      binary=False)
    except:
        loginformation("", "info",
                       "Error occurs while saving the word vector model!!")
    else:
        loginformation("", "info",
                       "Finished the process of saving word vector model.")
    endtime = time()
    loginformation(
        "", "info",
        "Total word vector training time is: " + (endtime - begintime))
Example #2
def __iter__(self):  # an object that implements __iter__ is iterable
    for file in os.listdir(self.dirname):
        filePath = self.dirname + '/' + file
        try:
            for line in open(filePath):
                sline = line.strip()
                if sline == "":
                    continue
                rline = cleandata(sline)
                yield rline.split(" ")
        except IOError:
            loginformation(
                "", "info",
                "open file: " + self.dirname + '/' + file + " error ")
        else:
            loginformation("", "info",
                           "finish the file: " + self.dirname + '/' + file)
Example #3
def storeDataFile(dataFile, pickleFile):
    AllList = []
    with open(dataFile, 'r') as df:
        num = 0
        id = 0
        preid = -1
        title = ""
        text = ""
        for line in df:
            num += 1
            try:
                if num % 3 == 1:
                    id = int(line.split(" ")[1])
                    #loginformation("", "info", line.split(" ")[1])
                elif num % 3 == 2:
                    # drop the leading "abstract : " label (lstrip strips a character set, not a prefix)
                    title = line.rstrip('\n')
                    if title.startswith("abstract : "):
                        title = title[len("abstract : "):]
                else:
                    # drop the leading "text : " label for the same reason
                    text = line.rstrip('\n')
                    if text.startswith("text : "):
                        text = text[len("text : "):]
                    if preid + 1 == id and title.strip() != "":
                        AllList.append({
                            'text:': text,
                            'title:': title,
                            'id: ': id
                        })
                        #loginformation("", "info", {'text:': text, 'title:': title, 'id: ': id})
                    else:
                        print("id" + str(id) + " title: " + title + " text: " +
                              text)
                    preid = id

            except:
                loginformation(
                    "", "info", "Error occurs!!" + "text: " + text +
                    " title: " + title + " id: " + id)
                pass
    print(len(AllList))
    with open(pickleFile, 'wb') as pf:
        pickle.dump(AllList, pf, -1)
    return
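
A hedged counterpart for reading the pickle back; loadDataFile is a hypothetical helper name, not part of the original code:

import pickle

def loadDataFile(pickleFile):
    # load the list of {'text:', 'title:', 'id: '} dicts written by storeDataFile
    with open(pickleFile, 'rb') as pf:
        return pickle.load(pf)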
Example #4
def getSentenceVector(sentence, wordVectorModel, wordVectorModelDim):
    sentenceList = sentence.rstrip('\n').split("  ")
    sentenceWordVectorList = []
    for word in sentenceList:
        wordVector = []
        if (word != ' ' and word != ''):
            try:
                for i in range(wordVectorModelDim):
                    wordVector.append(str(wordVectorModel[word][i]))
            except KeyError:
                loginformation("", "debug", word + "not in the vocabulary !!!")
                for i in range(wordVectorModelDim):
                    wordVector.append("0.0")
                pass
            else:
                loginformation("", "info", "Get the vector of " + word)
        else:
            loginformation("", "info", "null word occrs")
        sentenceWordVectorList.append(wordVector)
    loginformation("", "info",
                   "transfered sentence " + sentence + " to word vector.")
    return sentenceWordVectorList
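
A minimal usage sketch, assuming a model trained as in Example #1 (vector size 100); the path and the words are placeholders:

import gensim

wordVectorModel = gensim.models.Word2Vec.load("model/word2vec.model")  # hypothetical path
# the function splits the sentence on double spaces, matching the segmented training data
vectors = getSentenceVector("word1  word2  word3\n", wordVectorModel.wv, 100)
print(len(vectors), len(vectors[0]))  # number of words, vector dimension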
Example #5
def calculate(data_file, statistical_result_file, sparse_word_file):
    loginformation("", "info", "count word number of file: " + data_file)
    with open(data_file, 'r') as df, open(statistical_result_file, 'w') as sf, open(sparse_word_file, 'w') as swf:
        try:
            word_box = []
            for line in df:
                word_box.extend(line.rstrip('\n').split(" "))
            for i in collections.Counter(word_box).items():
                (a, b) = i
                if(b > 3):
                    sf.write(a+":"+str(b)+"\n")
                else:
                    swf.write(a+":"+str(b)+"\n")
        except:
            loginformation("", "debug", "Errors of counting word number from file: " + data_file)
        else:
            loginformation("", "info", "finish count word number of " + data_file)
Example #6
def replaceSparseWord(originalFile, afterReplaceFile, sparseWordFile):
    sparseWordList = set(getSparseWordList(sparseWordFile))  # set for fast membership tests
    loginformation("", "info", "replace sparse words of file: " + originalFile)
    with open(originalFile, 'r') as of, open(afterReplaceFile, 'w') as arf:
        try:
            for line in of:
                for word in line.split(" "):
                    if(word in sparseWordList):
                        arf.write("UNK" + " ")
                    elif(getUNK(word)):
                        arf.write("UNK" + " ")
                    else:
                        arf.write(word + " ")
        except (UnicodeDecodeError, UnicodeEncodeError):
            loginformation("", "debug", "Error of decode or encode!!")
        else:
            loginformation("", "info", "finish replacing sparse words of the file: " + originalFile)
Example #7
def filterTrainingFile(training_file, after_filter_file):
    rownum = 0
    loginformation("", "info", "filer the trainging file to remove word: abstract : and text :  after segment: " + training_file)

    with open(training_file, 'r') as df, open(after_filter_file, 'w') as sf:
        try:
            for line in df:
                rownum += 1
                if (rownum % 4 == 1 or rownum % 4 == 2):
                    continue
                elif (rownum % 4 == 3):
                    if (line.find("occurring UnicodeDecodeError") == -1):
                        # drop the leading "abstract : " label (lstrip strips a character set, not a prefix)
                        sf.write(line[len("abstract : "):]
                                 if line.startswith("abstract : ") else line)
                else:
                    if (line.find("occurring UnicodeDecodeError") == -1):
                        # drop the leading "text : " label for the same reason
                        sf.write(line[len("text : "):]
                                 if line.startswith("text : ") else line)
        except UnicodeDecodeError:
            loginformation("", "debug", "UnicodeDecodeError while filtering")
        else:
            loginformation("", "info", "finish filter the file: " + training_file)
Example #8
def getSparseWordList(sparseWordFile):
    sparse_word_list = []
    loginformation("", "info", "get sparse word list from file: " + spareseWordFile)
    with open(sparseWordFile, 'r') as swf:
        try:
            for line in swf:
                num = 0
                for i in line.split(":")[0]:
                    if (i >= u'\u0041' and i <= u'\u005a') or (i >= u'\u0061' and i <= u'\u007a') \
                            or (i >= u'\u0030' and i <= u'\u0039'):
                        num += 1
                if(num > len(line.split(":")[0])/2):
                    continue
                else:
                    sparse_word_list.append(line.split(":")[0])
        except:
            loginformation("", "debug", "Errors occur while getting line from the file: " + spareseWordFile)
        else:
            loginformation("", "info", "finish get sparse word list")
    return sparse_word_list
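
calculate, getSparseWordList and replaceSparseWord chain into a single sparse-word replacement pass; a sketch with hypothetical file paths (getUNK, called inside replaceSparseWord, is assumed to be defined elsewhere in the project):

calculate("data/corpus_segmented.txt",    # hypothetical: segmented corpus, one sentence per line
          "data/word_counts.txt",         # hypothetical: words seen more than 3 times
          "data/sparse_words.txt")        # hypothetical: words seen 3 times or fewer
replaceSparseWord("data/corpus_segmented.txt",
                  "data/corpus_unk.txt",  # hypothetical: corpus with sparse words replaced by UNK
                  "data/sparse_words.txt")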
Example #9
def RunModel(dataPath, wordVectorModel):
    w1, w2, b1, b2 = EncoderInitial(
    )  #initialize the weight and bias of encoder
    w1_decode, w2_decode, b1_decode, b2_decode = DecoderInitial(
    )  #initialize the weight and bias of decoder
    x = tf.placeholder(
        tf.float32,
        shape=[1, 250])  #place holder, feature dimension of input word
    y_ = tf.placeholder(
        tf.float32,
        shape=[1, 200])  # place holder, feature dimension of output word
    x_2_decode = tf.placeholder(tf.float32, shape=[1, 250])
    y_2_decode = tf.placeholder(tf.float32, shape=[1, 200])
    y_2 = tf.placeholder(tf.float32, shape=[1, 50])
    first_input = np.zeros(
        (1,
         50))  # generate the first word input, to pad out the missing dimensions

    #accumulate loss
    #accumulate_loss = tf.placeholder(tf.float32, shape=[1,0])
    accumulate_loss = 0
    tf.summary.scalar("loss", accumulate_loss)
    #read the data of training file
    number = 1
    computation_graph = tf.InteractiveSession()
    sess = tf.Session()
    merged = tf.summary.merge_all()
    computation_graph.run()
    summary_writer = tf.summary.FileWriter("./log/")
    title_word_vector_list = []
    for line in open(dataPath):
        loginformation("", "info", "the line number is: " + str(number))
        if number % 3 == 1:
            number += 1
            continue
        if number % 3 == 2:  #get title vector
            line_list = line.rstrip("\n").split(" ")
            title_word_vector_list = GetWordVector(line_list, wordVectorModel)
            number += 1
            continue
        print(accumulate_loss)
        if number % 10 == 0:
            print(accumulate_loss)
            summary, loss = sess.run([merged, accumulate_loss],
                                     feed_dict=feed_dict(False))
            summary_writer.add_summary("./log/", number)
            loginformation("", "info", "loss is: " + str(accumulate_loss))
        if number % 3 == 0:
            line_list = line.rstrip("\n").split(" ")
            wordVectorList = GetWordVector(line_list, wordVectorModel)
            y_2_value = np.zeros((1, 50))
            y_2_value = np.ndarray.astype(y_2_value, dtype='float32')
            for wordIndex in range(len(wordVectorList)):
                if wordIndex == 0:
                    X = np.hstack((wordVectorList[wordIndex], y_2_value[0]))
                    X = X.reshape([1, 250])
                    X = np.ndarray.astype(X, dtype='float32')
                    y_2 = RnnEncoder(x.eval(feed_dict={x: X}), w1, w2, b1, b2)
                    #with sess.as_default():
                    #sess.run(y_2)
                else:  #revised here
                    #with sess.as_default():
                    X = np.hstack((wordVectorList[wordIndex], y_2[0].eval()))
                    X = X.reshape([1, 250])
                    X = np.ndarray.astype(X, dtype='float32')
                    y_2 = RnnEncoder(x.eval(feed_dict={x: X}), w1, w2, b1, b2)
                    #sess.run(y_2)
            y_2_decode_ = np.zeros([1, 200])
            y_2_decode_ = np.ndarray.astype(y_2_decode_, dtype='float32')
            for titleWordIndex in range(len(title_word_vector_list)):
                if titleWordIndex == 0:
                    X_2 = np.hstack((y_2[0].eval(), y_2_decode_[0]))
                    X_2 = X_2.reshape([1, 250])
                    X_2 = np.ndarray.astype(X_2, dtype='float32')
                    #with sess.as_default():
                    y_2_decode = RnnEncoder(
                        x_2_decode.eval(feed_dict={x_2_decode: X_2}),
                        w1_decode, w2_decode, b1_decode, b2_decode)
                    #sess.run(y_2_decode)

                else:
                    X_2 = np.hstack((y_2[0].eval(), y_2_decode[0].eval()))
                    X_2 = X_2.reshape([1, 250])
                    X_2 = np.ndarray.astype(X_2, dtype='float32')
                    y_2_decode = RnnEncoder(
                        x_2_decode.eval(feed_dict={x_2_decode: X_2}),
                        w1_decode, w2_decode, b1_decode, b2_decode)
                    #sess.run(y_2_decode)
                # accumulate the reconstruction loss over the title words
                titleword = title_word_vector_list[titleWordIndex].reshape(
                    [1, 200])
                accumulate_loss = accumulate_loss + tf.reduce_mean(
                    tf.square(titleword - y_2_decode))
            trainstep = tf.train.AdamOptimizer(1e-4).minimize(accumulate_loss)
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            sess.run(trainstep)
            #trainstep.run(feed_dict={x:X, x_2_decode:X_2, y_:title_word_vector_list[titleWordIndex]})
            number += 1
    sess.close()
    summary_writer.close()
    computation_graph.close()
Example #10
        b1_decode = tf.constant(_b1_decode, name='b1_decode')
        b2_decode = tf.constant(_b2_decode, name='b2_decode')

        sess = tf.Session()
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        graph_def = g.as_graph_def()
        # tf.train.write_graph expects a directory and a file name separately
        tf.train.write_graph(graph_def,
                             os.path.dirname(parameterfile),
                             os.path.basename(parameterfile))
        sess.close()
    return


if __name__ == "__main__":
    rootPath = os.path.abspath(".") + "/"
    loginformation("", "info", "Training Rnn Model")
    loginformation("", "info", "Get word vector infomation")
    dataPath = rootPath + "../../../../NeuralHeadlineGeneration/data/originalData/PART_I_trainW2V_segment.txt"
    wordVectorModel = gensim.models.Word2Vec.load(
        rootPath +
        "../../../../NeuralHeadlineGeneration/data/dataWordVector/PARTDATA_dim200_window10_mincount1.model"
    )
    RunModel(dataPath, wordVectorModel)
    '''
    y_1 = tf.placeholder(tf.float32, [1,50])
    y_s = np.zeros([1,50])
    y_s = np.ndarray.astype(y_s, dtype='float32')
    sess = tf.Session()
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    with sess.as_default():