Example #1
# Imports needed by this function; numpy and sys are only used if the
# commented-out debugging block below is re-enabled. Project-local helpers
# (DataParser, Preparer, SF, SummaryModel, CONFIG, run_epoch, _initWordDic)
# are defined elsewhere in the project this example comes from.
import sys

import numpy as np
import tensorflow as tf


def main():
    _initWordDic()
    # parse the data using dataParser
    parser = DataParser()
    docs, summary = parser.parseFile()
    p_doc = Preparer(docs)
    p_summary = Preparer(summary, is_summary=True)
    p_doc.cutDocs()
    p_summary.cutDocs()
    docLens = p_doc.countDocs()
    sumLens = p_summary.countDocs()
    print(max(sumLens))
    #sys.exit()
    p_doc.doc2Int()
    p_summary.doc2Int()
    # docs, docLens, summary, sumLens are the data
    data = list(zip(docs, summary, docLens, sumLens))
    training_data = data[:1585]
    validation_data = data[1585:1835]
    testing_data = data[1835:]
    ''' FIXING THE DIMENSION ISSUES OF BATCHES
    sf_train = SF(training_data, CONFIG.BATCH_SIZE, is_training = True)
    sf_valid = SF(validation_data, CONFIG.BATCH_SIZE, is_training = False)
    for tup in sf_train.get_batch(): 
        _, doc, summary, docLens, sumLens = tup
        doc_batch = _get_doc_batch(doc)
        summary_batch = _get_summary_batch(summary)
        label_batch = _get_label_batch(summary)
        docLens = np.array(docLens)
        summaryLens = np.array(sumLens)  
        print (doc_batch[0])
        print (summary_batch[0])
        print (label_batch[0])
        print (list(doc for doc in docLens))
        print (list(doc for doc in summaryLens))
        sys.exit()'''

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-1, 1)
        with tf.name_scope('Train'):
            with tf.variable_scope('Model',
                                   reuse=None,
                                   initializer=initializer):
                m = SummaryModel(is_training=True)
        with tf.name_scope('Valid'):
            with tf.variable_scope('Model',
                                   reuse=True,
                                   initializer=initializer):
                m_valid = SummaryModel(is_training=False)
        with tf.name_scope('Test'):
            with tf.variable_scope('Model',
                                   reuse=True,
                                   initializer=initializer):
                m_test = SummaryModel(is_training=False)

        init_op = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = '7'
        sess = tf.Session(config=config)
        sess.run(init_op)
        for epoch in range(CONFIG.EPOCH):
            print('---------------- running epoch ' + str(epoch) +
                  ' ----------------')
            run_epoch(sess, m, m_valid, training_data, validation_data)
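
The Train/Valid/Test blocks above rely on TF1's variable-scope sharing: each model is built inside the same variable_scope('Model'), and the evaluation copies pass reuse=True so they read the weights created by the training copy. A minimal sketch of that pattern, with a single tf.get_variable standing in for SummaryModel's parameters:

import tensorflow as tf

def build_model():
    # tf.get_variable respects the enclosing variable_scope; with reuse=True
    # it returns the variable created on the first call instead of a new one.
    return tf.get_variable('w', shape=[4, 2])

with tf.Graph().as_default():
    with tf.variable_scope('Model', reuse=None):
        w_train = build_model()
    with tf.variable_scope('Model', reuse=True):
        w_valid = build_model()
    print(w_train is w_valid)  # True: both handles point at the same weights
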
Example #2
# `re` is used below; input_text, labels and DIM_RNN are defined earlier in
# the script this snippet was taken from.
import re

temp = []
count = 0
for block in input_text:
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', block)
    temp.append([sent for sent in m.groupdict()['postcolon'].split('.') if sent])
input_text = temp

for i in range(len(input_text)):
    for j in range(len(input_text[i])):
        tokens = re.sub(r"[^a-z0-9]+", " ", input_text[i][j].lower()).split()
        input_text[i][j] = tokens
    
# doc: input_text[i], which is a list of sentences (list of words)

from prepare_sentences import Preparer
P = Preparer(DIM_RNN, input_text)
docLengths = P.cutDocs()  # docLengths[i]: list of unpadded sentence lengths for document i
# we add the label and length information to the data
# prepare labels for each sentence for each doc
sentence_labels = []
for i in range(len(labels)):
    # for each document, assign labels to all sentences
    num_sen = len(input_text[i])
    sentence_labels.append([labels[i]]*num_sen)

# IMPORTANT - format of data:
# data[i][0]: document (list of fixed-size sentences (list of words))
# data[i][1]: list of labels, one per sentence of the document
# data[i][2]: list of unpadded sentence lengths for the document
text_labels = [[input_text[i], sentence_labels[i], docLengths[i]]
               for i in range(len(input_text))]
training_data = text_labels[:1585]
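
A tiny worked example (with made-up documents and labels) of how the loop above copies a document-level label onto every sentence of that document:

# Hypothetical two-document corpus: doc 0 has two sentences, doc 1 has one.
input_text = [[['the', 'cat', 'sat'], ['it', 'slept']],
              [['dogs', 'bark']]]
labels = [1, 0]  # one label per document

sentence_labels = []
for i in range(len(labels)):
    sentence_labels.append([labels[i]] * len(input_text[i]))

print(sentence_labels)  # [[1, 1], [0]]
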
Example #3
File: RNNLM.py Project: HDG94/dl
# `re` is used below; input_text, DIM_RNN, STARTWORD and STOPWORD are defined
# earlier in RNNLM.py.
import re

temp = []
count = 0
for block in input_text:
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', block)
    temp.append(
        [sent for sent in m.groupdict()['postcolon'].split('.') if sent])
input_text = temp

for i in range(len(input_text)):
    for j in range(len(input_text[i])):
        tokens = re.sub(r"[^a-z0-9]+", " ", input_text[i][j].lower()).split()
        input_text[i][j] = tokens

from prepare_sentences import Preparer
P = Preparer(DIM_RNN, input_text)
P.addStartStopWords(STARTWORD, STOPWORD)
docLengths = P.cutDocs()  # docLengths[i]: list of unpadded sentence lengths for document i
WORDCOUNTS = P.getTotalWordCount()
print(WORDCOUNTS)

# IMPORTANT - format of data:
# data[i][0]: document (list of fixed-size sentences (list of words))
# data[i][1]: list of unpadded sentence lengths for the document
text_labels = [[input_text[i], docLengths[i]] for i in range(len(input_text))]
training_data = text_labels[:1585]
validation_data = text_labels[1585:1835]
testing_data = text_labels[1835:]

from gensim.models import Word2Vec
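
The snippet ends at the gensim import; a minimal sketch of how the prepared sentences could be fed to Word2Vec. The hyperparameters, and the use of DIM_RNN as the embedding size, are assumptions; gensim >= 4 names the size argument vector_size (older releases call it size):

# Word2Vec expects a flat iterable of token lists, so flatten the documents.
sentences = [sent for doc in input_text for sent in doc]

# Assumed hyperparameters; vector_size=DIM_RNN is only a guess at the intent.
w2v = Word2Vec(sentences, vector_size=DIM_RNN, window=5, min_count=1, workers=4)

# Embedding learned for the start token (assuming addStartStopWords inserted
# STARTWORD into every sentence).
print(w2v.wv[STARTWORD])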