Example #1
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import globalVals as gl  # project-local module with data paths and file-name helpers

MAX_SEQUENCE_LENGTH = 200  # pad/truncate every text to this many tokens
# MAX_NB_WORDS = 20000
EMBEDDING_DIM = 200        # dimensionality of the word embeddings
POOL_SIZE = 100            # candidate-answer pool size per question

# raw (untokenized) data files
rawfile_train_path = gl.INSURANCE_DATA_RAW + gl.TRAIN_FILE(POOL_SIZE)
rawfile_valid_path = gl.INSURANCE_DATA_RAW + gl.VALID_FILE(POOL_SIZE)
rawfile_test_path = gl.INSURANCE_DATA_RAW + gl.TEST_FILE(POOL_SIZE)

rawfile_answers_path = gl.INSURANCE_DATA_RAW + gl.ANSWERS
rawfile_questions_path = gl.INSURANCE_DATA_RAW + gl.QUESTIONS

# tokenized data files
tokenfile_train_path = gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(POOL_SIZE)
tokenfile_valid_path = gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(POOL_SIZE)
tokenfile_test_path = gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(POOL_SIZE)

tokenfile_answers_path = gl.INSURANCE_DATA_TOKEN + gl.ANSWERS
tokenfile_questions_path = gl.INSURANCE_DATA_TOKEN + gl.QUESTIONS

# read the raw answers: each line is '<answer id>\t<answer text>'
# (the layout Example #4 parses); keep only the text
ans_texts = []
rawfile_answers = open(rawfile_answers_path)
for line in rawfile_answers:
    record = line.rstrip('\n').split('\t')
    ans_texts.append(record[1])
rawfile_answers.close()
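The listing breaks off here; given the imports above, a minimal sketch of the usual next step in Keras (Tokenizer and pad_sequences are real Keras APIs, while tokenizer, sequences and data are illustrative names, not from the original):

tokenizer = Tokenizer()  # or Tokenizer(num_words=MAX_NB_WORDS) to cap the vocabulary
tokenizer.fit_on_texts(ans_texts)  # build the word index from the answer texts
sequences = tokenizer.texts_to_sequences(ans_texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # shape: (num_answers, MAX_SEQUENCE_LENGTH)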
Example #2
import globalVals as gl

pools = [100, 500, 1000, 1500]

# build a question-text -> 1-based index over the questions file
quesFile = open(gl.INSURANCE_DATA_RAW + gl.QUESTIONS, 'r')
quesIndex = 0
quesDict = {}
for line in quesFile:
    quesIndex += 1
    record = line.rstrip("\n").split('\t')
    quesDict[record[1]] = quesIndex  # column 2 holds the question text
print(len(quesDict))
quesFile.close()

for poolSize in pools:
    rf_train = open(gl.INSURANCE_DATA_RAW + gl.TRAIN_FILE(poolSize))
    rf_valid = open(gl.INSURANCE_DATA_RAW + gl.VALID_FILE(poolSize))
    rf_test = open(gl.INSURANCE_DATA_RAW + gl.TEST_FILE(poolSize))

    tf_train = open(gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(poolSize), 'w')
    tf_valid = open(gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(poolSize), 'w')
    tf_test = open(gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(poolSize), 'w')

    # rewrite each train line as '<question index>\t<column 3>\t<column 4>'
    for line in rf_train:
        record = line.rstrip('\n').split('\t')
        ques_text = record[1]
        ques_index = quesDict.get(ques_text)
        newline = str(ques_index) + '\t' + record[2] + '\t' + record[3]
        tf_train.write(newline + '\n')
    rf_train.close()
    tf_train.close()
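The excerpt ends before the valid and test files opened above are written; a sketch of covering all three raw/token pairs with one helper (tokenize_file is an illustrative name, not from the original):

def tokenize_file(rf, tf):
    # same rewrite as the train loop above
    for line in rf:
        record = line.rstrip('\n').split('\t')
        ques_index = quesDict.get(record[1])
        tf.write(str(ques_index) + '\t' + record[2] + '\t' + record[3] + '\n')
    rf.close()
    tf.close()

for rf, tf in [(rf_valid, tf_valid), (rf_test, tf_test)]:  # inside the per-pool loop
    tokenize_file(rf, tf)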
Example #3
import logging
import time

import globalVals as gl

# (excerpt starts mid-file: logger, log_fh, log_ch, log_formatter and
#  ISOTIMEFORMAT are created earlier)
log_ch.setFormatter(log_formatter)
logger.addHandler(log_fh)
logger.addHandler(log_ch)
logger.setLevel(logging.DEBUG)
logger.info("starting at %s" % time.strftime(ISOTIMEFORMAT, time.localtime()))

MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 200
POOL_SIZE = 500

vocabulary_file = open(gl.INSURANCE_DATA_TOKEN + "vocabulary")

ans_file = open(gl.INSURANCE_DATA_TOKEN + gl.ANSWERS)
ques_file = open(gl.INSURANCE_DATA_TOKEN + gl.QUESTIONS)

train_file = open(gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(POOL_SIZE))
valid_file = open(gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(POOL_SIZE))
test_file = open(gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(POOL_SIZE))

# second, prepare text samples and their labels
logger.info('Processing text dataset')

# load the vocabulary: one '<token> <word>' pair per line
logger.info("loading word index from %s" % vocabulary_file.name)
vocb = {}
for line in vocabulary_file:
    record = line.split()
    vocb[record[0]] = record[1]
logger.info("loaded %d vocabulary entries" % len(vocb))
vocabulary_file.close()

# load answer sequences
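The listing stops at this comment; a minimal sketch of the announced step, assuming the tokenized answers file keeps the '<answer id>\t<tokens>' layout that Example #4 parses (ans_index is an illustrative name, not from the original):

ans_index = {}
for line in ans_file:
    record = line.rstrip('\n').split('\t')
    ans_index[record[0]] = record[1].split()  # answer id -> list of token ids
ans_file.close()
logger.info("loaded %d answers" % len(ans_index))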
Example #4
print("embedding index loaded")
vocbFile.close()

# load ans_index: answer id -> tokenized answer text
ans_file = open(gl.INSURANCE_DATA_DEMO + gl.ANSWERS)
print("loading answer index from:", ans_file.name)
for line in ans_file:
    record = line.rstrip("\n").split('\t')
    index = record[0]
    ans_text = text.text_to_word_sequence(record[1])
    ans_index[index] = ans_text
print("answer index loaded")
ans_file.close()

# load qapool: per question, its tokenized text and two answer-id lists
qapool_file = open(gl.INSURANCE_DATA_DEMO + gl.TRAIN_FILE(POOL_SIZE))
print("loading qapool from:", qapool_file.name)
q_count = 0
for line in qapool_file:
    q_count += 1
    record = line.rstrip("\n").split("\t")
    q_text = text.text_to_word_sequence(record[1])
    aps = record[2]  # space-separated answer ids (column 3)
    ams = record[3]  # space-separated answer ids (column 4)
    qus_index[q_count] = q_text
    qap_index[q_count] = np.asarray(aps.split(), dtype=np.int32)
    qam_index[q_count] = np.asarray(ams.split(), dtype=np.int32)
print("question index, qap pairs, qam pairs loaded")
qapool_file.close()

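# The layer below needs a weight matrix; a sketch of building it, assuming the
# embedding index loaded above is a dict embeddings_index (word -> vector) and
# word_index maps each word to an integer id; both are illustrative names,
# not from the original listing.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector  # words without a pretrained vector stay zero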
# prepare the embedding layer; Embedding() with no arguments would raise a
# TypeError, since input_dim and output_dim are required
embedding_layer = Embedding(embedding_matrix.shape[0],
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)