import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

import globalVals as gl

MAX_SEQUENCE_LENGTH = 200
# MAX_NB_WORDS = 20000
EMBEDDING_DIM = 200
POOL_SIZE = 100

rawfile_train_path = gl.INSURANCE_DATA_RAW + gl.TRAIN_FILE(POOL_SIZE)
rawfile_valid_path = gl.INSURANCE_DATA_RAW + gl.VALID_FILE(POOL_SIZE)
rawfile_test_path = gl.INSURANCE_DATA_RAW + gl.TEST_FILE(POOL_SIZE)
rawfile_answers_path = gl.INSURANCE_DATA_RAW + gl.ANSWERS
rawfile_questions_path = gl.INSURANCE_DATA_RAW + gl.QUESTIONS

tokenfile_train_path = gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(POOL_SIZE)
tokenfile_valid_path = gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(POOL_SIZE)
tokenfile_test_path = gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(POOL_SIZE)
tokenfile_answers_path = gl.INSURANCE_DATA_TOKEN + gl.ANSWERS
tokenfile_questions_path = gl.INSURANCE_DATA_TOKEN + gl.QUESTIONS

ans_texts = []
rawfile_answers = open(rawfile_answers_path)
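# --- hedged sketch (not part of the original file): a typical continuation that
# matches the imports at the top of the file.  It assumes each line of the raw
# answers file is tab-separated as "answer_index \t answer text"; the exact
# column layout is an assumption.  Tokenizer / pad_sequences are already
# imported above.
for line in rawfile_answers:
    record = line.rstrip("\n").split('\t')
    ans_texts.append(record[1])
rawfile_answers.close()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(ans_texts)
ans_sequences = tokenizer.texts_to_sequences(ans_texts)
ans_data = pad_sequences(ans_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("tokenized %d answers, padded shape %s" % (len(ans_texts), ans_data.shape))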
import globalVals as gl

pools = [100, 500, 1000, 1500]

# build a question-text -> question-index lookup from the raw questions file
quesFile = open(gl.INSURANCE_DATA_RAW + gl.QUESTIONS, 'r')
quesIndex = 0
quesDict = {}
for line in quesFile:
    quesIndex += 1
    record = line.rstrip("\n").split('\t')
    quesDict[record[1]] = quesIndex
print(len(quesDict))

for poolSize in pools:
    rf_train = open(gl.INSURANCE_DATA_RAW + gl.TRAIN_FILE(poolSize))
    rf_valid = open(gl.INSURANCE_DATA_RAW + gl.VALID_FILE(poolSize))
    rf_test = open(gl.INSURANCE_DATA_RAW + gl.TEST_FILE(poolSize))
    tf_train = open(gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(poolSize), 'w')
    tf_valid = open(gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(poolSize), 'w')
    tf_test = open(gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(poolSize), 'w')

    # replace the question text in each training record with its index
    for line in rf_train:
        record = line.rstrip('\n').split('\t')
        ques_text = record[1]
        ques_index = quesDict.get(ques_text)
        newline = str(ques_index) + '\t' + record[2] + '\t' + record[3]
        tf_train.write(newline + '\n')
    rf_train.close()
    tf_train.close()
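    # --- hedged sketch (not in the original snippet): rf_valid/rf_test and
    # tf_valid/tf_test are opened above but never written or closed in the code
    # shown.  Presumably they are converted the same way as the training file;
    # this assumes the raw valid/test records use the same tab-separated layout.
    for rf, tf in ((rf_valid, tf_valid), (rf_test, tf_test)):
        for line in rf:
            record = line.rstrip('\n').split('\t')
            ques_index = quesDict.get(record[1])
            tf.write(str(ques_index) + '\t' + record[2] + '\t' + record[3] + '\n')
        rf.close()
        tf.close()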
log_ch.setFormatter(log_formatter)
logger.addHandler(log_fh)
logger.addHandler(log_ch)
logger.setLevel(logging.DEBUG)
logger.info("starting at %s" % time.strftime(ISOTIMEFORMAT, time.localtime()))

MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 200
POOL_SIZE = 500

vocabulary_file = open(gl.INSURANCE_DATA_TOKEN + "vocabulary")
ans_file = open(gl.INSURANCE_DATA_TOKEN + gl.ANSWERS)
ques_file = open(gl.INSURANCE_DATA_TOKEN + gl.QUESTIONS)
train_file = open(gl.INSURANCE_DATA_TOKEN + gl.TRAIN_FILE(POOL_SIZE))
valid_file = open(gl.INSURANCE_DATA_TOKEN + gl.VALID_FILE(POOL_SIZE))
test_file = open(gl.INSURANCE_DATA_TOKEN + gl.TEST_FILE(POOL_SIZE))

# second, prepare text samples and their labels
logger.info('Processing text dataset')

# load the vocabulary of the texts
logger.info("loading word index from %s" % vocabulary_file.name)
vocb = {}
for line in vocabulary_file:
    record = line.split()
    vocb[record[0]] = record[1]
logger.info("loaded %d vocabulary entries" % len(vocb))

# load answer sequences
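# --- hedged sketch (not in the original file): a plausible continuation for
# loading the answer sequences.  It assumes each line of the token answers file
# is tab-separated as "answer_index \t token_1 token_2 ...", matching the layout
# of the other token files; the dict name `ans_seqs` is an assumption.
ans_seqs = {}
for line in ans_file:
    record = line.rstrip("\n").split('\t')
    ans_seqs[record[0]] = record[1].split()
logger.info("loaded %d answer sequences" % len(ans_seqs))
ans_file.close()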
print("embedding index loaded") vocbFile.close() # load ans_index ans_file = open(gl.INSURANCE_DATA_DEMO + gl.ANSWERS) print("loading answer index from:", ans_file.name) for line in ans_file: record = line.rstrip("\n").split('\t') index = record[0] ans_text = text.text_to_word_sequence(record[1]) ans_index[index] = ans_text print("answer index loaded") ans_file.close() # load qapool qapool_file = open(gl.INSURANCE_DATA_DEMO + gl.TRAIN_FILE(POOL_SIZE)) print("loading qapool from:", qapool_file.name) q_count = 0; for line in qapool_file: q_count += 1 record = line.rstrip("\n").split("\t") q_text = text.text_to_word_sequence(record[1]) aps = record[2] ams = record[3] qus_index[q_count] = q_text qap_index[q_count] = np.asarray(aps.split(), dtype=np.int32) qam_index[q_count] = np.asarray(ams.split(), dtype=np.int32) print("qusetion index、qap pairs、qam pairs lodaded") # preparing the embedding layer embedding_layer=Embedding()