QA_TEST_FILE = "8thGr-NDMC-Test.csv" WORD2VEC_BIN = "GoogleNews-vectors-negative300.bin.gz" WORD2VEC_EMBED_SIZE = 300 LSTM_SEQLEN = 196 # from original model NUM_CHOICES = 4 # number of choices for multiple choice #### Load up the vectorizer qapairs = kaggle.get_question_answer_pairs( os.path.join(DATA_DIR, QA_TRAIN_FILE)) tqapairs = kaggle.get_question_answer_pairs(os.path.join( DATA_DIR, QA_TEST_FILE), is_test=True) word2idx = kaggle.build_vocab([], qapairs, tqapairs) vocab_size = len(word2idx) + 1 # include mask character 0 #### Load up the model with open(os.path.join(MODEL_DIR, MODEL_ARCH), "rb") as fjson: json = fjson.read() model = model_from_json(json) model.load_weights(os.path.join(MODEL_DIR, MODEL_WEIGHTS)) #### read in the data #### #### correct_answer = "B" question = "Which is a distinction between an epidemic and a pandemic?" answers = [ "the symptoms of the disease", "the geographical area affected", "the species of organisms infected", "the season in which the disease spreads"
EMBED_SIZE = 64 BATCH_SIZE = 256 NBR_EPOCHS = 20 stories = kaggle.get_stories(os.path.join(DATA_DIR, STORY_FILE)) story_maxlen = max([len(words) for words in stories]) # this part is only required to get the maximum sequence length qapairs = kaggle.get_question_answer_pairs( os.path.join(DATA_DIR, QA_TRAIN_FILE)) question_maxlen = max([len(qapair[0]) for qapair in qapairs]) answer_maxlen = max([len(qapair[1]) for qapair in qapairs]) seq_maxlen = max([story_maxlen, question_maxlen, answer_maxlen]) word2idx = kaggle.build_vocab(stories, qapairs, []) vocab_size = len(word2idx) Xs = kaggle.vectorize_stories(stories, word2idx, seq_maxlen) Xstrain, Xstest = train_test_split(Xs, test_size=0.3, random_state=42) print(Xstrain.shape, Xstest.shape) inputs = Input(shape=(seq_maxlen, vocab_size)) encoded = LSTM(EMBED_SIZE)(inputs) decoded = RepeatVector(seq_maxlen)(encoded) decoded = LSTM(vocab_size, return_sequences=True)(decoded) autoencoder = Model(inputs, decoded) autoencoder.compile("adadelta", loss="binary_crossentropy") autoencoder.fit(Xstrain,