def __init__(self, filepath, data_path, embedding_path, embedding=None):
    """
    Fits the tokenizer on the essay data and builds and compiles the network

    Parameters
    ----------
    filepath : str
        Stored unchanged on the instance as `_filepath`. Presumably the
        location the trained model is saved to / loaded from -- confirm
        against the parent class.
    data_path : str
        Passed to `score_model_helper.get_dataframe`; the 'essay' column of
        the result is used to fit the tokenizer. Also kept as the default
        data path.
    embedding_path : str
        Handed to `get_embedding_matrix` when no `embedding` is supplied.
    embedding : optional
        Ready-made embedding weights. When None, the weights are built from
        `embedding_path` instead. Presumably of shape (vocab_size, 300) to
        match the Embedding layer -- confirm.
    """
    super().__init__()

    # NOTE(review): self._tokenizer and self._model are not created here;
    # presumably the parent initialiser provides them -- confirm.
    essays = score_model_helper.get_dataframe(data_path)['essay']
    self._tokenizer.fit_on_texts(essays)
    self._vocab_size = len(self._tokenizer.word_index) + 1

    # Use the caller's weights when given, otherwise build them from disk.
    self._embedding = (
        self.get_embedding_matrix(embedding_path)
        if embedding is None
        else embedding
    )

    # Layers are listed (and therefore created and added) in network order.
    layers = [
        Embedding(self._vocab_size, 300, weights=[self._embedding],
                  input_length=200, trainable=False),
        LSTM(128, dropout=0.1, return_sequences=True),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    for layer in layers:
        self._model.add(layer)

    self._model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=['mae', 'mape', 'mse'],
    )

    self._filepath = filepath
    self.__data_path = data_path
def load_data(self, filepath=None):
    """
    Loads the data into the score model, and then initiates model training

    Parameters
    ----------
    filepath : str
        Should be a filepath to a .csv file with an 'essay_id', 'essay_set',
        'essay', and 'domain1_score' column. If not provided, then the
        default filepath will be used in its place if one exists.

    Returns
    -------
    bool
        The result of `train_and_test`; documented as True if the model
        training was successful, otherwise False.
    """
    # essay_set -> (offset subtracted from the raw score, divisor).
    # Presumably each set's minimum score and score range, so that the
    # normalised target falls in [0, 1] -- confirm against the data set.
    normalisation = {
        1: (2, 10),
        2: (1, 5),
        3: (0, 3),
        4: (0, 3),
        5: (0, 4),
        6: (0, 4),
        7: (0, 30),
        8: (0, 60),
    }

    if filepath is not None:
        self.__data_path = filepath
    x = score_model_helper.get_dataframe(self.__data_path)

    # Vectorised lookup: rows whose essay_set is not in the table map to
    # NaN, so their 'normal' value is NaN (they are not dropped).
    essay_sets = x['essay_set']
    offsets = essay_sets.map({k: v[0] for k, v in normalisation.items()})
    divisors = essay_sets.map({k: v[1] for k, v in normalisation.items()})
    normal = ((x['domain1_score'] - offsets) / divisors).astype('float32')

    # Build the target frame in one step instead of enlarging it row by
    # row, which re-allocates the frame on every insert. The row labels of
    # x are kept so that x and y stay aligned.
    y = pandas.DataFrame(
        {'essay_id': x['essay_id'].values, 'normal': normal.values},
        index=x.index,
    )

    # The trailing 4, 4 are passed through unchanged; their meaning is
    # defined by train_and_test.
    return self.train_and_test(x, y, 4, 4)
def load_data(self, filepath=None):
    """
    Loads the data into the score model, and then initiates model training

    Parameters
    ----------
    filepath : str
        Should be a filepath to a .csv file with an 'essay_id', 'essay' and
        'comments' column, where the 'comments' column should contain
        'ID#,ORG#,STY#', where the # is either 1, 2, or 3. If not provided,
        then the default filepath will be used in its place if one exists.

    Returns
    -------
    bool
        The result of `train_and_test`; documented as True if the model
        training was successful, otherwise False.
    """
    if filepath is not None:
        self.__data_path = filepath
    x = score_model_helper.get_dataframe(self.__data_path)

    # Only the third comma-separated field of each comment is graded:
    # a '1' maps to 0.0, otherwise a '2' maps to 0.5, anything else to 1.0.
    # Plain Python string handling is kept on purpose so that a malformed
    # comment (non-string, or fewer than three fields) still raises.
    targets = []
    for comment in x['comments']:
        tag = comment.split(',')[2]
        if '1' in tag:
            targets.append(0.0)
        elif '2' in tag:
            targets.append(0.5)
        else:
            targets.append(1.0)

    # Build the target frame in one step instead of enlarging it row by
    # row, which re-allocates the frame on every insert. The row labels of
    # x are kept so that x and y stay aligned.
    y = pandas.DataFrame(
        {
            'essay_id': x['essay_id'].values,
            'normal': np.asarray(targets, dtype='float32'),
        },
        index=x.index,
    )

    # The trailing 8, 4 are passed through unchanged; their meaning is
    # defined by train_and_test.
    return self.train_and_test(x, y, 8, 4)