def process_metadata(metadata_list, mappings):
    """
    Using the mappings created above, this converts all of the metadata into numeric values
    and reshapes it to a fixed tensor size.

    Parameters:
        metadata_list: the original metadata for all exercises
        mappings: a tuple of all mapping dictionaries and a list of all morphological features.

    Returns:
        a processed python array of the metadata, with the intended shape of
        (num_of_exercises, MAX_TOKEN_SIZE, num_of_features), where both MAX_TOKEN_SIZE
        and num_of_features are fixed.
    """
    (user_to_id, countries_to_id, client_to_id, session_to_id, format_to_id,
     part_of_speech_to_id, dependency_edge_label_to_id, all_morphological_features) = mappings
    num_of_features = 10 + len(all_morphological_features)

    for i in range(len(metadata_list)):
        metadata = metadata_list[i]
        for j in range(len(metadata)):
            m = metadata[j]
            # map each feature to its unique id
            m[0] = user_to_id[m[0]]                    # 0 user
            m[1] = countries_to_id[m[1]]               # 1 countries
            m[2] = cast_to_float(m[2])                 # 2 days
            m[3] = client_to_id[m[3]]                  # 3 client
            m[4] = session_to_id[m[4]]                 # 4 session
            m[5] = format_to_id[m[5]]                  # 5 format
            m[6] = cast_to_float(m[6])                 # 6 time
            m[7] = part_of_speech_to_id[m[7]]          # 7 part_of_speech
            m[8] = dependency_edge_label_to_id[m[8]]   # 8 dependency_edge_label
            m[9] = cast_to_int(m[9])                   # 9 dependency_edge_head

            # 10+ morphological_features
            # create an empty vector of length of all morphological features
            morphological_features = [0] * len(all_morphological_features)
            # for all features in this metadata
            for feature in m[10:]:
                # find the index of this feature in the sorted list of all morphological features
                idx = all_morphological_features.index(feature)
                # and set it to 1 to mark that this metadata contains this feature
                morphological_features[idx] = 1

            # update metadata with the newly processed attributes
            metadata[j] = m[:10] + morphological_features

        # in order to have a valid input as a tensor, all inputs need the same size,
        # so we pad the metadata matrix with dummy all-zero rows up to a shape of
        # (MAX_TOKEN_SIZE, num_of_features)
        dummy = [0] * num_of_features
        metadata += [dummy] * (MAX_TOKEN_SIZE - len(metadata))
        metadata_list[i] = metadata

    return metadata_list
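# Usage sketch (an illustration, not part of the original code): once every exercise has
# been processed to the fixed (MAX_TOKEN_SIZE, num_of_features) shape, the whole list can
# be stacked into one tensor. The helper name `metadata_to_tensor` is hypothetical.
def metadata_to_tensor(metadata_list, mappings):
    import torch

    processed = process_metadata(metadata_list, mappings)
    # expected shape: (num_of_exercises, MAX_TOKEN_SIZE, 10 + len(all_morphological_features))
    return torch.tensor(processed, dtype=torch.float32)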
def valid_on_epoch(self):
    # evaluation mode, no gradient tracking
    self.model.train(False)
    LOSS = []
    with torch.no_grad():
        for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.valid_dataloader:
            X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                X_i, Y_i, Z_i, X_o, Y_o, Z_o)
            pred = self.model(X_i)
            loss = self.loss(pred, X_o)
            LOSS.append(loss.data.cpu().numpy())
    self.writer.add_scalar('valid Loss', np.mean(LOSS), self.epoch)
    return np.mean(LOSS)
def test_MSE(self):
    # evaluate the saved weights on the test set and return the mean loss
    self.model.train(False)
    self.load_weights()
    LOSS = []
    with torch.no_grad():
        for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
            X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                X_i, Y_i, Z_i, X_o, Y_o, Z_o)
            pred = self.model(X_i)
            loss = self.loss(pred, X_o)
            LOSS.append(loss.data.cpu().numpy())
    return np.mean(LOSS)
def train_on_epoch(self):
    # training mode
    self.model.train(True)
    LOSS = []
    for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
        X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
            X_i, Y_i, Z_i, X_o, Y_o, Z_o)
        # add Gaussian noise to the input window as a simple augmentation
        X_i += torch.normal(0, 0.1, X_i.shape)
        self.model.zero_grad()
        pred = self.model(X_i)
        loss = self.loss(pred, X_o)
        loss.backward()
        self.optim.step()
        LOSS.append(loss.data.cpu().numpy())
    self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)
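# Hedged sketch (not from the original code) of how train_on_epoch and valid_on_epoch
# could be driven over a full run. The names `fit`, `self.n_epochs` and
# `self.save_weights()` are assumptions; the original trainer may organise this differently.
def fit(self):
    best_valid = float('inf')
    for self.epoch in range(self.n_epochs):
        self.train_on_epoch()
        valid_loss = self.valid_on_epoch()
        # keep the weights that achieve the lowest validation loss
        if valid_loss < best_valid:
            best_valid = valid_loss
            self.save_weights()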
def test_a_window(self):
    self.model.train(False)
    self.load_weights()
    idx = 0
    with torch.no_grad():
        for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
            X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                X_i, Y_i, Z_i, X_o, Y_o, Z_o)
            pred = self.model(X_i)
            # plot the first sample of the batch: input window, ground truth and prediction
            show_a_test_window(X_i.data.numpy()[0, :],
                               X_o.data.numpy()[0, :],
                               pred.data.numpy()[0, :],
                               idx,
                               self.config['data']['stride'])
            idx += 1
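# Hedged sketch (an assumption, not the project's actual helper) of what show_a_test_window
# might look like: plot the input window, the ground-truth continuation and the prediction
# for one test sample. Only the signature (input_window, target, prediction, idx, stride)
# is taken from the call above; the 1-D plotting below is illustrative.
def show_a_test_window_sketch(input_window, target, prediction, idx, stride):
    import matplotlib.pyplot as plt
    import numpy as np

    t_in = np.arange(len(input_window))
    t_out = np.arange(len(input_window), len(input_window) + len(target))
    plt.figure()
    plt.plot(t_in, input_window, label='input')
    plt.plot(t_out, target, label='target')
    plt.plot(t_out, prediction, label='prediction')
    plt.title('test window {} (stride {})'.format(idx, stride))
    plt.legend()
    plt.show()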
def train_on_epoch(self):
    # training mode
    self.model.train(True)
    LOSS = []
    for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
        X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
            X_i, Y_i, Z_i, X_o, Y_o, Z_o)
        X_i, Y_i, Z_i = self.augment(X_i, Y_i, Z_i)
        self.model.zero_grad()
        # the model consumes all three input channels and predicts their concatenated outputs
        pred = self.model(X_i, Y_i, Z_i)
        label = torch.cat((X_o, Y_o, Z_o), dim=1)
        loss = self.loss(pred, label)
        loss.backward()
        self.optim.step()
        LOSS.append(loss.data.cpu().numpy())
    self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)
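# Hedged sketch (an assumption, not necessarily the project's implementation) of the
# augment step used above, mirroring the Gaussian-noise augmentation applied to X_i in
# the single-channel trainer earlier in this file.
def augment(self, X_i, Y_i, Z_i):
    X_i = X_i + torch.normal(0, 0.1, X_i.shape)
    Y_i = Y_i + torch.normal(0, 0.1, Y_i.shape)
    Z_i = Z_i + torch.normal(0, 0.1, Z_i.shape)
    return X_i, Y_i, Z_i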
def tokenize(line):
    # split a comma-separated line and cast every field to a float
    tokens = line.split(",")
    for i in range(len(tokens)):
        tokens[i] = cast_to_float(tokens[i])
    return tokens
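# Hedged usage sketch: tokenize turns one comma-separated line into a list of numbers via
# cast_to_float. The literal line below is illustrative only, assuming cast_to_float parses
# plain decimal strings.
#
#   tokenize("0.12,3.4,-1.0")   # -> [0.12, 3.4, -1.0]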