import numpy as np
import torch
def process_metadata(metadata_list, mappings):
    """
    Using the mappings created above, this converts all of the metadata into numeric values, and reshapes them to a fixed tensor size.
    Parameters:
        metadata_list: the original metadata for all exercises
        mappings: a tuple of all mapping dictionaries and a list of all morphological features.
    Returns:
        a processed python array of the metadata, with the intended shape of (num_of_exercises,(fixed)max_token_size,(fixed)num_of_features)
    """
    user_to_id, countries_to_id, client_to_id, session_to_id, format_to_id, part_of_speech_to_id, dependency_edge_label_to_id, all_morphological_features = mappings
    num_of_features = 10 + len(all_morphological_features)
    for i in range(len(metadata_list)):
        metadata = metadata_list[i]
        for j in range(len(metadata)):
            m = metadata[j]
            # map each feature to its unique id
            m[0] = user_to_id[m[0]]  #0 user
            m[1] = countries_to_id[m[1]]  #1 countries
            m[2] = cast_to_float(m[2])  #2 days
            m[3] = client_to_id[m[3]]  #3 client
            m[4] = session_to_id[m[4]]  #4 session
            m[5] = format_to_id[m[5]]  #5 format
            m[6] = cast_to_float(m[6])  #6 time
            m[7] = part_of_speech_to_id[m[7]]  #7 part_of_speech
            m[8] = dependency_edge_label_to_id[m[8]]  #8 dependency_edge_label
            m[9] = cast_to_int(m[9])  #9 dependency_edge_head
            #10 morphological_features
            # create an empty vector of length of all morphological features
            morphological_features = [0] * len(all_morphological_features)
            # for all features in this metadata
            for feature in m[10:]:
                # find index of this feature in the sorted list of all morphological features
                idx = all_morphological_features.index(feature)
                # and map it to 1 to mark that this metadata contains this feature
                morphological_features[idx] = 1
            # update metadata to newly processed attributes
            metadata[j] = m[:10] + morphological_features
        # to batch the exercises as one tensor, every exercise must have the same shape,
        # so we pad each metadata matrix with sentinel rows of -1 (a value no real
        # feature takes) up to a shape of (MAX_TOKEN_SIZE, num_of_features)
        dummy = [-1] * num_of_features
        metadata += [dummy] * (MAX_TOKEN_SIZE - len(metadata))
        metadata_list[i] = metadata
    return metadata_list
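
# NOTE: process_metadata relies on a constant and two scalar helpers defined
# elsewhere in the project. The names below come from the code above, but the
# value and bodies here are assumptions, shown only as a minimal sketch.
MAX_TOKEN_SIZE = 30  # assumed fixed number of tokens each exercise is padded to

def cast_to_float(value):
    # assumed behavior: cast a raw metadata field to float, falling back to 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0

def cast_to_int(value):
    # assumed behavior: cast a raw metadata field to int, falling back to 0
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0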
 def valid_on_epoch(self):
     self.model.train(False)
     LOSS = []
     with torch.no_grad():
         for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.valid_dataloader:
             X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                 X_i, Y_i, Z_i, X_o, Y_o, Z_o)
             pred = self.model(X_i)
             loss = self.loss(pred, X_o)
             LOSS.append(loss.item())
     self.writer.add_scalar('valid Loss', np.mean(LOSS), self.epoch)
     return np.mean(LOSS)
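
# NOTE: in these trainer methods, cast_to_float is applied to whole batches of
# tensors and returns them as a tuple, so it is presumably a tensor-valued
# variant of the scalar helper used during preprocessing. A minimal sketch
# under that assumption (the name cast_to_float_tensors is hypothetical):
def cast_to_float_tensors(*tensors):
    # cast every batch tensor to float32 so dtypes match the model weights
    return tuple(t.float() for t in tensors)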
 def test_MSE(self):
     self.model.train(False)
     self.load_weights()
     LOSS = []
     with torch.no_grad():
         for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
             X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                 X_i, Y_i, Z_i, X_o, Y_o, Z_o)
             pred = self.model(X_i)
             loss = self.loss(pred, X_o)
             LOSS.append(loss.item())
     return np.mean(LOSS)
 def train_on_epoch(self):
     self.model.train(True)  # put the model in training mode (dropout/batchnorm active)
     LOSS = []
     for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
         X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
             X_i, Y_i, Z_i, X_o, Y_o, Z_o)
         X_i += torch.normal(0, 0.1, X_i.shape)  # add Gaussian noise as simple data augmentation
         self.model.zero_grad()
         pred = self.model(X_i)
         loss = self.loss(pred, X_o)
         loss.backward()
         self.optim.step()
         LOSS.append(loss.item())
     self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)
 def test_a_window(self):
     self.model.train(False)
     self.load_weights()
     idx = 0
     with torch.no_grad():
         for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.test_dataloader:
             X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
                 X_i, Y_i, Z_i, X_o, Y_o, Z_o)
             pred = self.model(X_i)
             # plot the first window of the batch: input, target, and prediction
             show_a_test_window(X_i.data.numpy()[0, :],
                                X_o.data.numpy()[0, :],
                                pred.data.numpy()[0, :], idx,
                                self.config['data']['stride'])
             idx += 1
 def train_on_epoch(self):
     self.model.train(True)  # put the model in training mode (dropout/batchnorm active)
     LOSS = []
     for X_i, Y_i, Z_i, X_o, Y_o, Z_o in self.train_dataloader:
         X_i, Y_i, Z_i, X_o, Y_o, Z_o = cast_to_float(
             X_i, Y_i, Z_i, X_o, Y_o, Z_o)
         X_i, Y_i, Z_i = self.augment(X_i, Y_i, Z_i)  # apply data augmentation to the three input streams
         self.model.zero_grad()
         pred = self.model(X_i, Y_i, Z_i)
         label = torch.cat((X_o, Y_o, Z_o), dim=1)
         loss = self.loss(pred, label)
         loss.backward()
         self.optim.step()
         LOSS.append(loss.item())
     self.writer.add_scalar('train Loss', np.mean(LOSS), self.epoch)
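
 # NOTE: a possible per-epoch driver for the trainer methods above; this is a
 # sketch, not the project's own code. save_weights() is a hypothetical
 # counterpart of the load_weights() used in test_MSE and test_a_window.
 def fit(self, num_epochs):
     best_loss = float('inf')
     for epoch in range(num_epochs):
         self.epoch = epoch  # used by the TensorBoard logging above
         self.train_on_epoch()
         valid_loss = self.valid_on_epoch()
         # keep whichever weights achieve the lowest validation loss so far
         if valid_loss < best_loss:
             best_loss = valid_loss
             self.save_weights()  # hypothetical: assumed to mirror load_weights()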
def tokenize(line):
    # split a comma-separated line and cast every token to a float
    return [cast_to_float(token) for token in line.split(",")]
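
# Illustrative usage only: with a scalar cast_to_float like the sketch above,
# tokenize("0.5,1.25,-3") would return [0.5, 1.25, -3.0].
if __name__ == "__main__":
    print(tokenize("0.5,1.25,-3"))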