# Example #1 (score: 0)
    def load_pickle_data(self):
        """Load the pickled dialogue dataset, using per-split on-disk caches.

        Reads the master pickle at ``self.data_path``, records dataset-level
        metadata (max sequence length, speaker count, optional emotion
        dictionary), then builds or loads cached train/test/dev features.

        Returns:
            Tuple ``(X_train, X_test, X_dev, y_train, y_test, y_dev)`` where
            each ``X_*`` is a list of float32 tensors (one per modality) and
            each ``y_*`` is a float32 label tensor.
        """
        # NOTE(review): pickle.load is unsafe on untrusted files; acceptable
        # here only because the dataset and caches are produced locally.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)

        if 'emotion_dic' in data and self.label == 'emotion':
            self.emotion_dic = data['emotion_dic']

        self.get_max_seq_len(data['train']['text'] + data['test']['text'] +
                             data['valid']['text'])
        self.speaker_num = data['speaker_num']

        # One shared code path for the three splits (was triplicated).
        X_train, y_train = self._load_or_build_split(
            data['train'], self.fp_prefix + '_train.pkl', 'train')
        X_test, y_test = self._load_or_build_split(
            data['test'], self.fp_prefix + '_test.pkl', 'test')
        X_dev, y_dev = self._load_or_build_split(
            data['valid'], self.fp_prefix + '_valid.pkl', 'dev')

        X_train = [torch.tensor(x, dtype=torch.float32) for x in X_train]
        X_test = [torch.tensor(x, dtype=torch.float32) for x in X_test]
        X_dev = [torch.tensor(x, dtype=torch.float32) for x in X_dev]
        self.embedding_enabled = False
        self.sentiment_dic = None

        # Remove spurious values (-inf) in place.
        for split in (X_train, X_test, X_dev):
            for x in split:
                clean_tensor(x)

        y_train = torch.tensor(y_train, dtype=torch.float32)
        y_test = torch.tensor(y_test, dtype=torch.float32)
        y_dev = torch.tensor(y_dev, dtype=torch.float32)

        # Collapse a trailing singleton label dimension, if present.
        if y_train.dim() == 3:
            y_train = y_train.squeeze(dim=-1)
            y_test = y_test.squeeze(dim=-1)
            y_dev = y_dev.squeeze(dim=-1)

        return X_train, X_test, X_dev, y_train, y_test, y_dev

    def _load_or_build_split(self, split_data, cache_path, name):
        """Return ``(features, labels)`` for one split via an on-disk cache.

        On a cache miss, pads the dialogues (and optionally extracts
        context windows) and writes the result to ``cache_path``; otherwise
        reads the cached pickle. ``name`` is only used for log messages.
        """
        if not os.path.exists(cache_path):
            print("Creating new {} data!".format(name))
            X, y = self.pad_dialogue(split_data)
            if self.dialogue_context:
                X, y = self.extract_context(X, y)
            # 'with' guarantees the handle is closed (was leaked before).
            with open(cache_path, 'wb') as f:
                pickle.dump([*X, y], f)
        else:
            print("  - Found cached {} data".format(name))
            with open(cache_path, 'rb') as f:
                cached = pickle.load(f)
            # Cache layout: [*feature_modalities, labels].
            X, y = cached[:-1], cached[-1]
        return X, y
# Example #2 (score: 0)
    def load_pickle_data(self):
        """Load the pickled multimodal dataset with cached embeddings/splits.

        Reads the master pickle at ``self.data_path``, builds or loads the
        word-embedding table, then builds or loads cached train/test/valid
        feature splits.

        Returns:
            Tuple ``(X_train, X_test, X_dev, y_train, y_test, y_dev)``: each
            ``X_*`` is ``[text_ids (int64), vision (float32), audio
            (float32)]`` tensors; each ``y_*`` is a float32 label tensor.
        """
        # NOTE(review): pickle.load is unsafe on untrusted files; acceptable
        # here only because the dataset and caches are produced locally.
        with open(self.data_path, 'rb') as f:
            data = pickle.load(f)

        self.get_max_seq_len(data['train']['text'] + data['test']['text'] +
                             data['valid']['text'])

        self._load_or_build_embedding(data)

        # One shared code path for the three splits (was triplicated).
        X_train, y_train = self._load_or_build_features(
            data['train'], self.fp_prefix + '_train.pkl', 'train')
        X_test, y_test = self._load_or_build_features(
            data['test'], self.fp_prefix + '_test.pkl', 'test')
        X_dev, y_dev = self._load_or_build_features(
            data['valid'], self.fp_prefix + '_valid.pkl', 'valid')

        def to_tensors(xs):
            # Index 0 holds token ids (int64); remaining modalities
            # (vision, audio) are float32.
            return [
                torch.tensor(x, dtype=torch.int64 if i == 0 else torch.float32)
                for i, x in enumerate(xs)
            ]

        X_train = to_tensors(X_train)
        X_test = to_tensors(X_test)
        X_dev = to_tensors(X_dev)

        # Remove spurious values (-inf) in place.
        for split in (X_train, X_test, X_dev):
            for x in split:
                clean_tensor(x)

        y_train = torch.tensor(y_train, dtype=torch.float32)
        y_test = torch.tensor(y_test, dtype=torch.float32)
        y_dev = torch.tensor(y_dev, dtype=torch.float32)

        # Collapse a trailing singleton label dimension, if present.
        if y_train.dim() == 3:
            y_train = y_train.squeeze(dim=-1)
            y_test = y_test.squeeze(dim=-1)
            y_dev = y_dev.squeeze(dim=-1)

        return X_train, X_test, X_dev, y_train, y_test, y_dev

    def _load_or_build_embedding(self, data):
        """Populate ``self.embedding`` (and, on a miss, ``self.dictionary``).

        Builds the vocabulary over all splits' lowercased tokens and loads
        pretrained vectors, caching the result next to the other split
        pickles.
        """
        embedding_path = self.fp_prefix + '_embedding.pkl'
        if not os.path.exists(embedding_path):
            print("Creating new embeddings!")
            self.dictionary = Dictionary(start_feature_id=0)
            self.dictionary.add('UNK')
            textual_features = (data['train']['text'] + data['test']['text'] +
                                data['valid']['text'])
            for tokens in textual_features:
                for token in tokens:
                    self.dictionary.add(str(token.lower()))

            self.embedding = Embedding(self.dictionary, self.max_seq_len)
            self.embedding.get_embedding(dataset_name=self.dataset_name,
                                         fname=self.wordvec_path)
            # 'with' guarantees the handle is closed (was leaked before).
            with open(embedding_path, 'wb') as f:
                pickle.dump(self.embedding, f)
        else:
            print("  - Found cached embeddings")
            with open(embedding_path, 'rb') as f:
                self.embedding = pickle.load(f)

    def _load_or_build_features(self, split, cache_path, name):
        """Return ``(features, labels)`` for one split via an on-disk cache.

        On a cache miss, converts the split's text to id sequences and
        bundles it with the vision/audio features, writing the result to
        ``cache_path``. ``name`` is only used for log messages.
        """
        if not os.path.exists(cache_path):
            print("Creating new {} data!".format(name))
            X = [[self.embedding.text_to_sequence(seq)
                  for seq in split['text']],
                 split['vision'], split['audio']]
            y = split['labels']
            with open(cache_path, 'wb') as f:
                pickle.dump([*X, y], f)
        else:
            print("  - Found cached {} data".format(name))
            with open(cache_path, 'rb') as f:
                cached = pickle.load(f)
            # Cache layout: [*feature_modalities, labels].
            X, y = cached[:-1], cached[-1]
        return X, y