Exemplo n.º 1
0
 def load_embed_file(self):
     """Load pre-trained word vectors from ``self.embed_file``.

     Builds the ``self._str2embed`` / ``self._embed2str`` index maps
     (special tokens occupy indices ``0..START_IDX-1``), collects one
     vector per word, randomly initializes vectors for vocabulary words
     absent from the file, and stores the result as
     ``self.pretrained_embeddings`` (float64 ndarray).

     :raises ValueError: if an embedding-file line cannot be parsed.
     """
     # Reserve the leading slots for the special tokens.
     self._str2embed = dict(zip(self.SPECIAL_TOKENS, range(self.START_IDX)))
     self._embed2str = dict(zip(range(self.START_IDX), self.SPECIAL_TOKENS))
     # Zero vectors for the special tokens (assumes START_IDX == 2 --
     # TODO confirm against SPECIAL_TOKENS).
     embeds = [[0] * self.words_dim, [0] * self.words_dim]
     # BUGFIX: the original test was `!= 'SST-1' or != 'SST-2'`, which is
     # always true, so clean_str_sst was never applied for SST datasets.
     if self.dataset_type in ('SST-1', 'SST-2'):
         clean = clean_str_sst
     else:
         clean = clean_str
     with open(self.embed_file) as f:
         cur_idx = self.START_IDX
         for line_num, line in enumerate(f):
             fields = line.strip().split()
             if not fields:
                 continue
             try:
                 word = clean(fields[0])
                 self._str2embed[word] = cur_idx
                 self._embed2str[cur_idx] = word
                 embeds.append(fields[1:])
                 cur_idx += 1
             except Exception as exc:  # narrowed from bare `except:`
                 raise ValueError(
                     'The embedding file is misformatted at line %d' %
                     (line_num + 1)) from exc
     # Randomly initialize the pre-trained vector for those words not in
     # the pre-train file.
     for word in self._str2idx:
         if word not in self._str2embed:
             self._str2embed[word] = cur_idx
             self._embed2str[cur_idx] = word
             embeds.append(list(np.random.uniform(-1, 1, self.words_dim)))
             cur_idx += 1
     self.pretrained_embeddings = np.array(embeds, dtype=np.float64)
     del embeds
     return
Exemplo n.º 2
0
    def add_train_file(self):
        """Populate this vocabulary from ``self.train_file``.

        Depending on ``self.name``, either the leading label token
        ('Targets') or the remaining word tokens ('Words') of each
        non-empty cleaned line are added, then the vocab is indexed.
        """
        # TREC lines carry one extra leading field, so word tokens start
        # at index 2 instead of 1; everything else is identical.
        word_start = 2 if self.dataset_type == 'TREC' else 1
        with open(self.train_file) as f:
            for raw in f:
                tokens = clean_str(raw).split()
                if not tokens:
                    continue
                if self.name == 'Targets':
                    self.add(tokens[0])
                if self.name == 'Words':
                    for word in tokens[word_start:]:
                        self.add(word)

        self.index_vocab()
Exemplo n.º 3
0
 def reading_dataset(self, filename):
     """Read ``filename`` line by line, clean and tokenize each line,
     and hand the collected token lists to ``self._process_buff``.

     :param filename: path of the dataset file to read
     :return: None
     """
     # SST datasets use their own cleaner and require at least a label
     # plus one token per line; other datasets just need a non-empty line.
     is_sst = self.dataset_type in ('SST-1', 'SST-2')
     cleaner = clean_str_sst if is_sst else clean_str
     min_tokens = 2 if is_sst else 1
     buff = []
     with open(filename) as f:
         for raw in f:
             tokens = cleaner(raw).split()
             if len(tokens) >= min_tokens:
                 buff.append(tokens)
     self._process_buff(buff)
     return