Example #1
    def create_vocabulary(self, min_frequency=5, tokenizer='spacy',
                          downcase=False, max_vocab_size=None,
                          name='new', load_w2v=True):
        self.vocab_path, self.w2v_path, self.metadata_path = \
            datasets.new_vocabulary([self.train_path], self.dataset_path,
                                    min_frequency, tokenizer=tokenizer,
                                    downcase=downcase,
                                    max_vocab_size=max_vocab_size, name=name)
        self.__refresh(load_w2v)
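
The private __refresh helper is not shown in this snippet. A plausible sketch, assuming it mirrors the explicit reload steps spelled out in Example #2, would be:

    def __refresh(self, load_w2v):
        # Hypothetical body: reload the word<->index maps from the new
        # vocabulary file and, optionally, rebuild and cache the w2v matrix.
        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        if load_w2v:
            self.w2v = datasets.preload_w2v(self.w2i)
            datasets.save_w2v(self.w2v_path, self.w2v)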
Example #2
    def initialize_vocabulary(self):
        # Keep only the first tab-separated field of each line.
        line_processor = lambda line: " ".join(line.split('\t')[:1])

        self.vocab_path, self.w2v_path, self.metadata_path = \
            datasets.new_vocabulary(
                files=[self.train_path], dataset_path=self.dataset_path,
                min_frequency=5, tokenizer='spacy',
                downcase=True, max_vocab_size=None,
                name='new', line_processor=line_processor)

        self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
        self.w2v = datasets.preload_w2v(self.w2i)
        datasets.save_w2v(self.w2v_path, self.w2v)
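
For reference, a minimal standalone check of this processor (the sample values are made up):

    line_processor = lambda line: " ".join(line.split('\t')[:1])
    assert line_processor("some review text\tpositive") == "some review text"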
Example #3
    def create_vocabulary(self, min_frequency=5, tokenizer='spacy',
                          downcase=False, max_vocab_size=None,
                          name='new', load_w2v=True):
        def line_processor(line):
            # Join the review's header and body into one line; json is
            # assumed to be imported at module level.
            json_obj = json.loads(line)
            return json_obj["review_header"] + " " + json_obj["review_text"]

        self.vocab_path, self.w2v_path, self.metadata_path = \
            datasets.new_vocabulary([self.data_path], self.dataset_path,
                                    min_frequency, tokenizer=tokenizer,
                                    downcase=downcase,
                                    max_vocab_size=max_vocab_size, name=name,
                                    line_processor=line_processor, lang='de')
        self.__refresh(load_w2v)
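
A quick standalone check of the JSON processor above, with made-up field values matching the lang='de' setting:

    import json

    def line_processor(line):
        json_obj = json.loads(line)
        return json_obj["review_header"] + " " + json_obj["review_text"]

    sample = '{"review_header": "Gut", "review_text": "Funktioniert gut."}'
    assert line_processor(sample) == "Gut Funktioniert gut."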
Example #4
    def initialize_vocabulary_ll(self, names, min_frequencies, downcases,
                                 tokenizer):
        for i in range(len(self.vocab_paths)):
            self.vocab_paths[i], self.w2v_paths[i], self.metadata_paths[i] = \
                datasets.new_vocabulary(
                    files=[self.train_path], dataset_path=self.dataset_path,
                    min_frequency=min_frequencies[i], tokenizer=tokenizer[i],
                    downcase=downcases[i], max_vocab_size=None,
                    name=names[i],
                    # i=i binds the loop index now; a bare closure would see
                    # its final value if the processor ran after the loop.
                    line_processor=lambda line, i=i: line.split('\t')[i],
                    lang='de')

            self.w2i[i], self.i2w[i] = datasets.load_vocabulary(
                self.vocab_paths[i])
            self.w2v[i] = datasets.preload_w2v(self.w2i[i], lang='de')
            datasets.save_w2v(self.w2v_paths[i], self.w2v[i])
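
The i=i default argument in the processor above is the standard guard against Python's late-binding closures; a minimal illustration of the difference:

    # Bare closures all see the loop variable's final value when called later:
    late = [lambda line: line.split('\t')[i] for i in range(3)]
    print(late[0]("a\tb\tc"))   # -> "c", not "a"

    # A default argument freezes the value at definition time:
    bound = [lambda line, i=i: line.split('\t')[i] for i in range(3)]
    print(bound[0]("a\tb\tc"))  # -> "a"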
Example #5
    def create_vocabulary(self,
                          min_frequency=5,
                          tokenizer='spacy',
                          downcase=False,
                          max_vocab_size=None,
                          name='new',
                          load_w2v=True):
        def line_processor(line):
            # Keep only the last tab-separated field of each line.
            return line.strip().split('\t')[-1]

        self.vocab_path, self.w2v_path, self.metadata_path = \
            datasets.new_vocabulary([self.data_path], self.dataset_path,
                                    min_frequency, tokenizer=tokenizer,
                                    downcase=downcase,
                                    max_vocab_size=max_vocab_size, name=name,
                                    line_processor=line_processor, lang='de')
        self.__refresh(load_w2v)
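
As in Example #2, a short sanity check of this processor (sample fields are made up):

    def line_processor(line):
        return line.strip().split('\t')[-1]

    assert line_processor("42\tneutral\tder Text\n") == "der Text"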
Example #6
    def create_vocabulary(self,
                          all_files,
                          min_frequency=5,
                          tokenizer='spacy',
                          downcase=True,
                          max_vocab_size=None,
                          name='new',
                          load_w2v=True):
        # A new vocabulary only makes sense for the text itself, not for
        # the other two data types (NER data or POS tags), so only the
        # text vocabulary (slot 0) is rebuilt here.

        self.vocab_paths[0], self.w2v_paths[0], self.metadata_paths[0] = \
            datasets.new_vocabulary(
                files=all_files, dataset_path=self.dataset_path,
                min_frequency=min_frequency,
                tokenizer=tokenizer, downcase=downcase,
                max_vocab_size=max_vocab_size, name=name,
                line_processor=lambda line: line.split('\t')[0])
        self.__refresh(load_w2v)
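
A hypothetical call, assuming a dataset object ds that exposes this method and a train_path attribute as in the earlier examples:

    ds.create_vocabulary(all_files=[ds.train_path], min_frequency=2,
                         downcase=True, name='rebuilt')
    # Only vocab_paths[0] / w2v_paths[0] / metadata_paths[0] are replaced;
    # any NER or POS-tag vocabularies keep their existing files.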