def create_vocabulary(self, min_frequency=5, tokenizer='spacy',
                      downcase=False, max_vocab_size=None, name='new',
                      load_w2v=True):
    self.vocab_path, self.w2v_path, self.metadata_path = \
        datasets.new_vocabulary([self.train_path], self.dataset_path,
                                min_frequency, tokenizer=tokenizer,
                                downcase=downcase,
                                max_vocab_size=max_vocab_size, name=name)
    self.__refresh(load_w2v)
def initialize_vocabulary(self):
    line_processor = lambda line: " ".join(line.split('\t')[:1])
    self.vocab_path, self.w2v_path, self.metadata_path = \
        datasets.new_vocabulary(
            files=[self.train_path], dataset_path=self.dataset_path,
            min_frequency=5, tokenizer='spacy', downcase=True,
            max_vocab_size=None, name='new',
            line_processor=line_processor)
    self.w2i, self.i2w = datasets.load_vocabulary(self.vocab_path)
    self.w2v = datasets.preload_w2v(self.w2i)
    datasets.save_w2v(self.w2v_path, self.w2v)
def create_vocabulary(self, min_frequency=5, tokenizer='spacy',
                      downcase=False, max_vocab_size=None, name='new',
                      load_w2v=True):
    def line_processor(line):
        json_obj = json.loads(line)
        line = json_obj["review_header"] + " " + json_obj["review_text"]
        return line

    self.vocab_path, self.w2v_path, self.metadata_path = \
        datasets.new_vocabulary([self.data_path], self.dataset_path,
                                min_frequency, tokenizer=tokenizer,
                                downcase=downcase,
                                max_vocab_size=max_vocab_size, name=name,
                                line_processor=line_processor, lang='de')
    self.__refresh(load_w2v)
def initialize_vocabulary_ll(self, names, min_frequencies, downcases,
                             tokenizer):
    for i in range(len(self.vocab_paths)):
        self.vocab_paths[i], self.w2v_paths[i], self.metadata_paths[i] = \
            datasets.new_vocabulary(
                files=[self.train_path], dataset_path=self.dataset_path,
                min_frequency=min_frequencies[i], tokenizer=tokenizer[i],
                downcase=downcases[i], max_vocab_size=None, name=names[i],
                line_processor=lambda line: line.split('\t')[i],
                lang='de')
        self.w2i[i], self.i2w[i] = datasets.load_vocabulary(
            self.vocab_paths[i])
        self.w2v[i] = datasets.preload_w2v(self.w2i[i], lang='de')
        datasets.save_w2v(self.w2v_paths[i], self.w2v[i])
def create_vocabulary(self, min_frequency=5, tokenizer='spacy',
                      downcase=False, max_vocab_size=None, name='new',
                      load_w2v=True):
    def line_processor(line):
        line = line.strip().split('\t')[-1]
        return line

    self.vocab_path, self.w2v_path, self.metadata_path = \
        datasets.new_vocabulary([self.data_path], self.dataset_path,
                                min_frequency, tokenizer=tokenizer,
                                downcase=downcase,
                                max_vocab_size=max_vocab_size, name=name,
                                line_processor=line_processor, lang='de')
    self.__refresh(load_w2v)
def create_vocabulary(self, all_files, min_frequency=5, tokenizer='spacy',
                      downcase=True, max_vocab_size=None, name='new',
                      load_w2v=True):
    # A new vocabulary only makes sense for the text itself, not for the
    # other two data types (NER labels or POS tags), so only the text
    # vocabulary (index 0) is rebuilt here.
    self.vocab_paths[0], self.w2v_paths[0], self.metadata_paths[0] = \
        datasets.new_vocabulary(
            files=all_files, dataset_path=self.dataset_path,
            min_frequency=min_frequency, tokenizer=tokenizer,
            downcase=downcase, max_vocab_size=max_vocab_size, name=name,
            line_processor=lambda line: line.split('\t')[0])
    self.__refresh(load_w2v)
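# Hedged, self-contained sketch (not part of the source): it only illustrates
# what the line_processor callables above extract from a raw input line before
# datasets.new_vocabulary() tokenizes it. The sample lines and column layout
# (text / NER label / POS tag) are assumptions for illustration.
import json

tsv_line = "token_text\tNER_label\tPOS_tag"
json_line = json.dumps({"review_header": "Gut", "review_text": "Sehr gut"})

# First tab-separated field (text column), as in initialize_vocabulary and the
# NER/POS create_vocabulary variant above.
print(" ".join(tsv_line.split('\t')[:1]))                # -> "token_text"

# Last tab-separated field, as in the TSV create_vocabulary variant above.
print(tsv_line.strip().split('\t')[-1])                  # -> "POS_tag"

# Header plus body of a review JSON line, as in the German review variant above.
obj = json.loads(json_line)
print(obj["review_header"] + " " + obj["review_text"])   # -> "Gut Sehr gut"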