def write(self, sess, global_step):
  """Save a model checkpoint and pickle the current training progress.

  Args:
    sess: the active TensorFlow session.
    global_step: step counter used to tag the checkpoint files.
  """
  self.checkpoint_saver.save(
      sess, self.config.checkpoint, global_step=global_step)
  # Bundle everything needed to resume the unlabeled-data sweep.
  reader = self.unlabeled_data_reader
  progress_state = (self.history, reader.current_file, reader.current_line)
  utils.write_cpickle(progress_state, self.config.progress)
def evaluate_all_tasks(self, sess, summary_writer, history, train_set=False):
  """Evaluate every task, optionally appending results to `history`.

  Args:
    sess: the active TensorFlow session.
    summary_writer: writer receiving evaluation summaries.
    history: mutable list of past results, or None to skip recording.
    train_set: if True, evaluate on the training split instead of dev.
  """
  recording = history is not None
  for task in self.tasks:
    results = self._evaluate_task(sess, task, summary_writer, train_set)
    if recording:
      # Tag each result set with the step it was measured at.
      results.append(('step', self._model.get_global_step(sess)))
      history.append(results)
  if recording:
    utils.write_cpickle(history, self._config.history_file)
def main(data_dir='/content/data'):
  """Run preprocessing: embeddings, chunking dev split, and label mappings.

  Args:
    data_dir: root directory holding raw data and receiving outputs.
  """
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for embedding_file in ['glove.6B.50d.txt']:
    cfg = configure.Config(data_dir=data_dir,
                           for_preprocessing=True,
                           pretrained_embeddings=embedding_file,
                           word_embedding_size=50)
    embeddings.PretrainedEmbeddingLoader(cfg).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # Chunking ships without a dev split, so carve one out of the shuffled
    # training data: the first 1500 sentences become the dev set.
    cfg = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_dir = os.path.join(cfg.raw_data_topdir, task_name) + '/'
    sentences = word_level_data.TaggedDataLoader(
        cfg, task_name, False).get_labeled_sentences("train")
    random.shuffle(sentences)
    write_sentences(task_dir + 'train_subset.txt', sentences[1500:])
    write_sentences(task_dir + 'dev.txt', sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for idx, encoding in enumerate(["BIOES"]):
      cfg = configure.Config(data_dir=data_dir,
                             for_preprocessing=True,
                             label_encoding=encoding)
      is_token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(cfg, task_name, is_token_level)
      if is_token_level:
        # Token-level tasks do not depend on the span encoding, so only
        # write their mapping once.
        if idx != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log(" Writing label mapping for", task_name.upper(), encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def main(data_dir='./data'):
  """Preprocess 300d GloVe embeddings, the chunking dev split, and label maps.

  Args:
    data_dir: root directory holding raw data and receiving outputs.
  """
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for vectors_file in ['glove.6B.300d.txt']:
    prep_config = configure.Config(data_dir=data_dir,
                                   for_preprocessing=True,
                                   pretrained_embeddings=vectors_file,
                                   word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(prep_config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # No dev split is provided for chunking; hold out 1500 randomly chosen
    # training sentences as a dev set and keep the remainder for training.
    prep_config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    out_dir = os.path.join(prep_config.raw_data_topdir, task_name) + '/'
    labeled = word_level_data.TaggedDataLoader(
        prep_config, task_name, False).get_labeled_sentences("train")
    random.shuffle(labeled)
    write_sentences(out_dir + 'train_subset.txt', labeled[1500:])
    write_sentences(out_dir + 'dev.txt', labeled[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for encoding_index, label_encoding in enumerate(["BIOES"]):
      prep_config = configure.Config(data_dir=data_dir,
                                     for_preprocessing=True,
                                     label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(
          prep_config, task_name, token_level)
      if token_level:
        # The encoding is irrelevant for token-level tasks; skip duplicates.
        if encoding_index != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log(" Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def main(data_dir='/content/data'):
  """Build 100d GloVe embeddings and the sentence-classification label map.

  Args:
    data_dir: root directory holding raw data and receiving outputs.
  """
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for glove_file in ['glove.6B.100d.txt']:
    cfg = configure.Config(data_dir=data_dir,
                           for_preprocessing=True,
                           pretrained_embeddings=glove_file,
                           word_embedding_size=100)
    embeddings.PretrainedEmbeddingLoader(cfg).build()

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["senclass"]:
    cfg = configure.Config(data_dir=data_dir, for_preprocessing=True)
    loader = sentence_level_data.SentenceClassificationDataLoader(
        cfg, task_name)
    utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
    utils.log(" ", len(loader.label_mapping), "classes")
    utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def _write(self):
  """Pickle the stacked embedding matrix and the vocabulary to disk."""
  # Stack the per-word vectors into one (vocab_size, dim) matrix first.
  embedding_matrix = np.vstack(self.vectors)
  utils.write_cpickle(embedding_matrix, self.config.word_embeddings)
  utils.write_cpickle(self.vocabulary, self.config.word_vocabulary)