def df_from_file(path):
    """Load a JSON results file and flatten it into a DataFrame.

    The ``subcategory``, ``method`` and ``embeddings`` fields of
    ``experiment_setup`` are handed to ``json_normalize`` as ``meta`` paths.

    Args:
        path: location of the JSON file, passed as-is to ``load_json``.

    Returns:
        The object returned by ``json_normalize``.
    """
    setup_fields = ("subcategory", "method", "embeddings")
    meta_paths = [["experiment_setup", field] for field in setup_fields]
    # Disabled in the original: df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return json_normalize(load_json(path), meta=meta_paths)
def load_dataset_infos():
    """Register every named dataset found under ``dir_datasets``.

    Each metadata file yielded by ``gen_metadata_snippets`` is loaded. When it
    has a ``"name"`` key, the folder containing the file is recorded under
    ``"local_path"`` and the metadata is stored in ``resources`` under that
    name. Metadata without a name is skipped.
    """
    for snippet_path in gen_metadata_snippets(Path(dir_datasets)):
        info = load_json(snippet_path)
        if "name" not in info:
            continue
        info["local_path"] = snippet_path.parent
        resources[info["name"]] = info
def _load_dataset(self, dataset):
    """Loading the vocabulary file from the location specified in the ldt
    config file.

    If vecto-style metadata is found, it will also be bundled with the
    experiment metadata automatically; otherwise the ``dataset`` argument
    itself is stored under ``self.metadata["dataset"]``.

    Args:
        dataset (str): either a full path to the dataset or a subfolder of
            "experiments/vocab_samples" folder in the general ldt resources
            location.

    Returns:
        None. The loaded vocabulary is stored as a list in ``self.dataset``.

    Raises:
        IndexError: if the dataset folder contains no ".vocab" file.
    """
    dataset_metadata_path = os.path.join(
        config["path_to_resources"], "experiments", "vocab_samples",
        dataset, "metadata.json")
    if os.path.isfile(dataset_metadata_path):
        self.metadata["dataset"] = load_json(dataset_metadata_path)
    else:
        self.metadata["dataset"] = dataset

    # str.strip("metadata.json") removes a *set of characters* from both ends
    # rather than a suffix, which corrupts paths that start with any of those
    # characters (e.g. a relative "data/..."). Take the parent folder instead.
    dataset_path = os.path.dirname(dataset_metadata_path)

    # assume there is a single ".vocab" file in the dataset folder
    vocab_files = [x for x in os.listdir(dataset_path)
                   if x.endswith(".vocab")]
    vocab_file = vocab_files[0]
    dataset = load_resource(os.path.join(dataset_path, vocab_file),
                            format="vocab")
    self.dataset = list(dataset)
def __init__(self, experiment_name=config["experiments"]["experiment_name"],
             extra_metadata=None,
             dataset=config["experiments"]["vocab_sample"],
             embeddings=config["experiments"]["embeddings"],
             output_dir=os.path.join(config["path_to_resources"],
                                     "experiments"),
             overwrite=config["experiments"]["overwrite"],
             experiment_subfolder=None):
    """Set up an Experiment: output folder, metadata and dataset.

    Args:
        experiment_name (str): the human-readable name of an experiment
            (e.g. "Profiling CBOW with window size 2-10")
        extra_metadata (dict): any extra fields to be added to the
            experiment metadata (overwriting any previously existing
            fields)
        dataset (str): the location of the dataset to be used in the
            experiment.
        embeddings (list of str or None): a list of paths to input data
            (each containing a metadata.json file). If set to None, the
            config parameters will be ignored (for experiments where
            embedding metadata has already been processed and can be just
            copied over from the previous step.)
        output_dir (str): the *existing* path for saving the *subfolder*
            named with the specified experiment_name, where the output
            data and metadata.json file will be saved.
        overwrite (bool): if True, any previous data for the same
            experiment will be overwritten, and the experiment will be
            re-started. If metadata from previous experiment is not found,
            this setting is disregarded.
        experiment_subfolder (str): if provided, the experiment results
            will be saved to this subfolder of the "experiments" folder

    Raises:
        ValueError: if ``experiment_name`` is not a string.
    """
    if not isinstance(experiment_name, str):
        raise ValueError("Please specify experiment_name argument: a short "
                         "description of the experiment you're conducting.")

    self.output_dir = check_output(output_dir, experiment_subfolder,
                                   experiment_name)
    self.message = None

    if embeddings:
        self.embeddings = check_input(input_data=embeddings)

    self._overwrite = overwrite
    # Only look for metadata of a previous run when not overwriting.
    saved_metadata = None
    if not self._overwrite:
        saved_metadata = os.path.join(self.output_dir, "metadata.json")

    if saved_metadata is not None and os.path.isfile(saved_metadata):
        self.metadata = load_json(saved_metadata)
    else:
        self._init_metadata(embeddings)
        if saved_metadata is not None:
            # Nothing to resume from, so behave as a fresh (overwriting) run.
            self._overwrite = True

    self._load_dataset(dataset=dataset)

    if isinstance(extra_metadata, dict):
        self.metadata.update(extra_metadata)
def df_from_file(path):
    """Load a JSON results file into a flat DataFrame with a ``result`` column.

    The ``subcategory``, ``method`` and ``embeddings`` fields of
    ``experiment_setup`` are used as ``meta`` paths for ``json_normalize``.
    A ``details`` column, if present, is dropped. The ``result`` column is a
    copy of ``result.<measurement>``, where the measurement name is the first
    unique value of ``experiment_setup.default_measurement``.

    Args:
        path: location of the JSON file, passed as-is to ``load_json``.

    Returns:
        The normalized DataFrame.
    """
    setup_fields = ("subcategory", "method", "embeddings")
    meta_paths = [["experiment_setup", field] for field in setup_fields]
    frame = json_normalize(load_json(path), meta=meta_paths)

    if "details" in frame:
        frame.drop("details", axis="columns", inplace=True)

    measurement = frame["experiment_setup.default_measurement"].unique()[0]
    frame["result"] = frame["result." + measurement]
    # Disabled in the original: df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return frame
def df_from_file(path):
    """Load a JSON results file into a flat DataFrame with a ``result`` column.

    The ``task``, ``subcategory``, ``method`` and ``embeddings`` fields of
    ``experiment_setup`` are used as ``meta`` paths for ``json_normalize``.
    A ``details`` column, if present, is dropped. The ``result`` column is a
    copy of ``result.<measurement>``; the measurement name is the first unique
    value of ``experiment_setup.default_measurement`` and falls back to
    ``"accuracy"`` (with a logged warning) when that cannot be looked up.

    Args:
        path: location of the JSON file, passed as-is to ``load_json``.

    Returns:
        The normalized DataFrame.

    Raises:
        KeyError: if the ``result.<measurement>`` column does not exist.
    """
    data = load_json(path)
    meta = [["experiment_setup", "task"],
            ["experiment_setup", "subcategory"],
            ["experiment_setup", "method"],
            ["experiment_setup", "embeddings"]]
    dframe = json_normalize(data, meta=meta)
    if "details" in dframe:
        dframe.drop("details", axis="columns", inplace=True)

    default_measurement = "accuracy"
    try:
        default_measurement = dframe[
            "experiment_setup.default_measurement"].unique()[0]
    except (KeyError, IndexError):
        # Missing column or no rows: keep the fallback. A bare ``except``
        # here would also swallow KeyboardInterrupt and SystemExit.
        logger.warning(f"default_measurement not specified in {path}")

    dframe["result"] = dframe["result." + default_measurement]
    # Disabled in the original: df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return dframe
def _init_metadata(self, embeddings):
    """Create a fresh ``self.metadata`` dict for this experiment.

    The dict always gets an empty ``"timestamp"`` dict, the ldt ``"version"``
    string and ``"class": "experiment"``. If the instance has an
    ``embeddings`` attribute, one metadata entry per item of the
    ``embeddings`` argument is collected under ``"embeddings"``: an existing
    ``metadata.json`` in the embedding folder is loaded and tagged with its
    path, otherwise a stub is created and saved to that location.

    Args:
        embeddings: iterable of embedding paths; only iterated when
            ``self.embeddings`` exists.
    """
    self.metadata = {
        "timestamp": {},
        "version": "ldt v. " + __version__,
        "class": "experiment",
    }

    if not hasattr(self, "embeddings"):
        return

    collected = []
    self.metadata["embeddings"] = collected
    shared_subpath = check_shared_subpath(embeddings, "")
    for emb_path in embeddings:
        meta_file = os.path.join(emb_path, "metadata.json")
        if not os.path.isfile(meta_file):
            # No metadata shipped with the embedding: create and persist a stub.
            entry = create_metadata_stub(emb_path, shared_subpath)
            save_json(entry, meta_file)
        else:
            entry = load_json(meta_file)
            entry["path"] = emb_path
        collected.append(entry)
def get_result(self, embs, path_dataset,
               path_output='/tmp/text_classification/'):
    """Train a text classifier on top of ``embs`` and report its accuracy.

    Args:
        embs: embeddings object; ``embs.matrix`` supplies the word vectors
            (its second dimension is used as the encoder unit size) and
            ``embs.vocabulary.dic_words_ids`` the vocabulary.
        path_dataset (str): one of the named datasets ('dbpedia', 'imdb.*',
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'), or otherwise a local dataset path.
        path_output (str): folder for the trainer output, ``vocab.json``,
            ``args.json`` and the best model snapshot; created if missing.

    Returns:
        dict: ``experiment_setup`` (the instance attribute dict plus
        bookkeeping fields), ``log`` (the loaded trainer log) and ``result``
        (validation accuracy of the last log entry).

    Raises:
        ValueError: if ``self.model`` is not 'rnn', 'cnn' or 'bow'.
    """
    self.out = path_output
    self.unit = embs.matrix.shape[1]
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path_output, exist_ok=True)

    # Load a dataset
    self.path_dataset = path_dataset
    word_ids = embs.vocabulary.dic_words_ids
    if self.path_dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=self.char_based,
            vocab=word_ids,
            shrink=self.shrink)
    elif self.path_dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=self.path_dataset.endswith('.fine'),
            char_based=self.char_based,
            vocab=word_ids,
            shrink=self.shrink)
    elif self.path_dataset in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            self.path_dataset,
            char_based=self.char_based,
            vocab=word_ids,
            shrink=self.shrink)
    else:
        # finally, if file is not downloadable, load from local path
        train, test, vocab = text_datasets.get_dataset_from_path(
            path_dataset,
            vocab=word_ids,
            char_based=self.char_based,
            shrink=self.shrink)

    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len({int(d[1]) for d in train})
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    else:
        # Previously fell through and failed with an UnboundLocalError.
        raise ValueError(
            "Unsupported model {!r}; expected 'rnn', 'cnn' or 'bow'".format(
                self.model))
    encoder = Encoder(n_layers=self.layer, n_vocab=len(vocab),
                      n_units=self.unit, dropout=self.dropout,
                      wv=embs.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       converter=convert_seq,
                                       device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, model, converter=convert_seq,
                             device=self.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting (self.out was created above)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    # NOTE: this is the instance's own attribute dict, not a copy, so the
    # keys written below also become attributes of ``self``.
    experiment_setup = self.__dict__
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['log'] = load_json(os.path.join(self.out, 'log'))
    result['result'] = result['log'][-1]['validation/main/accuracy']
    return result
def get_result(self, embeddings, path_dataset,
               path_output='/tmp/text_classification/'):
    """Train a text classifier on top of ``embeddings`` and report accuracy.

    If ``path_dataset`` contains an ``adapter.py`` file, it is imported and
    its ``Adapter().read()`` supplies the train/test data, which are then
    converted with ``nlp_utils.transform_to_array``. Otherwise the dataset is
    read with ``text_datasets.get_dataset_from_path``.

    Args:
        embeddings: embeddings object; ``embeddings.matrix`` supplies the
            word vectors (its second dimension is used as the encoder unit
            size), ``embeddings.vocabulary.dic_words_ids`` the vocabulary and
            ``embeddings.metadata`` is copied into the result.
        path_dataset (str): local path of the dataset folder.
        path_output (str): folder for the trainer output, ``vocab.json``,
            ``args.json`` and the best model snapshot; created if missing.

    Returns:
        list: a single-element list with a dict holding ``experiment_setup``,
        ``log`` (the loaded trainer log) and ``result``
        (``{"accuracy": <best validation accuracy over all log entries>}``).

    Raises:
        ValueError: if ``self.model`` is not 'rnn', 'cnn' or 'bow'.
    """
    self.out = path_output
    self.unit = embeddings.matrix.shape[1]
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path_output, exist_ok=True)

    # TODO: move this to protonn ds management
    # The branches for downloadable datasets ('dbpedia', 'imdb.*', 'TREC',
    # 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa', 'rt-polarity', 'subj')
    # are disabled here; datasets are always loaded from a local path.
    self.path_dataset = path_dataset
    print(path_dataset)
    path_adapter = os.path.join(path_dataset, "adapter.py")
    if os.path.isfile(path_adapter):
        # Dataset ships its own reader: import it from the dataset folder.
        spec = importlib.util.spec_from_file_location(
            "ds_adapter", path_adapter)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        adapter = module.Adapter()
        train, test, _ = adapter.read()
        vocab = embeddings.vocabulary.dic_words_ids
        train = nlp_utils.transform_to_array(train, vocab)
        test = nlp_utils.transform_to_array(test, vocab)
    else:
        train, test, vocab = text_datasets.get_dataset_from_path(
            path_dataset,
            vocab=embeddings.vocabulary.dic_words_ids,
            char_based=self.char_based,
            shrink=self.shrink)

    print('# cnt train samples: {}'.format(len(train)))
    print('# cnt test samples: {}'.format(len(test)))
    print('# size vocab: {}'.format(len(vocab)))
    n_class = len({int(d[1]) for d in train})
    print('# cnt classes: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    else:
        # Previously fell through and failed with an UnboundLocalError.
        raise ValueError(
            "Unsupported model {!r}; expected 'rnn', 'cnn' or 'bow'".format(
                self.model))
    encoder = Encoder(n_layers=self.layer, n_vocab=len(vocab),
                      n_units=self.unit, dropout=self.dropout,
                      wv=embeddings.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       converter=nlp_utils.convert_seq,
                                       device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, model,
                             converter=nlp_utils.convert_seq,
                             device=self.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
            'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting (self.out was created above)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    # NOTE: this is the instance's own attribute dict, not a copy, so the
    # keys written below also become attributes of ``self``.
    experiment_setup = self.__dict__
    # TODO: move all this to the parent class
    experiment_setup['task'] = "text classification"
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['experiment_setup']['default_measurement'] = 'accuracy'
    result['experiment_setup']['dataset'] = os.path.basename(
        os.path.normpath(path_dataset))
    result['experiment_setup']['method'] = self.model
    result['experiment_setup']['embeddings'] = embeddings.metadata
    result['log'] = load_json(os.path.join(self.out, 'log'))

    # TODO: old version was returning last test value, make a footnote
    # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
    accuracy = max(entry["validation/main/accuracy"]
                   for entry in result['log'])
    result['result'] = {"accuracy": accuracy}
    return [result]
def __init__(self, experiment_name=config["experiments"]["experiment_name"],
             extra_metadata=None,
             overwrite=config["experiments"]["overwrite"],
             ld_scores="main",
             output_dir=os.path.join(config["path_to_resources"],
                                     "experiments")):
    """ Annotating pre-computed top *n* neighbors for a given vocab sample

    Args:
        experiment_name (str): the human-readable name for the
            current experiment, which will be used to make a subfolder
            storing the generated data. If None, the folder will be simply
            timestamped.
        extra_metadata (dict): any extra fields to be added to the
            experiment metadata (overwriting any previously existing
            fields)
        output_dir (str): the *existing* path for saving the *subfolder*
            named with the specified experiment_name, where the output
            data and metadata.json file will be saved.
        overwrite (bool): if True, any previous data for the same
            experiment will be overwritten, and the experiment will be
            re-started.
        ld_scores (str or list of str): "all" for all supported scores,
            "main" (the default) for all scores except
            "ShortestPathMedian", "URLs", "Filenames", "Hashtags" and
            "Noise" (and, when no corpus is configured, also
            "NonCooccurring", "LowFreqNeighbors", "HighFreqNeighbors" and
            "GDeps"), or a list of ld_scores. Supported values are:

                - "SharedPOS",
                - "SharedMorphForm",
                - "SharedDerivation",
                - "NonCooccurring",
                - "CloseNeighbors",
                - "FarNeighbors",
                - "LowFreqNeighbors",
                - "HighFreqNeighbors",
                - "GDeps",
                - "Associations",
                - "ShortestPathMedian",
                - "CloseInOntology",
                - "Synonyms",
                - "Antonyms",
                - "Meronyms",
                - "Hyponyms",
                - "Hypernyms",
                - "OtherRelations",
                - "Numbers",
                - "ProperNouns",
                - "Misspellings",
                - "URLs",
                - "Filenames",
                - "ForeignWords",
                - "Hashtags",
                - "Noise".

            A "Model" column is always included in the output.

    Returns:
        (None): a table with ld scores for all available variables,
            together with the experiment metadata.

    Raises:
        IOError: if the metadata of the annotated neighborhood files is
            not found.
        ValueError: if ``ld_scores`` is neither "all", "main", nor a list of
            supported values.
    """
    super(LDScoring, self).__init__(
        experiment_name=experiment_name, extra_metadata=extra_metadata,
        overwrite=overwrite, embeddings=None, output_dir=output_dir,
        dataset=None, experiment_subfolder="analysis")

    self.metadata["task"] = "ld_scores_analysis"
    self.metadata["uuid"] = str(uuid.uuid4())
    self._load_dataset(dataset=None)

    # The annotation step's metadata is looked up in the sibling
    # "neighbors_annotated" location derived from this output folder.
    neighbors_metadata_path = self.output_dir.replace(
        "analysis", "neighbors_annotated")
    neighbors_metadata_path = os.path.join(neighbors_metadata_path,
                                           "metadata.json")
    if not os.path.isfile(neighbors_metadata_path):
        raise IOError("The metadata for the annotated neighborhood files "
                      "was not found at " + neighbors_metadata_path)
    neighbors_metadata = load_json(neighbors_metadata_path)

    self.metadata["embeddings"] = neighbors_metadata["embeddings"]
    self.metadata["annotation"] = neighbors_metadata
    # Embeddings are already stored at the top level of the metadata.
    del self.metadata["annotation"]["embeddings"]

    self.embeddings = []
    for embedding in self.metadata["embeddings"]:
        self.embeddings.append(embedding["path"])

    self.supported_vars = [
        "SharedPOS", "SharedMorphForm", "SharedDerivation",
        "NonCooccurring", "GDeps", "TargetFrequency", "NeighborFrequency",
        "Associations", "ShortestPath", "Synonyms", "Antonyms", "Meronyms",
        "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
        "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
        "Hashtags", "Noise"
    ]

    self.continuous_vars = [
        'ShortestPath', 'TargetFrequency', 'NeighborFrequency',
        'CloseNeighbors', 'FarNeighbors'
    ]

    self.binary_vars = [x for x in self.supported_vars
                        if x not in self.continuous_vars]

    output_vars = [
        "Model", "SharedPOS", "SharedMorphForm", "SharedDerivation",
        "NonCooccurring", "CloseNeighbors", "FarNeighbors",
        "LowFreqNeighbors", 'HighFreqNeighbors', "GDeps", "Associations",
        "ShortestPathMedian", "CloseInOntology", "Synonyms", "Antonyms",
        "Meronyms", "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
        "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
        "Hashtags", "Noise"
    ]

    # corpus_specific = ["NonCooccurring", "LowFreqNeighbors",
    #                    "HighFreqNeighbors"]
    #
    # if not config["corpus"]:
    #     output_vars = [x for x in output_vars if not x in corpus_specific]

    output_scores_error = "The ld_scores argument is invalid. It should " \
                          "be 'all' for all supported relations, " \
                          "or a list with one or more of the following " \
                          "values:\n" + ", ".join(output_vars)

    if ld_scores == "all":
        self.output_vars = output_vars
    elif ld_scores == "main":
        exclude = [
            "ShortestPathMedian", "URLs", "Filenames", "Hashtags", "Noise"
        ]
        if not config["corpus"]:
            exclude += [
                "NonCooccurring", "LowFreqNeighbors", 'HighFreqNeighbors',
                "GDeps"
            ]
        self.output_vars = [x for x in output_vars if x not in exclude]
    elif isinstance(ld_scores, list):
        unsupported = [x for x in ld_scores if x not in output_vars]
        if unsupported:
            raise ValueError(output_scores_error)
        # "Model" is always prepended, so skip it in the requested scores to
        # avoid listing it twice when the caller passes it explicitly.
        self.output_vars = ["Model"] + [
            x for x in output_vars if x in ld_scores and x != "Model"
        ]
    else:
        raise ValueError(output_scores_error)

    self.metadata["ld_scores"] = self.output_vars

    self.message = None  # "\n Annotation done! Analyzing the data now."
def __init__(self, experiment_name=config["experiments"]["experiment_name"],
             extra_metadata=None,
             overwrite=config["experiments"]["overwrite"],
             ld_scores="main",
             output_dir=os.path.join(config["path_to_resources"],
                                     "experiments"),
             ldt_analyzer=None,
             multiprocessing=config["experiments"]["multiprocessing"],
             debugging=False):
    """ Annotating pre-computed top *n* neighbors for a given vocab sample

    Args:
        experiment_name (str): the human-readable name for the
            current experiment, which will be used to make a subfolder
            storing the generated data. If None, the folder will be simply
            timestamped.
        extra_metadata (dict): any extra fields to be added to the
            experiment metadata (overwriting any previously existing
            fields)
        output_dir (str): the *existing* path for saving the *subfolder*
            named with the specified experiment_name, where the output
            data and metadata.json file will be saved.
        overwrite (bool): if True, any previous data for the same
            experiment will be overwritten, and the experiment will be
            re-started.
        ldt_analyzer: :class:`~ldt.relations.pair.RelationsInPair`
            instance, with lexicographic, morphological and normalization
            resources set up as desired (see tutorial and class
            documentation). If None, default settings for English will be
            used.
        multiprocessing: stored in the experiment metadata under
            "multiprocessing".
        debugging: stored in the experiment metadata under "debugging".
        ld_scores (str or list of str): "all" for all supported scores,
            "main" (the default) for all scores except "ShortestPath",
            "URLs", "Filenames", "Hashtags" and "Noise" (and, when no
            corpus is configured, also "NonCooccurring", "GDeps",
            "TargetFrequency" and "NeighborFrequency"), or a list of
            ld_scores. Supported values are:

                - "SharedPOS",
                - "SharedMorphForm",
                - "SharedDerivation",
                - "NonCooccurring",
                - "GDeps",
                - "TargetFrequency",
                - "NeighborFrequency",
                - "Associations",
                - "ShortestPath",
                - "Synonyms",
                - "Antonyms",
                - "Meronyms",
                - "Hyponyms",
                - "Hypernyms",
                - "OtherRelations",
                - "Numbers",
                - "ProperNouns",
                - "Misspellings",
                - "URLs",
                - "Filenames",
                - "ForeignWords",
                - "Hashtags",
                - "Noise".

            When no corpus is configured, "NonCooccurring",
            "TargetFrequency" and "NeighborFrequency" are not supported.

            See more details for these scores `here
            <http://ldtoolkit.space/ldscores/>`_.

    Returns:
        (None): the annotated neighbors file will be written to disk
            together with the experiment metadata.

    Raises:
        IOError: if the metadata of the neighborhood generation task is not
            found.
        ValueError: if ``ld_scores`` is neither "all", "main", nor a list of
            supported values.
    """
    super(AnnotateVectorNeighborhoods, self).__init__(
        experiment_name=experiment_name, extra_metadata=extra_metadata,
        overwrite=overwrite, embeddings=None, output_dir=output_dir,
        dataset=None, experiment_subfolder="neighbors_annotated")

    self.metadata["task"] = "annotate_neighbors"
    self.metadata["uuid"] = str(uuid.uuid4())
    self.metadata["ldt_config"] = config
    self.metadata["output_dir"] = self.output_dir
    self.metadata["debugging"] = debugging
    self.metadata["multiprocessing"] = multiprocessing
    self._load_dataset(dataset=None)

    # The neighbor-extraction step's metadata is looked up in the sibling
    # "neighbors" location derived from this output folder.
    neighbors_metadata_path = self.output_dir.replace(
        "neighbors_annotated", "neighbors")
    neighbors_metadata_path = os.path.join(neighbors_metadata_path,
                                           "metadata.json")
    if not os.path.isfile(neighbors_metadata_path):
        raise IOError("The metadata for the neighborhood generation task "
                      "was not found at " + neighbors_metadata_path)
    self.metadata["neighbors_metadata_path"] = neighbors_metadata_path
    neighbors_metadata = load_json(neighbors_metadata_path)
    self.metadata["embeddings"] = neighbors_metadata["embeddings"]

    self.embeddings = []
    for embedding in self.metadata["embeddings"]:
        self.embeddings.append(embedding["path"])

    self.message = "\n\nStarting LD annotation. This will take a while " \
                   "for " \
                   "the first files, but the remainder should go faster, " \
                   "because many neighbor pairs will be the same."

    # self.metadata["failed_pairs"] = []
    self.metadata["missed_pairs"] = []
    self.metadata["total_pairs"] = 0

    self.supported_vars = [
        "SharedPOS", "SharedMorphForm", "SharedDerivation",
        "NonCooccurring", "GDeps", "TargetFrequency", "NeighborFrequency",
        "Associations", "ShortestPath", "Synonyms", "Antonyms", "Meronyms",
        "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
        "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
        "Hashtags", "Noise"
    ]

    self.continuous_vars = [
        'ShortestPath', 'TargetFrequency', 'NeighborFrequency'
    ]

    corpus_specific = [
        "NonCooccurring", "TargetFrequency", "NeighborFrequency"
    ]
    if not config["corpus"]:
        # The original loop rebound its loop variable and tested the whole
        # list against corpus_specific, so nothing was ever removed.
        # Filter the attributes themselves.
        self.supported_vars = [x for x in self.supported_vars
                               if x not in corpus_specific]
        self.continuous_vars = [x for x in self.continuous_vars
                                if x not in corpus_specific]

    self.binary_vars = [x for x in self.supported_vars
                        if x not in self.continuous_vars]

    ld_scores_error = "The ld_scores argument is invalid. It should be " \
                      "'all' for all supported relations, or a list with " \
                      "one or more of the following values:\n" + \
                      ", ".join(self.supported_vars)

    if ld_scores == "all":
        self._ld_scores = self.supported_vars
    elif ld_scores == "main":
        exclude = [
            "ShortestPath", "URLs", "Filenames", "Hashtags", "Noise"
        ]
        if not config["corpus"]:
            exclude += [
                "NonCooccurring", "GDeps", "TargetFrequency",
                "NeighborFrequency"
            ]
        self._ld_scores = [
            x for x in self.supported_vars if x not in exclude
        ]
    elif isinstance(ld_scores, list):
        unsupported = [
            x for x in ld_scores if x not in self.supported_vars
        ]
        if unsupported:
            raise ValueError(ld_scores_error)
        # Keep the canonical order of supported_vars.
        self._ld_scores = [
            x for x in self.supported_vars if x in ld_scores
        ]
    else:
        raise ValueError(ld_scores_error)

    self.metadata["ld_scores"] = self._ld_scores
    self.metadata["continuous_vars"] = self.continuous_vars
    self.metadata["binary_vars"] = self.binary_vars
    self.ldt_analyzer = ldt_analyzer