def _adaptive_grouping(self, dataset: Dataset):
    """
    Greedily splits ``dataset.instances`` (in order) into batches that stay within the
    configured memory budget.

    An instance is tentatively added to the current batch and the per-key maximum padding
    lengths are updated.  If ``len(batch) * self._padding_memory_scaling(lengths)`` exceeds
    ``self._adaptive_memory_usage_constant``, or the batch grows past
    ``self._maximum_batch_size``, the instance is removed again, the batch is closed, and the
    instance starts the next batch.

    Returns
    -------
    A list of batches, each a non-empty list of instances.  An instance that is over budget
    on its own still gets a batch to itself; an empty dataset yields an empty list.
    """
    batches = []
    current_batch = []
    current_lengths = defaultdict(dict)  # type: Dict[str, Dict[str, int]]

    def close_batch(batch):
        # A batch can be empty when the very first instance of a batch is already over
        # budget (it is popped straight away), or when the dataset has no instances.
        # Empty batches are never emitted.
        if not batch:
            return
        if logger.getEffectiveLevel() <= logging.DEBUG:
            # Only pay for computing the padding lengths when they will be logged.
            padding_lengths = Dataset(batch).get_padding_lengths()
            logger.debug("Batch size: %d; padding: %s", len(batch), padding_lengths)
        batches.append(batch)

    logger.debug("Creating adaptive groups")
    for instance in dataset.instances:
        current_batch.append(instance)
        instance_lengths = instance.get_padding_lengths()
        for field_name, field_lengths in instance_lengths.items():
            for key, length in field_lengths.items():
                current_lengths[field_name][key] = max(length,
                                                       current_lengths[field_name].get(key, -1))
        big_o_memory_constant = self._padding_memory_scaling(current_lengths)
        over_budget = len(current_batch) * big_o_memory_constant > self._adaptive_memory_usage_constant
        if over_budget or len(current_batch) > self._maximum_batch_size:
            current_batch.pop()
            close_batch(current_batch)
            current_batch = [instance]
            # Start the new running maxima from a *copy* of this instance's lengths, so that
            # later updates never write into the dictionary returned by the instance.
            current_lengths = defaultdict(dict)
            for field_name, field_lengths in instance_lengths.items():
                current_lengths[field_name] = dict(field_lengths)
    close_batch(current_batch)
    return batches
def get_answer():
    """
    Answers the question supplied in the request's query string.

    Reads the ``context`` and ``question`` request arguments, runs them through ``model`` as
    a single-instance batch, and returns a JSON response whose ``answer`` entry is the
    passage substring covered by the best predicted span.
    """
    # Both arguments default to the empty string when absent from the request.
    passage_text = request.args.get("context", "", type=str)
    question_text = request.args.get("question", "", type=str)
    instance = squad_reader.text_to_instance(question_text=question_text,
                                             passage_text=passage_text)

    # Wrap the instance in a one-element dataset so it can be indexed and tensorised.
    single_instance_dataset = Dataset([instance])
    single_instance_dataset.index_instances(train_vocab)
    device = 0 if cuda else -1
    batch = single_instance_dataset.as_tensor_dict(cuda_device=device, for_training=False)

    passage_tokens = batch["passage"]["tokens"]
    question_tokens = batch["question"]["tokens"]
    metadata = batch.get("metadata", {})

    # The model produces start / end logits, from which the best span is chosen.
    output_dict = model(passage_tokens, question_tokens)
    best_span = get_best_span(output_dict["start_logits"], output_dict["end_logits"])

    # Translate the predicted token span into character offsets of the original passage.
    original_passage = metadata[0]['original_passage']
    token_offsets = metadata[0]['token_offsets']
    span = tuple(best_span[0].data.cpu().numpy())
    first_char = token_offsets[span[0]][0]
    last_char = token_offsets[span[1]][1]

    return jsonify(answer=original_passage[first_char:last_char])
def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Runs a list of :class:`~allennlp.data.instance.Instance` objects through the model as
    one batch.

    The instances are indexed with ``self.vocab``, converted to arrays, and passed through
    :func:`self.forward()` followed by :func:`self.decode()`.  Every output that is a
    ``torch.autograd.Variable`` is converted to a numpy array, and each output is then split
    along its first dimension so that one dictionary per instance is returned.

    Parameters
    ----------
    instances : the instances to batch together.
    cuda_device : forwarded to ``arrays_to_variables``.

    Returns
    -------
    A list with one ``{output_name: value}`` dictionary per input instance.
    """
    dataset = Dataset(instances)
    dataset.index_instances(self.vocab)
    model_input = arrays_to_variables(dataset.as_array_dict(),
                                      cuda_device=cuda_device,
                                      for_training=False)
    outputs = self.decode(self.forward(**model_input))

    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    # Iterate over a snapshot of the keys, because ``outputs`` is written to in the loop.
    for name in list(outputs):
        value = outputs[name]
        if isinstance(value, torch.autograd.Variable):
            value = value.data.cpu().numpy()
        outputs[name] = value
        # Iterating over ``value`` yields one element per batch entry.
        for per_instance, batch_element in zip(instance_separated_output, value):
            per_instance[name] = batch_element
    return instance_separated_output
def test_elmo(self):
    """
    Shallow smoke test: ``Elmo`` runs on a small batch and returns outputs of the expected
    shapes.  Correctness is covered by the ``ElmoBiLm`` and ``ScalarMix`` tests.
    """
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo = Elmo(options_file, weight_file, 2)

    indexer = ELMoTokenCharactersIndexer()
    sentences = [['The', 'sentence', '.'],
                 ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]

    # One instance per sentence, each holding a single character-indexed TextField.
    instances = [
            Instance({'elmo': TextField([Token(word) for word in sentence],
                                        {'character_ids': indexer})})
            for sentence in sentences
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())
    character_ids = dataset.as_array_dict()['elmo']['character_ids']

    output = elmo(Variable(torch.from_numpy(character_ids)))
    elmo_representations = output['elmo_representations']
    mask = output['mask']

    # Two representations were requested; the batch has 2 sentences padded to 7 tokens.
    assert len(elmo_representations) == 2
    for representation in elmo_representations:
        assert list(representation.size()) == [2, 7, 32]
    assert list(mask.size()) == [2, 7]
def test_elmo_bilm(self):
    """
    Checks ``_ElmoBiLm`` against stored reference embeddings: for every batch, the mask
    lengths must match the sentence lengths and the top-layer activations must match the
    expected embeddings to within ``1e-6``.
    """
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo_bilm = _ElmoBiLm(options_file, weight_file)

    indexer = ELMoTokenCharactersIndexer()

    # ``zip(*sentences)`` regroups the sentences so that consecutive runs of instances
    # form the batches that are iterated over below.
    instances = [
            Instance({"elmo": TextField([Token(word) for word in sentence.split()],
                                        {'character_ids': indexer})})
            for group in zip(*sentences)
            for sentence in group
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())

    iterator = BasicIterator(3)
    for batch_index, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
        character_ids = Variable(torch.from_numpy(batch['elmo']['character_ids']))
        lm_embeddings = elmo_bilm(character_ids)
        top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2],
                lm_embeddings['mask'])

        # The mask must cover exactly the tokens of each sentence.
        lengths = mask.data.numpy().sum(axis=1)
        expected_lengths = [len(sentences[k][batch_index].split()) for k in range(3)]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # The unpadded part of the top layer must match the reference embeddings.
        for k in range(3):
            actual = top_layer_embeddings[k, :lengths[k], :].data.numpy()
            expected = expected_lm_embeddings[k][batch_index]
            self.assertTrue(numpy.allclose(actual, expected, atol=1.0e-6))
def _sentences_to_ids(self, sentences):
    """
    Converts tokenized sentences (each an iterable of token strings) into the
    ``character_ids`` tensor produced by ``ELMoTokenCharactersIndexer``.
    """
    indexer = ELMoTokenCharactersIndexer()

    # One instance per sentence, each with a single TextField named ``elmo``.
    instances = [
            Instance({'elmo': TextField([Token(word) for word in sentence],
                                        {'character_ids': indexer})})
            for sentence in sentences
    ]

    dataset = Dataset(instances)
    dataset.index_instances(Vocabulary())
    tensors = dataset.as_tensor_dict()
    return tensors['elmo']['character_ids']
def read(self, file_path: str):
    """
    Reads a file in ARC jsonl format (one JSON object per line) into a ``Dataset``.

    Each line must provide ``id``, ``question.stem``, ``question.choices`` (a list of
    objects with ``label`` and ``text``) and ``answerKey`` (the label of the correct
    choice).  The answer is passed to ``text_to_instance`` as the position of that choice.

    Parameters
    ----------
    file_path : a path or URL; it is resolved through ``cached_path`` first.
    """
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, 'r') as data_file:
        logger.info("Reading instances in ARC jsonl format from dataset at: %s", file_path)
        for line in data_file:
            item_json = json.loads(line.strip())
            item_id = item_json["id"]
            question_text = item_json["question"]["stem"]
            choices = item_json["question"]["choices"]

            # Position of each choice, keyed by its label, and the choice texts in order.
            choice_label_to_id = {choice["label"]: position
                                  for position, choice in enumerate(choices)}
            choice_text_list = [choice["text"] for choice in choices]

            answer_id = choice_label_to_id[item_json["answerKey"]]
            instance = self.text_to_instance(item_id, question_text, choice_text_list, answer_id)
            instances.append(instance)

    return Dataset(instances)
def setUp(self):
    """
    Builds the fixture shared by the vocabulary tests: one instance whose ``text`` field
    contains "a" four times, "b" twice and "c" three times, plus a dataset wrapping it.
    """
    words = ["a", "a", "a", "a", "b", "b", "c", "c", "c"]
    indexers = {"tokens": SingleIdTokenIndexer("tokens")}
    text_field = TextField([Token(word) for word in words], indexers)

    self.instance = Instance({"text": text_field})
    self.dataset = Dataset([self.instance])
    super(TestVocabulary, self).setUp()
def _sort_dataset_by_padding(dataset: Dataset,
                             sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
                             padding_noise: float = 0.0) -> Dataset:
    """
    Returns a new ``Dataset`` whose instances are ordered by their padding lengths.

    Parameters
    ----------
    dataset : the dataset whose instances are sorted.
    sorting_keys : ``(field_name, padding_key)`` tuples; instances are compared on the
        corresponding padding lengths, in the order the keys are given.
    padding_noise : when greater than zero, every padding length is passed through
        ``add_noise_to_dict_values`` with this value before sorting.
    """
    def sort_values(instance):
        # The list of padding lengths this instance is ordered by.
        lengths = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
        if padding_noise > 0.0:
            lengths = {field_name: add_noise_to_dict_values(field_lengths, padding_noise)
                       for field_name, field_lengths in lengths.items()}
        return [lengths[field_name][padding_key] for field_name, padding_key in sorting_keys]

    keyed_instances = [(sort_values(instance), instance) for instance in dataset.instances]
    # Sort on the lengths only; the instances themselves are never compared.
    keyed_instances.sort(key=lambda pair: pair[0])
    return Dataset([pair[-1] for pair in keyed_instances])
def ensure_batch_predictions_are_consistent(self):
    """
    Asserts that the model predicts the same thing for an instance whether it is run alone
    or as part of the full batch.

    Every instance of ``self.dataset`` is first run through ``self.model`` on its own, then
    the whole dataset is run as one batch, and the outputs are compared key by key.  The
    ``loss`` key is skipped.  ``Variable`` outputs are compared with ``assert_allclose``
    (``atol=1e-6``); everything else must compare equal.
    """
    self.model.eval()

    # Predictions for each instance in a batch of its own.
    single_predictions = []
    for instance in self.dataset.instances:
        dataset = Dataset([instance])
        arrays = dataset.as_array_dict(dataset.get_padding_lengths(), verbose=False)
        variables = arrays_to_variables(arrays, for_training=False)
        single_predictions.append(self.model.forward(**variables))

    # Predictions for all instances in a single batch.
    batch_arrays = self.dataset.as_array_dict(self.dataset.get_padding_lengths(), verbose=False)
    batch_variables = arrays_to_variables(batch_arrays, for_training=False)
    batch_predictions = self.model.forward(**batch_variables)

    tolerance = 1e-6
    for i, instance_predictions in enumerate(single_predictions):
        for key, single_predicted in instance_predictions.items():
            if key == 'loss':
                # Loss is particularly unstable; being close on everything else is enough.
                continue
            single_predicted = single_predicted[0]
            batch_predicted = batch_predictions[key][i]

            if not isinstance(single_predicted, torch.autograd.Variable):
                assert single_predicted == batch_predicted, key
                continue

            if single_predicted.size() != batch_predicted.size():
                # Probably a sequence model whose batched output carries padded elements.
                # Only the easy cases are handled: trim the first dimension of a 1-d or
                # 2-d output down to the length of the unbatched prediction.
                num_tokens = single_predicted.size(0)
                if batch_predicted.dim() == 1:
                    batch_predicted = batch_predicted[:num_tokens]
                elif batch_predicted.dim() == 2:
                    batch_predicted = batch_predicted[:num_tokens, :]
                else:
                    raise NotImplementedError
            assert_allclose(single_predicted.data.numpy(),
                            batch_predicted.data.numpy(),
                            atol=tolerance,
                            err_msg=key)
def read(self, file_path):
    """
    Reads a tab-separated tagged corpus into a ``Dataset``.

    The file is parsed with ``TaggedCorpusReader``: one token per line (``RegexpTokenizer``
    splitting on newlines), sentences separated by blank lines, and the whole file read as
    a single block.  Each tagged sentence is transposed into per-column tuples, which are
    passed positionally to ``self.text_to_instance``.
    """
    logger.info('Reading instances from file %s', file_path)
    directory, filename = os.path.split(file_path)
    reader = TaggedCorpusReader(directory,
                                filename,
                                sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])

    instances = []
    for tagged_sent in reader.tagged_sents():
        # Transpose the per-token tuples into one tuple per column.
        columns = tuple(zip(*tagged_sent))
        instances.append(self.text_to_instance(*columns))
    return Dataset(instances)
def read(self, file_path: str):
    """
    Reads a plain-text file into a ``Dataset`` of language-modelling instances.

    If ``self._tokens_per_instance`` is set, all lines are joined, tokenized once, and cut
    into consecutive chunks of exactly that many tokens; a shorter trailing remainder is
    discarded.  Otherwise every line of the file is tokenized into its own instance.

    Every token sequence is wrapped in ``self._start_token`` / ``self._end_token``.  Each
    instance has ``input_tokens`` (all but the last token) and ``output_tokens`` (all but
    the first token).

    Parameters
    ----------
    file_path : path of the text file to read.
    """
    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance
        # The ``+ 1`` makes the last valid start index inclusive.  Without it the final
        # full-size chunk is dropped whenever the number of tokens is an exact multiple of
        # ``num_tokens`` (and a text of exactly ``num_tokens`` tokens yields no instance).
        last_start = len(tokenized_text) - num_tokens
        tokenized_strings = [tokenized_text[index:index + num_tokens]
                             for index in range(0, last_start + 1, num_tokens)]
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    # TODO(matt): this isn't quite right, because you really want to split on sentences,
    # tokenize the sentences, add the start and end tokens per sentence, then change the
    # tokens per instance if desired.  But, we can fix that later, if someone actually wants
    # to use this for language modeling.  This is just another example of how to use the
    # data reader code, for now.
    tokenized_strings = [[self._start_token] + x + [self._end_token] for x in tokenized_strings]

    # No matter how the input is represented, the output is always represented as a single
    # token id: reuse the first SingleIdTokenIndexer among the configured indexers, or fall
    # back to a fresh one under the name "tokens" if there is none.
    output_indexer = None  # type: Dict[str, TokenIndexer]
    for name, indexer in self._token_indexers.items():
        if isinstance(indexer, SingleIdTokenIndexer):
            output_indexer = {name: indexer}
            break
    else:
        output_indexer = {"tokens": SingleIdTokenIndexer()}

    instances = []
    for tokenized_string in tokenized_strings:
        # The output is the input shifted by one token.
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], output_indexer)
        instances.append(Instance({'input_tokens': input_field,
                                   'output_tokens': output_field}))
    return Dataset(instances)
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch of padded character ids.

    Uses the module-level ``indexer`` to index every token under the ``character_ids`` key.
    """
    instances = [
            Instance({"elmo": TextField([Token(word) for word in sentence],
                                        {'character_ids': indexer})})
            for sentence in batch
    ]

    dataset = Dataset(instances)
    vocab = Vocabulary()
    # Each instance is indexed directly rather than through ``dataset.index_instances``.
    for instance in dataset.instances:
        instance.index_fields(vocab)

    tensors = dataset.as_tensor_dict()
    return tensors['elmo']['character_ids']
def read(self, file_path):
    """
    Reads a sequence-tagging file into a ``Dataset``.

    Each line is one sequence: tab-separated ``token###tag`` pairs.  Every instance has a
    ``tokens`` ``TextField`` and a ``tags`` ``TagField`` tied to that text field.
    """
    instances = []
    with open(file_path, "r") as data_file:
        for line in data_file:
            tokens = []
            tags = []
            for pair in line.strip("\n").split("\t"):
                parts = pair.split("###")
                tokens.append(parts[0])
                tags.append(parts[1])

            sequence = TextField(tokens, self._token_indexers)
            sequence_tags = TagField(tags, sequence)
            instances.append(Instance({'tokens': sequence,
                                       'tags': sequence_tags}))
    return Dataset(instances)
def test_saving_and_loading_works_with_byte_encoding(self):
    """
    Round-trips a byte-encoded character vocabulary through disk: a ``TextField`` indexed
    with the original vocabulary and one indexed with the reloaded vocabulary must produce
    identical indices.
    """
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]

    def index_with(vocabulary):
        # Build a fresh field, index it, and return a copy of the resulting indices.
        field = TextField(tokens, {"characters": token_indexer})
        field.index(vocabulary)
        return deepcopy(field._indexed_tokens)  # pylint: disable=protected-access

    # The vocabulary is built from a dataset holding one field over the same tokens.
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Dataset([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_dataset(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    # Save, reload, and index again with the reloaded vocabulary.
    vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    indexed_tokens2 = index_with(vocab2)

    assert indexed_tokens == indexed_tokens2
def setUp(self):
    """
    Builds the fixture shared by the iterator tests: a vocabulary with eight known tokens
    (their indices are stored as ``self.<token>_index``), five instances of differing
    lengths, and a dataset wrapping those instances.
    """
    super(IteratorTest, self).setUp()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}

    self.vocab = Vocabulary()
    add_token = self.vocab.add_token_to_namespace
    self.this_index = add_token('this')
    self.is_index = add_token('is')
    self.a_index = add_token('a')
    self.sentence_index = add_token('sentence')
    self.another_index = add_token('another')
    self.yet_index = add_token('yet')
    self.very_index = add_token('very')
    self.long_index = add_token('long')

    token_lists = [
            ["this", "is", "a", "sentence"],
            ["this", "is", "another", "sentence"],
            ["yet", "another", "sentence"],
            ["this", "is", "a", "very", "very", "very", "very", "long", "sentence"],
            ["sentence"],
    ]
    self.instances = [self.create_instance(tokens) for tokens in token_lists]
    self.dataset = Dataset(self.instances)
def read(self, file_path: str):
    """
    Reads an SNLI-style jsonl file into a ``Dataset``.

    Every line is a JSON object with ``gold_label``, ``sentence1`` and ``sentence2``.  Each
    instance has a ``label`` ``LabelField`` plus tokenized ``premise`` (from ``sentence1``)
    and ``hypothesis`` (from ``sentence2``) ``TextField`` objects.
    """
    def text_field(text):
        # Tokenize ``text`` and wrap it with this reader's token indexers.
        return TextField(self._tokenizer.tokenize(text), self._token_indexers)

    instances = []
    with open(file_path, 'r') as snli_file:
        for line in snli_file:
            example = json.loads(line)
            label_field = LabelField(example["gold_label"])
            premise_field = text_field(example["sentence1"])
            hypothesis_field = text_field(example["sentence2"])
            instances.append(Instance({'label': label_field,
                                       'premise': premise_field,
                                       'hypothesis': hypothesis_field}))
    return Dataset(instances)
def get_dataset(self):
    """
    Returns a small two-instance ``Dataset``; each instance has a ``text1`` and a ``text2``
    ``TextField`` built with ``self.token_indexer``.
    """
    def text_field(words):
        return TextField([Token(word) for word in words], self.token_indexer)

    first_text1 = text_field(["this", "is", "a", "sentence", "."])
    first_text2 = text_field(["this", "is", "a", "different", "sentence", "."])
    second_text1 = text_field(["here", "is", "a", "sentence", "."])
    second_text2 = text_field(["this", "is", "short"])

    return Dataset([
            Instance({"text1": first_text1, "text2": first_text2}),
            Instance({"text1": second_text1, "text2": second_text2}),
    ])
def read(self, file_path: str):
    """
    Reads a SQuAD-format JSON file into a ``Dataset`` for answer-sentence selection.

    In a first pass every paragraph is split into sentences with NLTK, sentences and
    questions are registered in the reader's id maps, and each question is paired with the
    sentence containing its most frequently annotated answer start.  In a second pass,
    ``self._get_sentence_choices`` supplies the candidate sentences for each pair and one
    instance is built with ``question``, ``sentences`` and ``correct_sentence`` fields.

    Raises
    ------
    ValueError
        If an answer start index falls inside none of the paragraph's sentence spans.
    """
    # Imported here because nltk isn't necessary by default.
    import nltk

    # (question_id, answer_sentence_id) pairs, in file order.
    questions = []
    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset = json.load(dataset_file)['data']

    logger.info("Reading the dataset")
    for article in tqdm(dataset):
        for paragraph in article['paragraphs']:
            paragraph_id = len(self._paragraph_sentences)
            self._paragraph_sentences[paragraph_id] = []

            # Newlines are removed from the context before it is split into sentences.
            cleaned_context_article = paragraph["context"].replace("\n", "")
            sentences = nltk.sent_tokenize(cleaned_context_article)

            # Maps (start, end) character spans to sentences; start is inclusive and end
            # is exclusive.
            span_to_sentence = {}
            span_start = 0
            for sentence in sentences:
                sentence_id = len(self._sentence_to_id)
                self._sentence_to_id[sentence] = sentence_id
                self._id_to_sentence[sentence_id] = sentence
                self._sentence_paragraph_map[sentence_id] = paragraph_id
                self._paragraph_sentences[paragraph_id].append(sentence_id)
                # One extra character accounts for the trailing space after punctuation
                # that is stripped by NLTK.
                span_end = span_start + len(sentence) + 1
                span_to_sentence[(span_start, span_end)] = sentence
                span_start = span_end

            for question_answer in paragraph['qas']:
                question_text = question_answer["question"].strip()
                question_id = len(self._question_to_id)
                self._question_to_id[question_text] = question_id
                self._id_to_question[question_id] = question_text

                # There may be several answer annotations; use the start index that
                # occurs most often.
                start_counts = Counter(answer["answer_start"]
                                       for answer in question_answer["answers"])  # type: Counter
                answer_start_index, _ = start_counts.most_common(1)[0]

                # Find the sentence whose span contains the answer start.
                answer_sentence = None
                for (start_span, end_span), candidate in span_to_sentence.items():
                    if start_span <= answer_start_index < end_span:
                        answer_sentence = candidate
                        break
                else:  # no break
                    raise ValueError("Index of answer start was out of bounds. "
                                     "This should never happen, please raise "
                                     "an issue on GitHub.")

                answer_id = self._sentence_to_id[answer_sentence]
                questions.append((question_id, answer_id))

    instances = []
    logger.info("Processing questions into training instances")
    for question_id, answer_id in tqdm(questions):
        sentence_choices, correct_choice = self._get_sentence_choices(question_id, answer_id)
        question_text = self._id_to_question[question_id]

        sentence_fields = [TextField(self._tokenizer.tokenize(sentence), self._token_indexers)
                           for sentence in sentence_choices]  # type: List[Field]
        sentences_field = ListField(sentence_fields)
        question_field = TextField(self._tokenizer.tokenize(question_text),
                                   self._token_indexers)
        # The label is the position of the correct sentence within ``sentences_field``.
        correct_sentence_field = IndexField(correct_choice, sentences_field)
        instances.append(Instance({'question': question_field,
                                   'sentences': sentences_field,
                                   'correct_sentence': correct_sentence_field}))
    return Dataset(instances)
def test_instances_must_have_homogeneous_fields(self):
    """
    Building a ``Dataset`` from instances whose field names differ must raise
    ``ConfigurationError``.
    """
    label_only = Instance({"tag": (LabelField(1, skip_indexing=True))})
    text_only = Instance({"words": TextField([Token("hello")], {})})
    mismatched_instances = [label_only, text_only]
    with pytest.raises(ConfigurationError):
        Dataset(mismatched_instances)
def train_model(db: FeverDocDB,
                params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    Trains a model on FEVER data from a parameter specification and archives the result.

    Standard output, standard error and INFO-level logging are additionally written to
    files inside ``serialization_dir``, together with a JSON dump of the parameters and the
    vocabulary.  The vocabulary is built from the training data and, when a
    ``validation_data_path`` is given, the validation data.

    Note that if you care about reproducibility, random seeds are set at the start of this
    function (``SimpleRandom.set_seeds()``); avoid running code that consumes Pytorch or
    numpy randomness before calling it.

    Parameters
    ----------
    db : FeverDocDB, required.
        Passed to the ``FEVERReader`` that reads the data.
    params : Params, required.
        A parameter object specifying the experiment.
    cuda_device : int
        When not ``None``, overrides ``cuda_device`` in the trainer parameters.
    serialization_dir : str, required
        The directory in which to save results and logs.
    filtering : str
        Passed to the ``FEVERReader`` as ``filtering``.

    Returns
    -------
    The trained model.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror stdout / stderr into log files inside the serialization directory.
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(stdout_log, sys.stdout)  # type: ignore
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    sys.stderr = TeeLogger(stderr_log, sys.stderr)  # type: ignore

    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Record the parameters as given; a copy is used because the original is consumed below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    sentence_level = ds_params.pop("sentence_level", False)
    wiki_tokenizer = Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {}))
    claim_tokenizer = Tokenizer.from_params(ds_params.pop('claim_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {}))
    dataset_reader = FEVERReader(db,
                                 sentence_level=sentence_level,
                                 wiki_tokenizer=wiki_tokenizer,
                                 claim_tokenizer=claim_tokenizer,
                                 token_indexers=token_indexers,
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab_params = params.pop("vocabulary", {})
    vocab_instances = [instance for dataset in all_datasets for instance in dataset.instances]
    vocab = Vocabulary.from_params(vocab_params, Dataset(vocab_instances))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def read(self, file_path: str):
    """
    Reads semantic-role-labelling data from every ``gold_conll`` file below ``file_path``.

    Files are parsed line by line.  Column 3 of a line is the word, and the columns from
    index 11 up to (but excluding) the last one hold one bracketed span annotation per
    annotated verb.  The annotations are converted to BIO labels per verb.  At every blank
    or ``#`` line, the sentence collected so far (if any) is handed to
    ``self._process_sentence`` and its instances are added to the result.
    """
    instances = []

    # State of the sentence currently being collected.
    sentence = []  # type: List[str]
    verbal_predicates = []  # type: List[int]
    predicate_argument_labels = []  # type: List[List[str]]
    current_span_label = []  # type: List[Optional[str]]

    for root, _, files in os.walk(file_path):
        for data_file in files:
            # These are a relic of the dataset pre-processing.  Every file will be
            # duplicated - one file called filename.gold_skel and one generated from the
            # preprocessing called filename.gold_conll.
            if 'gold_conll' not in data_file:
                continue

            with codecs.open(os.path.join(root, data_file), 'r', encoding='utf8') as open_file:
                for line in open_file:
                    line = line.strip()

                    if line == '' or line.startswith("#"):
                        # Conll format data begins and ends with lines containing a hash,
                        # which may or may not occur after an empty line, so there is
                        # only something to emit if a sentence has been collected.
                        if sentence:
                            instances.extend(self._process_sentence(sentence,
                                                                    verbal_predicates,
                                                                    predicate_argument_labels))
                            # Reset everything for the next sentence.
                            sentence = []
                            verbal_predicates = []
                            predicate_argument_labels = []
                            current_span_label = []
                        continue

                    conll_components = line.split()
                    word_index = len(sentence)
                    sentence.append(conll_components[3])

                    if word_index == 0:
                        # First word of a new sentence: create one BIO label list per
                        # verb annotation, and one slot per annotation recording the
                        # label of the span currently open (None when outside a span).
                        annotation_columns = conll_components[11:-1]
                        predicate_argument_labels = [[] for _ in annotation_columns]
                        current_span_label = [None for _ in annotation_columns]

                    is_verbal_predicate = False
                    # Iterate over all verb annotations for the current sentence.
                    for annotation_index, bio_labels in enumerate(predicate_argument_labels):
                        annotation = conll_components[11 + annotation_index]
                        label = annotation.strip("()*")

                        if "(" in annotation:
                            # Entering a span for a particular semantic role label.
                            bio_labels.append("B-" + label)
                            current_span_label[annotation_index] = label
                        elif current_span_label[annotation_index] is not None:
                            # No '(' but a span is open, so we are inside it.
                            bio_labels.append("I-" + current_span_label[annotation_index])
                        else:
                            # We're outside a span.
                            bio_labels.append("O")

                        # Exiting a span, so reset the open label for this annotation.
                        if ")" in annotation:
                            current_span_label[annotation_index] = None

                        # If any annotation marks this word as a verb predicate, its
                        # index is recorded below.  This orders the verbal predicates by
                        # their location in the sentence, automatically aligning them
                        # with the annotations.
                        if "(V" in annotation:
                            is_verbal_predicate = True

                    if is_verbal_predicate:
                        verbal_predicates.append(word_index)

    return Dataset(instances)
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    Trains a model from a parameter specification and archives the result.

    Standard output, standard error and INFO-level logging are additionally written to
    files inside ``serialization_dir``, together with a JSON dump of the parameters and the
    vocabulary.  The vocabulary is built from the training data plus the validation and
    test data when their paths are given.  After training, the model is evaluated on the
    test data if both ``test_data_path`` and ``evaluate_on_test`` are set.

    Note that if you care about reproducibility, you should avoid running code using
    Pytorch or numpy which affect the reproducibility of your experiment before calling
    this function: ``prepare_environment(params)`` is the first thing it does.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : str, required
        The directory in which to save results and logs.

    Returns
    -------
    The trained model.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror stdout / stderr into log files inside the serialization directory.
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(stdout_log, sys.stdout)  # type: ignore
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    sys.stderr = TeeLogger(stderr_log, sys.stderr)  # type: ignore

    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Record the parameters as given; a copy is used because the original is consumed below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")

    test_data = None
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets.append(test_data)
        datasets_in_vocab.append("test")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab_params = params.pop("vocabulary", {})
    vocab_instances = [instance for dataset in all_datasets for instance in dataset.instances]
    vocab = Vocabulary.from_params(vocab_params, Dataset(vocab_instances))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    # Every parameter must have been consumed by now.
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
def generate_features(zipped_annotated_data, feature, feature_details, reader,
                      mithun_logger, objUofaTrainTest, dataset, length_data):
    """
    Build a ``Dataset`` of instances from annotated 9-tuples.

    Every element of ``zipped_annotated_data`` is unpacked as
    ``(he, be, hl, bl, hw, bw, ht, hd, hfc)``. The first six entries are split
    on single spaces and passed to the converter selected by ``feature``;
    ``hfc`` is the label. ``ht`` and ``hd`` are not used by this function.

    Parameters
    ----------
    zipped_annotated_data : iterable of 9-tuples
        The rows to convert. The first six entries of each row must support
        ``.split(" ")``.
    feature : str
        ``"plain_NER"`` selects
        ``objUofaTrainTest.convert_NER_form_per_sent_plain_NER`` and
        ``"smart_NER"`` selects
        ``objUofaTrainTest.convert_SMARTNER_form_per_sent``. For any other
        value the premise and hypothesis are left as empty strings.
    feature_details
        Not used at the moment (see the TODO in the body).
    reader
        Object whose ``text_to_instance(premise, hypothesis, label)`` creates
        one instance.
    mithun_logger
        Logger-like object; ``info`` and ``error`` are called with a single
        pre-formatted message.
    objUofaTrainTest
        Object providing the two converter methods named above.
    dataset : str
        When equal to ``"fnc"``, rows labelled ``"unrelated"`` are skipped and
        the labels ``discuss`` / ``agree`` / ``disagree`` are mapped to
        ``NOT ENOUGH INFO`` / ``SUPPORTS`` / ``REFUTES`` (any other label
        becomes the empty string). For every other dataset the label is
        passed through unchanged.
    length_data
        Not used by this function.

    Returns
    -------
    Dataset
        ``Dataset(instances)`` over all generated instances.

    Notes
    -----
    If no instance is produced, an error is logged and the process is
    terminated through ``sys.exit(1)``.
    """
    mithun_logger.info("got inside generate_features")
    mithun_logger.info(f"value of feature is:{feature}")
    mithun_logger.info(f"value of dataset is:{dataset}")

    # Label translation applied only when ``dataset == "fnc"``.
    fnc_label_map = {
        'discuss': "NOT ENOUGH INFO",
        'agree': "SUPPORTS",
        'disagree': "REFUTES",
    }

    instances = []
    for he, be, hl, bl, hw, bw, _ht, _hd, hfc in zipped_annotated_data:
        label = hfc
        if dataset == "fnc":
            if label == "unrelated":
                continue
            # Labels outside the map keep the empty-string default.
            new_label = fnc_label_map.get(label, "")
        else:
            new_label = label

        # The splits are computed for every row, whichever feature is chosen.
        he_split, be_split, hl_split, bl_split, hw_split, bw_split = (
            text.split(" ") for text in (he, be, hl, bl, hw, bw))

        premise_ann = ""
        hypothesis_ann = ""
        if feature == "plain_NER":
            premise_ann, hypothesis_ann = \
                objUofaTrainTest.convert_NER_form_per_sent_plain_NER(
                    he_split, be_split, hl_split, bl_split, hw_split, bw_split,
                    mithun_logger)
        elif feature == "smart_NER":
            # The third return value (intersection flag) is not needed here.
            premise_ann, hypothesis_ann, _found_intersection = \
                objUofaTrainTest.convert_SMARTNER_form_per_sent(
                    he_split, be_split, hl_split, bl_split, hw_split, bw_split,
                    mithun_logger)

        # TODO: honour the options in ``feature_details`` (person_c1,
        # lower_case_tokens, update_embeddings). This is disabled because
        # boolean values could not be cleanly retrieved from the config file.

        instances.append(
            reader.text_to_instance(premise_ann, hypothesis_ann, new_label))

    if not instances:
        # The previous message carried a '{}' placeholder that was never
        # filled in; report the dataset name so the log line is actionable.
        mithun_logger.error(
            f"No instances were generated for dataset {dataset}. "
            "Is the input data correct?")
        sys.exit(1)

    mithun_logger.info(f"type of instances is :{type(instances)}")
    return Dataset(instances)
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    Train a model on a primary and an auxiliary task from a ``Params``
    specification and return the trained model.

    The function sets up logging into ``serialization_dir``, reads the primary
    and auxiliary datasets, builds and saves a vocabulary, constructs the
    model, the two iterators and a ``MultiTaskTrainer``, runs training,
    archives the result and optionally evaluates on the test sets.

    Note that if you care about reproducibility, you should avoid running code
    using Pytorch or numpy which affects the random state before calling this
    function; ``prepare_environment(params)`` is invoked first so that the
    seeds can come from the JSON specification.

    Parameters
    ----------
    params : Params, required.
        A parameter object specifying the experiment. Keys popped without a
        default: ``dataset_reader``, ``train_data_path``,
        ``dataset_reader_aux``, ``train_data_path_aux``, ``mixing_ratio``,
        ``model``, ``iterator``, ``iterator_aux``, ``trainer``. Keys popped
        with a default: ``aux_sample_fraction`` (1.0),
        ``validation_data_path`` / ``validation_data_path_aux`` (None),
        ``test_data_path`` / ``test_data_path_aux`` (None),
        ``datasets_for_vocab_creation`` (all primary datasets read),
        ``auxillary_datasets_for_vocab_creation`` (all auxiliary datasets
        read), ``vocabulary`` ({}), ``cutoff_epoch`` (-1),
        ``evaluate_on_test`` (False).
    serialization_dir : str, required
        The directory in which to save results and logs. It is created if it
        does not exist.

    Returns
    -------
    Model
        The model instance that was handed to the trainer.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` or
        ``auxillary_datasets_for_vocab_creation`` names a dataset that was
        not read.

    Notes
    -----
    ``sys.stdout`` and ``sys.stderr`` are replaced by ``TeeLogger`` wrappers
    and a file handler is added to the root logger; neither is undone by this
    function.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Snapshot the configuration before any key is popped from ``params``.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.

    # 1. Primary training data.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # 2. Auxiliary training data.
    dataset_reader_aux = DatasetReader.from_params(
        params.pop('dataset_reader_aux'))
    train_data_path_aux = params.pop('train_data_path_aux')
    logger.info("Reading auxilliary training data from %s",
                train_data_path_aux)
    train_data_aux = dataset_reader_aux.read(train_data_path_aux)

    # If only using a fraction of the auxiliary data.
    aux_sample_fraction = params.pop("aux_sample_fraction", 1.0)
    if aux_sample_fraction < 1.0:
        sample_size = int(aux_sample_fraction * len(train_data_aux.instances))
        train_data_aux = Dataset(
            random.sample(train_data_aux.instances, sample_size))

    # Balance the datasets: when the auxiliary set is smaller than the primary
    # one, it is inflated (sampling with replacement) to the primary size.
    # The reverse case ("FN scaffold" in the original comments) is not
    # rebalanced; that branch was disabled in the original code.
    train_size = len(train_data.instances)
    aux_train_size = len(train_data_aux.instances)
    # ``mixing_ratio`` comes from the configuration rather than being derived
    # from the dataset sizes.
    mixing_ratio = params.pop("mixing_ratio")
    if train_size > aux_train_size:  # case for PB scaffold.
        difference = train_size - aux_train_size
        aux_sample = [
            random.choice(train_data_aux.instances) for _ in range(difference)
        ]
        train_data_aux = Dataset(train_data_aux.instances + aux_sample)
        logger.info("Inflating auxiliary train data from %s to %s samples",
                    aux_train_size, len(train_data_aux.instances))

    all_datasets: Dict[str, Dataset] = {"train": train_data}
    all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}

    # 3. Primary validation data.
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    # 4. Auxiliary validation data.
    validation_data_path_aux = params.pop('validation_data_path_aux', None)
    if validation_data_path_aux is not None:
        logger.info("Reading auxilliary validation data from %s",
                    validation_data_path_aux)
        validation_data_aux = dataset_reader_aux.read(validation_data_path_aux)
        all_datasets_aux["validation_aux"] = validation_data_aux
    else:
        validation_data_aux = None

    # 5. Primary test data.
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    # 6. Auxiliary test data.
    test_data_path_aux = params.pop("test_data_path_aux", None)
    if test_data_path_aux is not None:
        logger.info("Reading auxillary test data from %s", test_data_path_aux)
        test_data_aux = dataset_reader_aux.read(test_data_path_aux)
        all_datasets_aux["test_aux"] = test_data_aux
    else:
        test_data_aux = None

    # Iterating a dict yields its keys, so the defaults select every dataset
    # that was read above.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(
        params.pop("auxillary_datasets_for_vocab_creation", all_datasets_aux))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    # Validate the auxiliary names the same way; otherwise a misspelled name
    # would silently drop that data from the vocabulary.
    for dataset in datasets_for_vocab_creation_aux:
        if dataset not in all_datasets_aux:
            raise ConfigurationError(
                f"invalid 'auxillary_datasets_for_vocab_creation' {dataset}")

    logger.info(
        "Creating a vocabulary using %s data. Auxillary also included.",
        ", ".join(datasets_for_vocab_creation))
    dataset_primary = Dataset([
        instance for key, dataset in all_datasets.items()
        for instance in dataset.instances
        if key in datasets_for_vocab_creation
    ])
    dataset_aux = Dataset([
        instance for key, dataset in all_datasets_aux.items()
        for instance in dataset.instances
        if key in datasets_for_vocab_creation_aux
    ])
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   dataset_primary,
                                   dataset_aux=dataset_aux)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))

    # Training and validation data are indexed now; test data is indexed only
    # if it is actually evaluated after training.
    train_data.index_instances(vocab)
    train_data_aux.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    if validation_data_aux:
        validation_data_aux.index_instances(vocab)

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    trainer_params = params.pop("trainer")
    trainer = MultiTaskTrainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        iterator_aux=iterator_aux,
        train_dataset=train_data,
        train_dataset_aux=train_data_aux,
        mixing_ratio=mixing_ratio,
        cutoff_epoch=cutoff_epoch,
        validation_dataset=validation_data,
        validation_dataset_aux=validation_data_aux,
        params=trainer_params,
        files_to_archive=params.files_to_archive)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    # Every key must have been consumed by now.
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_on_test:
        test_data_aux.index_instances(vocab)
        evaluate(model, test_data_aux, iterator_aux,
                 cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data_aux:
        logger.info(
            "To evaluate on the auxillary test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model