def forward_on_instances(self,
                         instances: List[Instance],
                         cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
    """
    Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
    arrays using this model's :class:`Vocabulary`, passes those arrays through
    :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
    and returns the result.  Before returning the result, we convert any
    ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
    batched output into a list of individual dicts per instance.

    Note that typically this will be faster on a GPU (and sometimes also on a CPU) than
    repeated calls to :func:`forward_on_instance`.
    """
    dataset = Dataset(instances)
    dataset.index_instances(self.vocab)
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
    outputs = self.decode(self(**model_input))

    instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
    for name, output in list(outputs.items()):
        if isinstance(output, torch.autograd.Variable):
            output = output.data.cpu().numpy()
        outputs[name] = output
        for instance_output, batch_element in zip(instance_separated_output, output):
            instance_output[name] = batch_element
    return instance_separated_output
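# A minimal usage sketch for forward_on_instances (hypothetical caller, not
# part of the method above), assuming `model` is a trained instance of this
# class and `instances` were built by a matching dataset reader;
# cuda_device=-1 requests the CPU.
def _demo_forward_on_instances(model, instances):
    outputs = model.forward_on_instances(instances, cuda_device=-1)
    # One dict per input instance, mapping output names to numpy arrays.
    for instance_output in outputs:
        print(sorted(instance_output.keys()))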
def _yield_one_epoch(self, dataset: Dataset, shuffle: bool):
    grouped_instances = self._create_batches(dataset, shuffle)
    for group in grouped_instances:
        batch = Dataset(group)
        padding_lengths = batch.get_padding_lengths()
        logger.debug("Batch padding lengths: %s", str(padding_lengths))
        logger.debug("Batch size: %d", len(batch.instances))
        yield batch.as_array_dict(padding_lengths, verbose=False)
def token_to_elmo_id(token):
    # Wrap the single token in a TextField so the (module-level) indexer can
    # convert it to ELMo character ids.
    tokens = [Token(token)]
    field = TextField(tokens, {'character_ids': indexer})
    instance = Instance({"elmo": field})
    dataset = Dataset([instance])
    vocab = Vocabulary()
    # Index each instance directly rather than calling
    # dataset.index_instances(vocab), so that there's no progress bar.
    for instance in dataset.instances:
        instance.index_fields(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def _yield_one_epoch(self, dataset: Dataset, shuffle: bool, cuda_device: int, for_training: bool):
    grouped_instances = self._create_batches(dataset, shuffle)
    for group in grouped_instances:
        batch = Dataset(group)
        padding_lengths = batch.get_padding_lengths()
        logger.debug("Batch padding lengths: %s", str(padding_lengths))
        logger.debug("Batch size: %d", len(batch.instances))
        yield batch.as_tensor_dict(padding_lengths,
                                   cuda_device=cuda_device,
                                   for_training=for_training)
def test_elmo_bilm(self):
    # Get the raw data.
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # Load the test model.
    options_file = os.path.join(FIXTURES, 'options.json')
    weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
    elmo_bilm = _ElmoBiLm(options_file, weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2],
                lm_embeddings['mask'])

        # Check the mask lengths.
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # Get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                    numpy.allclose(top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                   expected_top_layer[k],
                                   atol=1.0e-6))
def test_lazy_as_tensor_dict(self):
    lazy_dataset = self.get_lazy_dataset()
    lazy_dataset.index_instances(self.vocab)
    for _ in range(10):
        dataset = Dataset([instance for instance in lazy_dataset])
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].data.cpu().numpy()
        text2 = tensors["text2"]["tokens"].data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(
                text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(
                text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instances.append(Instance({'elmo': field}))

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def batch_to_ids(batch):
    """
    Given a batch (as a list of tokenized sentences), return a batch of padded
    character ids.
    """
    # The indexer was missing here; create it locally so the function is
    # self-contained.
    indexer = ELMoTokenCharactersIndexer()
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instances.append(Instance({"elmo": field}))

    dataset = Dataset(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
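# A small usage sketch for batch_to_ids, with made-up sentences: each
# sentence is a list of token strings, and the result is a padded tensor of
# ELMo character ids with shape (batch_size, max_sentence_length, 50).
def _demo_batch_to_ids():
    batch = [["The", "dog", "barked", "."], ["Hello", "world"]]
    character_ids = batch_to_ids(batch)
    # The second sentence is padded from 2 tokens up to 4.
    print(character_ids.size())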
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    ontonotes_reader = Ontonotes()
    for sentences in tqdm(ontonotes_reader.dataset_document_iterator(file_path)):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_ basis, so we need
                # to adjust them to be relative to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        instance = self.text_to_instance([s.words for s in sentences], canonical_clusters)
        instances.append(instance)

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
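# A toy worked example of the per-sentence -> per-document span shift above
# (hypothetical numbers): a coref span (1, 2) inside the second sentence of a
# document whose first sentence has 5 tokens becomes (6, 7) document-wide.
def _demo_span_offset():
    total_tokens = 5   # tokens in all sentences before the current one
    start, end = 1, 2  # span indices within the current sentence
    return (start + total_tokens, end + total_tokens)  # -> (6, 7)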
def read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in tqdm.tqdm(data_file):
            line = line.strip("\n")

            # skip blank lines
            if not line:
                continue

            tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                               for pair in line.split(self._token_delimiter)]
            # TextField requires ``Token`` objects, so wrap the raw strings.
            tokens = [Token(x[0]) for x in tokens_and_tags]
            tags = [x[1] for x in tokens_and_tags]
            sequence = TextField(tokens, self._token_indexers)
            sequence_tags = SequenceLabelField(tags, sequence)
            instances.append(Instance({'tokens': sequence,
                                       'tags': sequence_tags}))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
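# A tiny sketch of the delimiter parsing above, with a hypothetical "###"
# word/tag delimiter and a space as the token delimiter:
def _demo_split_word_tag():
    line = "dogs###NNS run###VBP"
    tokens_and_tags = [pair.rsplit("###", 1) for pair in line.split(" ")]
    return tokens_and_tags  # -> [['dogs', 'NNS'], ['run', 'VBP']]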
def read_fnc(self, d):
    # Map FNC stance labels onto FEVER-style labels; "unrelated" pairs are skipped.
    label_map = {
            'discuss': "NOT ENOUGH INFO",
            'agree': "SUPPORTS",
            'disagree': "REFUTES",
    }
    instances = []
    for s in tqdm.tqdm(d.stances):
        headline = s['Headline']
        body_id = s['Body ID']
        actual_body = d.articles[body_id]
        label = s['Stance']

        if label != "unrelated":
            new_label = label_map[label]
            hypothesis = headline
            premise = actual_body
            instances.append(self.text_to_instance(premise, hypothesis, new_label))

    if not instances:
        raise ConfigurationError("No instances were read from the given input. "
                                 "Is the data correct?")
    return Dataset(instances)
def _sort_dataset_by_padding(dataset: Dataset,
                             sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
                             padding_noise: float = 0.0) -> Dataset:
    """
    Sorts the ``Instances`` in this ``Dataset`` by their padding lengths, using the keys in
    ``sorting_keys`` (in the order in which they are provided).  ``sorting_keys`` is a list of
    ``(field_name, padding_key)`` tuples.
    """
    instances_with_lengths = []
    for instance in dataset.instances:
        padding_lengths = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
        if padding_noise > 0.0:
            noisy_lengths = {}
            for field_name, field_lengths in padding_lengths.items():
                noisy_lengths[field_name] = add_noise_to_dict_values(field_lengths, padding_noise)
            padding_lengths = noisy_lengths
        instance_with_lengths = ([padding_lengths[field_name][padding_key]
                                  for (field_name, padding_key) in sorting_keys],
                                 instance)
        instances_with_lengths.append(instance_with_lengths)
    instances_with_lengths.sort(key=lambda x: x[0])
    return Dataset([instance_with_lengths[-1]
                    for instance_with_lengths in instances_with_lengths])
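# A hypothetical call to _sort_dataset_by_padding: sort by the number of
# tokens in a TextField named "tokens" (whose padding key is assumed to be
# "num_tokens"), with 10% noise so that ties are broken differently across
# epochs.
def _demo_sort_by_padding(dataset):
    return _sort_dataset_by_padding(dataset,
                                    sorting_keys=[("tokens", "num_tokens")],
                                    padding_noise=0.1)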
def setUp(self):
    token_indexer = SingleIdTokenIndexer("tokens")
    # TextField expects ``Token`` objects, so wrap the raw strings.
    text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                           {"tokens": token_indexer})
    self.instance = Instance({"text": text_field})
    self.dataset = Dataset([self.instance])
    super(TestVocabulary, self).setUp()
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, 'r') as snli_file:
        logger.info("Reading instances from tsv/jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(snli_file):
            if file_path.endswith(".jsonl"):
                # SNLI format
                example = json.loads(line)
                label = example["gold_label"]
                premise = example["sentence1"]
                hypothesis = example["sentence2"]
            else:
                # DGEM/TSV format
                fields = line.split("\t")
                premise = fields[0]
                hypothesis = fields[1]
                # Strip the trailing newline when the label is the last column.
                label = fields[2].strip()
            if label == '-':
                # ignore unknown examples
                continue
            instances.append(self.text_to_instance(premise, hypothesis, label))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path) -> Dataset:
    """
    Read data from the `file_path` and return a :class:`Dataset`.
    """
    # Set up the entity and action trackers.
    et = HCNEntityTracker()
    at = HCNActionTracker(et, file_path)
    action_templates = at.action_templates

    # Get dialogs from the file.
    logger.info("Reading instances from lines in file at: %s", file_path)
    dialogs, dialog_indices = util.read_dialogs(file_path, with_indices=True)
    with open('out/dialog_indices.json', 'w') as f:
        json.dump(dialog_indices, f)

    # Get utterances and their template-id responses.
    utterances = util.get_utterances(file_path, dialogs)
    responses = util.get_responses(file_path, dialogs)
    responses = [self.get_template_id(response, et, action_templates)
                 for response in responses]

    instances = []
    for u, r in zip(utterances, responses):
        instances.append(self.text_to_instance(action_templates, u, r))

    if not instances:
        raise ConfigurationError("No instances read!")
    return Dataset(instances)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, 'r') as snli_file:
        logger.info("Reading SNLI instances from jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(snli_file):
            example = json.loads(line)

            label = example["gold_label"]
            if label == '-':
                # These were cases where the annotators disagreed; we'll just skip them.  It's
                # like 800 out of 500k examples in the training data.
                continue

            premise = example["sentence1"]
            hypothesis = example["sentence2"]
            instances.append(self.text_to_instance(premise, hypothesis, label))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path: str):
    logger.info("Reading FrameNet full text instances from dataset files at: %s", file_path)

    instances = []
    for root, _, directory in tqdm.tqdm(list(os.walk(file_path))):
        for data_file in sorted(directory):
            if not data_file.endswith(".xml"):
                continue
            instances.extend(self.read_single_fulltext_file(os.path.join(root, data_file)))

    logger.info("# instances = %d", len(instances))
    logger.info("# sentences = %d", self._num_sents)
    logger.info("# valid sentences = %d", self._valid_sents)
    logger.info("# avg tokens in sentence = %f", self._total_sentence_length / self._valid_sents)
    logger.info("# discontinuous targets = %d", self._discontinuous_targets)
    logger.info("%% adjacent spans with same label = %f (%d/%d)",
                self._adjacent_labeled_args / self._total_labeled_args,
                self._adjacent_labeled_args,
                self._total_labeled_args)
    self._reset()
    return Dataset(instances)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading file at %s", file_path)

    instances = []
    with open(file_path) as dataset_file:
        document_state = _DocumentState()
        for line in dataset_file:
            if self._begin_document_regex.match(line):
                # We're beginning a document. Refresh the state.
                document_state = _DocumentState()
            elif line.startswith("#end document"):
                # We've finished a document.
                document_state.assert_document_is_finished()
                clusters = document_state.canonicalize_clusters()
                instance = self.text_to_instance(document_state.sentences, clusters)
                instances.append(instance)
            else:
                # Process a line.
                self._handle_line(line, document_state)

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path: str):
    instances = []
    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()

    for instance in tqdm.tqdm(ds.data):
        if instance is None:
            continue

        if not self._sentence_level:
            pages = set(ev[0] for ev in instance["evidence"])
            premise = " ".join([self.db.get_doc_text(p) for p in pages])
        else:
            lines = set([self.get_doc_line(d[0], d[1]) for d in instance['evidence']])
            premise = " ".join(lines)

        if len(premise.strip()) == 0:
            premise = ""

        hypothesis = instance["claim"]
        label = instance["label_text"]
        instances.append(self.text_to_instance(premise, hypothesis, label))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    # open data file and read lines
    with open(file_path, 'r') as ontm_file:
        logger.info("Reading ontology matching instances from jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(ontm_file):
            training_pair = json.loads(line)
            s_ent = training_pair['source_ent']
            t_ent = training_pair['target_ent']
            label = training_pair['label']

            # convert entry to instance and append to instances
            instances.append(self.text_to_instance(s_ent, t_ent, label))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    for sentence in ontonotes_reader.dataset_iterator(file_path):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            instances.append(self.text_to_instance(tokens, verb_label, tags))
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                instances.append(self.text_to_instance(tokens, verb_indicator, tags))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
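# A worked example of the verb-indicator construction above, with made-up
# SRL tags: only the token whose tag ends in "-V" gets a 1.
def _demo_verb_indicator():
    tags = ["B-ARG0", "B-V", "B-ARG1", "O"]
    return [1 if label[-2:] == "-V" else 0 for label in tags]  # -> [0, 1, 0, 0]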
def read(self, file_path: str):
    instances = []
    with open(file_path, 'r') as snli_file:
        logger.info("Reading SNLI instances from jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(snli_file):
            example = json.loads(line)

            label = example["gold_label"]
            if label == '-':
                # These were cases where the annotators disagreed; we'll just skip them.  It's
                # like 800 out of 500k examples in the training data.
                continue
            label_field = LabelField(label)

            premise = example["sentence1"]
            premise_field = TextField(self._tokenizer.tokenize(premise), self._token_indexers)
            hypothesis = example["sentence2"]
            hypothesis_field = TextField(self._tokenizer.tokenize(hypothesis), self._token_indexers)
            instances.append(Instance({'label': label_field,
                                       'premise': premise_field,
                                       'hypothesis': hypothesis_field}))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path: str):
    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens)):
            tokenized_strings.append(tokenized_text[index:index + num_tokens])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    # TODO(matt): this isn't quite right, because you really want to split on sentences,
    # tokenize the sentences, add the start and end tokens per sentence, then change the tokens
    # per instance if desired.  But, we can fix that later, if someone actually wants to use
    # this for language modeling.  This is just another example of how to use the data reader
    # code, for now.
    tokenized_strings = [[self._start_token] + x + [self._end_token] for x in tokenized_strings]

    # No matter how you want to represent the input, we'll always represent the output as a
    # single token id.  This code lets you learn a language model that concatenates word
    # embeddings with character-level encoders, in order to predict the word token that comes
    # next.
    output_indexer = None  # type: Dict[str, TokenIndexer]
    for name, indexer in self._token_indexers.items():
        if isinstance(indexer, SingleIdTokenIndexer):
            output_indexer = {name: indexer}
            break
    else:
        output_indexer = {"tokens": SingleIdTokenIndexer()}

    instances = []
    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], output_indexer)
        instances.append(Instance({'input_tokens': input_field,
                                   'output_tokens': output_field}))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
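# A tiny worked example of the shift-by-one target construction used above,
# with plain strings standing in for Token objects: output_tokens[i] is the
# next-word target for input_tokens[i].
def _demo_lm_shift():
    tokenized_string = ["<S>", "the", "dog", "barked", "</S>"]
    input_tokens = tokenized_string[:-1]   # ["<S>", "the", "dog", "barked"]
    output_tokens = tokenized_string[1:]   # ["the", "dog", "barked", "</S>"]
    return input_tokens, output_tokens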
def read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in tqdm.tqdm(itertools.groupby(data_file, _is_divider)):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in zip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]
                sequence = TextField(tokens, self._token_indexers)

                instance_fields = {'tokens': sequence}

                # Add "feature labels" to instance
                if 'pos' in self.feature_labels:
                    instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
                if 'chunk' in self.feature_labels:
                    instance_fields['chunk_tags'] = SequenceLabelField(chunk_tags, sequence, "chunk_tags")
                if 'ner' in self.feature_labels:
                    instance_fields['ner_tags'] = SequenceLabelField(ner_tags, sequence, "ner_tags")

                # Add "tag label" to instance
                if self.tag_label == 'ner':
                    instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
                elif self.tag_label == 'pos':
                    instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
                elif self.tag_label == 'chunk':
                    instance_fields['tags'] = SequenceLabelField(chunk_tags, sequence)

                instances.append(Instance(instance_fields))

    if not instances:
        raise ConfigurationError("reading {} resulted in an empty Dataset".format(file_path))
    return Dataset(instances)
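# A small sketch of the groupby trick above, with made-up CoNLL-style lines
# and a simplified stand-in for _is_divider: the predicate flips between True
# (blank lines) and False (token lines), so itertools.groupby yields
# alternating divider and sentence chunks.
def _demo_groupby_dividers():
    import itertools
    lines = ["The NNP B-NP B-ORG", "dog NN I-NP O", "", "It PRP B-NP O"]

    def is_blank(line):
        return line.strip() == ""

    # -> [(False, [2 token lines]), (True, ['']), (False, [1 token line])]
    return [(divider, list(chunk))
            for divider, chunk in itertools.groupby(lines, is_blank)]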
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text, _ = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s)[0] for s in instance_strings]

    # No matter how you want to represent the input, we'll always represent the output as a
    # single token id.  This code lets you learn a language model that concatenates word
    # embeddings with character-level encoders, in order to predict the word token that comes
    # next.
    output_indexer = None  # type: Dict[str, TokenIndexer]
    for name, indexer in self._token_indexers.items():
        if isinstance(indexer, SingleIdTokenIndexer):
            output_indexer = {name: indexer}
            break
    else:
        output_indexer = {"tokens": SingleIdTokenIndexer()}

    instances = []
    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], output_indexer)
        instances.append(Instance({'input_tokens': input_field,
                                   'output_tokens': output_field}))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def _create_batches(self, dataset: Dataset, shuffle: bool) -> Iterable[Dataset]:
    instances = dataset.instances
    if shuffle:
        random.shuffle(instances)
    grouped_instances = group_by_count(instances, self._batch_size, None)
    # The last group might not have been full, so we check if any of the
    # instances are None, which is how group_by_count pads non-complete batches.
    grouped_instances[-1] = [instance for instance in grouped_instances[-1]
                             if instance is not None]
    return (Dataset(batch) for batch in grouped_instances)
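# An illustration of why the last group needs filtering, assuming a batch
# size of 2 and five instances (stand-in integers): group_by_count pads the
# final, partial batch with the provided default (None) so all groups are
# the same length.
def _demo_last_group_padding():
    grouped_instances = [[1, 2], [3, 4], [5, None]]  # shape of group_by_count([1, 2, 3, 4, 5], 2, None)
    grouped_instances[-1] = [i for i in grouped_instances[-1] if i is not None]
    return grouped_instances  # -> [[1, 2], [3, 4], [5]]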
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    instances = []
    with open(file_path, 'r') as data_file:
        logger.info("Reading instances from jsonl dataset at: %s", file_path)
        for line in tqdm.tqdm(data_file):
            example = json.loads(line)
            # Avoid shadowing the built-in ``input``.
            input_text = example[self._input]
            label = str(example[self._gold_label])
            instances.append(self.text_to_instance(input_text, label))

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']

    logger.info("Reading the dataset")
    instances = []
    for article in tqdm(dataset):
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)

            for question_answer in paragraph_json['qas']:
                question_text = question_answer["question"].strip().replace("\n", "")
                question_id = question_answer['id'].strip()

                # There may be multiple answer annotations, so we pick the one that occurs the
                # most.  This only matters on the SQuAD dev set, and it means our computed
                # metrics ("start_acc", "end_acc", and "span_acc") aren't quite the same as the
                # official metrics, which look at all of the annotations.  This is why we have
                # a separate official SQuAD metric calculation (the "em" and "f1" metrics use
                # the official script).
                candidate_answers: Counter = Counter()
                for answer in question_answer["answers"]:
                    candidate_answers[(answer["answer_start"], answer["text"])] += 1
                answer_texts = [answer['text'] for answer in question_answer['answers']]
                char_span_start, answer_text = candidate_answers.most_common(1)[0][0]

                instance = self.text_to_instance(question_text,
                                                 paragraph,
                                                 question_id,
                                                 answer_text,
                                                 char_span_start,
                                                 tokenized_paragraph,
                                                 answer_texts)
                instances.append(instance)

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
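# A small worked example of the Counter-based answer selection above, with
# made-up annotations: the (answer_start, text) pair seen most often wins.
def _demo_pick_most_common_answer():
    from collections import Counter
    candidate_answers = Counter()
    for answer_start, text in [(10, "Denver"), (10, "Denver"), (7, "in Denver")]:
        candidate_answers[(answer_start, text)] += 1
    char_span_start, answer_text = candidate_answers.most_common(1)[0][0]
    return char_span_start, answer_text  # -> (10, "Denver")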
def read(self, file_path):
    instances = []
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in tqdm.tqdm(data_file.readlines()):
            line = line.strip("\n")
            if not line:
                continue
            paper_json = json.loads(line)
            title = paper_json['title']
            abstract = paper_json['paperAbstract']
            venue = paper_json['venue']
            instances.append(self.text_to_instance(title, abstract, venue))
    if not instances:
        raise ConfigurationError("No instances read!")
    return Dataset(instances)
def read(self, file_path):
    instances = []
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in tqdm.tqdm(data_file.readlines()):
            line = line.strip("\n")
            if not line:
                continue
            # Each line is "pivot_phrase@@@context_word".
            fields = line.split("@@@")
            pivot_phrase = fields[0]
            context_word = fields[1]
            instances.append(self.text_to_instance(pivot_phrase, context_word))
    if not instances:
        raise ConfigurationError("No instances read!")
    return Dataset(instances)