def add_file_to_archive(self, name: str) -> None:
    """
    Any class in its ``from_params`` method can request that some of its
    input files be added to the archive by calling this method.

    For example, if some class ``A`` had an ``input_file`` parameter, it could call

    ```
    params.add_file_to_archive("input_file")
    ```

    which would store the supplied value for ``input_file`` at the key
    ``previous.history.and.then.input_file``. The ``files_to_archive`` dict
    is shared with child instances via the ``_check_is_dict`` method, so that
    the final mapping can be retrieved from the top-level ``Params`` object.

    NOTE: You must call ``add_file_to_archive`` before you ``pop()``
    the parameter, because the ``Params`` instance looks up the value
    of the filename inside itself.

    If the ``loading_from_archive`` flag is True, this will be a no-op.
    """
    if not self.loading_from_archive:
        self.files_to_archive[f"{self.history}{name}"] = cached_path(self.get(name))
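# Usage sketch (not part of the original source): the NOTE above in practice.
# ``add_file_to_archive`` must run while the parameter is still present,
# because it looks the filename up inside the ``Params`` object itself;
# ``pop()`` comes afterwards. The parameter name here is hypothetical.
params.add_file_to_archive("input_file")  # records the file while it is still in params
input_file = params.pop("input_file")     # now it is safe to remove it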
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    logger.info("Reading the dataset")
    for article in dataset:
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)

            for question_answer in paragraph_json['qas']:
                question_text = question_answer["question"].strip().replace("\n", "")
                answer_texts = [answer['text'] for answer in question_answer['answers']]
                span_starts = [answer['answer_start'] for answer in question_answer['answers']]
                span_ends = [start + len(answer)
                             for start, answer in zip(span_starts, answer_texts)]
                instance = self.text_to_instance(question_text,
                                                 paragraph,
                                                 zip(span_starts, span_ends),
                                                 answer_texts,
                                                 tokenized_paragraph)
                yield instance
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({'input_tokens': input_field,
                        'output_tokens': output_field})
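# Sketch (not part of the original source): why the stride above is
# `num_tokens - 1`. Each chunk holds `tokens_per_instance + 1` tokens, and
# consecutive chunks overlap by exactly one token, so every token (except at
# the file boundary) serves as a prediction target exactly once.
tokens = list("abcdefgh")
num_tokens = 4  # tokens_per_instance + 1
chunks = [tokens[i:i + num_tokens]
          for i in range(0, len(tokens) - num_tokens, num_tokens - 1)]
# chunks == [['a', 'b', 'c', 'd'], ['d', 'e', 'f', 'g']]
# inputs:  "abc", "def"    targets: "bcd", "efg" -- each of b..g predicted once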
def _load_cnn_weights(self):
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    char_embed_dim = cnn_options['embedding']['dim']

    convolutions = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(in_channels=char_embed_dim,
                               out_channels=num,
                               kernel_size=width,
                               bias=True)
        # load the weights
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            weight = fin['CNN']['W_cnn_{}'.format(i)][...]
            bias = fin['CNN']['b_cnn_{}'.format(i)][...]

        w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
        if w_reshaped.shape != tuple(conv.weight.data.shape):
            raise ValueError("Invalid weight file")
        conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
        conv.bias.data.copy_(torch.FloatTensor(bias))

        conv.weight.requires_grad = self.requires_grad
        conv.bias.requires_grad = self.requires_grad

        convolutions.append(conv)
        self.add_module('char_conv_{}'.format(i), conv)

    self._convolutions = convolutions
def __init__(self,
             file_uri: str,
             encoding: str = DEFAULT_ENCODING,
             cache_dir: str = None) -> None:
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None  # only if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        self._open_inside_zip(main_file_uri, path_inside_archive)
    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        self._open_inside_tar(main_file_uri, path_inside_archive)
    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            raise ValueError('Unsupported archive format: %s' % main_file_uri)

        # All the python packages for compressed files share the same interface of io.open
        extension = get_file_extension(main_file_uri)
        package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
                }.get(extension, None)

        if package is None:
            logger.warning('The embeddings file has an unknown file extension "%s". '
                           'We will assume the file is an (uncompressed) text file', extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move back the file iterator
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    first_line = next(self._handle)  # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)
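# Sketch (not part of the original source): the `itertools.chain` trick above,
# in isolation. Once the first line of a possibly non-seekable stream has been
# consumed, chaining it back in front restores iteration over all lines.
import io
import itertools

handle = io.StringIO("0.1 0.2 0.3\n0.4 0.5 0.6\n")
first_line = next(handle)                         # peek: moves the iterator forward
iterator = itertools.chain([first_line], handle)  # "un-read" the first line
assert list(iterator) == ["0.1 0.2 0.3\n", "0.4 0.5 0.6\n"]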
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                if self.ignore_ner_tags:
                    tokens_, pos_tags, chunk_tags = fields[:3]
                    ner_tags = None
                else:
                    tokens_, pos_tags, chunk_tags, ner_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
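# Sketch (not part of the original source): how `itertools.groupby` splits a
# CoNLL-style file into sentences. Consecutive lines with the same predicate
# value form one group, so divider lines separate the sentence chunks (kept)
# from the dividers themselves (skipped). The toy lines below are made up.
import itertools

lines = ["The DT B-NP O", "dog NN I-NP O", "", "It PRP B-NP O", ""]
is_blank = lambda line: line.strip() == ""
sentences = [list(group) for blank, group in itertools.groupby(lines, is_blank) if not blank]
# sentences == [['The DT B-NP O', 'dog NN I-NP O'], ['It PRP B-NP O']]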
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as conllu_file:
        logger.info("Reading UD instances from conllu dataset at: %s", file_path)

        for annotation in lazy_parse(conllu_file.read()):
            # CoNLLU annotations sometimes add back in words that have been elided
            # in the original sentence; we remove these, as we're just predicting
            # dependencies for the original sentence.
            # We filter by None here as elided words have a non-integer word id,
            # and are replaced with None by the conllu python library.
            annotation = [x for x in annotation if x["id"] is not None]

            heads = [x["head"] for x in annotation]
            tags = [x["deprel"] for x in annotation]
            words = [x["form"] for x in annotation]
            if self.use_language_specific_pos:
                pos_tags = [x["xpostag"] for x in annotation]
            else:
                pos_tags = [x["upostag"] for x in annotation]
            yield self.text_to_instance(words, pos_tags, list(zip(tags, heads)))
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info("Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info("Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = ["O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == "-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, tags)
def load_weights(self, weight_file: str) -> None:
    """
    Load the pre-trained weights from the file.
    """
    requires_grad = self.requires_grad

    with h5py.File(cached_path(weight_file), 'r') as fin:
        for i_layer, lstms in enumerate(zip(self.forward_layers, self.backward_layers)):
            for j_direction, lstm in enumerate(lstms):
                # lstm is an instance of LSTMCellWithProjection
                cell_size = lstm.cell_size

                dataset = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell']['Cell%s' % i_layer]['LSTMCell']

                # tensorflow packs together both W and U matrices into one matrix,
                # but pytorch maintains individual matrices. In addition, tensorflow
                # packs the gates as input, memory, forget, output but pytorch
                # uses input, forget, memory, output. So we need to modify the weights.
                tf_weights = numpy.transpose(dataset['W_0'][...])
                torch_weights = tf_weights.copy()

                # split the W from U matrices
                input_size = lstm.input_size
                input_weights = torch_weights[:, :input_size]
                recurrent_weights = torch_weights[:, input_size:]
                tf_input_weights = tf_weights[:, :input_size]
                tf_recurrent_weights = tf_weights[:, input_size:]

                # handle the different gate order convention
                for torch_w, tf_w in [[input_weights, tf_input_weights],
                                      [recurrent_weights, tf_recurrent_weights]]:
                    torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
                    torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

                lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
                lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
                lstm.input_linearity.weight.requires_grad = requires_grad
                lstm.state_linearity.weight.requires_grad = requires_grad

                # the bias weights
                tf_bias = dataset['B'][...]
                # tensorflow adds 1.0 to forget gate bias instead of modifying the
                # parameters...
                tf_bias[(2 * cell_size):(3 * cell_size)] += 1
                torch_bias = tf_bias.copy()
                torch_bias[(1 * cell_size):(2 * cell_size)] = tf_bias[(2 * cell_size):(3 * cell_size)]
                torch_bias[(2 * cell_size):(3 * cell_size)] = tf_bias[(1 * cell_size):(2 * cell_size)]
                lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
                lstm.state_linearity.bias.requires_grad = requires_grad

                # the projection weights
                proj_weights = numpy.transpose(dataset['W_P_0'][...])
                lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
                lstm.state_projection.weight.requires_grad = requires_grad
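# Sketch (not part of the original source): the gate reordering above on a toy
# array. TensorFlow stacks the gate blocks as (input, memory, forget, output);
# PyTorch expects (input, forget, memory, output), so blocks 1 and 2 swap.
import numpy

cell_size = 2
# rows 0-1: input gate, 2-3: memory, 4-5: forget, 6-7: output (TF order)
tf_w = numpy.arange(8, dtype=float).reshape(8, 1)
torch_w = tf_w.copy()
torch_w[1 * cell_size:2 * cell_size] = tf_w[2 * cell_size:3 * cell_size]  # forget -> slot 1
torch_w[2 * cell_size:3 * cell_size] = tf_w[1 * cell_size:2 * cell_size]  # memory -> slot 2
# torch_w rows are now ordered: input, forget, memory, output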
def _read(self, file_path):
    logger.info("Reading instances from lines in file at: %s", file_path)
    with open(cached_path(file_path), "r") as data_file:
        tsv_in = csv.reader(data_file, delimiter='\t')
        for row in tsv_in:
            if len(row) == 4:
                yield self.text_to_instance(premise=row[1],
                                            hypothesis=row[2],
                                            label=row[0])
def _load_char_embedding(self):
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        char_embed_weights = fin['char_embed'][...]

    weights = numpy.zeros(
            (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
            dtype='float32')
    weights[1:, :] = char_embed_weights

    self._char_embedding_weights = torch.nn.Parameter(
            torch.FloatTensor(weights), requires_grad=self.requires_grad)
def _execute_logical_form_on_table(logical_form: str, table: str):
    """
    The parameters are written out to files which the jar file reads and then
    executes the logical form.
    """
    logical_form_filename = os.path.join(SEMPRE_DIR, 'logical_forms.txt')
    with open(logical_form_filename, 'w') as temp_file:
        temp_file.write(logical_form + '\n')

    table_dir = os.path.join(SEMPRE_DIR, 'tsv/')
    os.makedirs(table_dir, exist_ok=True)
    # The .tsv file extension is important here since the table string parameter is in tsv format.
    # If this file was named with suffix .csv then Sempre would interpret it as comma separated
    # and return the wrong denotation.
    table_filename = 'context.tsv'
    with open(os.path.join(table_dir, table_filename), 'w', encoding='utf-8') as temp_file:
        temp_file.write(table)

    # The id, target, and utterance are ignored, we just need to get the
    # table filename into sempre's lisp format.
    test_record = ('(example (id nt-0) (utterance none) (context (graph tables.TableKnowledgeGraph %s))'
                   '(targetValue (list (description "6"))))' % table_filename)
    test_data_filename = os.path.join(SEMPRE_DIR, 'data.examples')
    with open(test_data_filename, 'w') as temp_file:
        temp_file.write(test_record)

    # TODO(matt): The jar that we have isn't optimal for this use case - we're using a
    # script designed for computing accuracy, and just pulling out a piece of it. Writing
    # a new entry point to the jar that's tailored for this use would be cleaner.
    command = ' '.join(['java',
                        '-jar',
                        cached_path(DEFAULT_EXECUTOR_JAR),
                        test_data_filename,
                        logical_form_filename,
                        table_dir])
    run(command, shell=True)

    denotations_file = os.path.join(SEMPRE_DIR, 'logical_forms_denotations.tsv')
    with open(denotations_file) as temp_file:
        line = temp_file.readline().split('\t')

    # Clean up all the temp files generated from this function.
    # Take care to not remove the auxiliary sempre files
    os.remove(logical_form_filename)
    shutil.rmtree(table_dir)
    os.remove(denotations_file)
    os.remove(test_data_filename)
    return line[1] if len(line) > 1 else line[0]
def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = zipfile.ZipFile(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.namelist()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member_file = archive.open(member_path, 'r')
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def test_cached_path(self):
    url = 'http://fake.datastore.com/glove.txt.gz'
    set_up_glove(url, self.glove_bytes)

    # non-existent file
    with pytest.raises(FileNotFoundError):
        filename = cached_path(self.FIXTURES_ROOT / "does_not_exist" / "fake_file.tar.gz")

    # unparsable URI
    with pytest.raises(ValueError):
        filename = cached_path("fakescheme://path/to/fake/file.tar.gz")

    # existing file as path
    assert cached_path(self.glove_file) == str(self.glove_file)

    # caches urls
    filename = cached_path(url, cache_dir=self.TEST_DIR)

    assert len(responses.calls) == 2
    assert filename == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

    with open(filename, 'rb') as cached_file:
        assert cached_file.read() == self.glove_bytes
def from_file(params_file: str, params_overrides: str = "") -> 'Params':
    """
    Load a `Params` object from a configuration file.
    """
    # redirect to cache, if necessary
    params_file = cached_path(params_file)
    ext_vars = dict(os.environ)

    file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

    overrides_dict = parse_overrides(params_overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return Params(param_dict)
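# Usage sketch (not part of the original source): `params_overrides` is a
# JSON-like string whose values take precedence over the file's, via
# `with_fallback(preferred=..., fallback=...)` above. The file name and the
# keys shown here are hypothetical.
params = Params.from_file("experiment.jsonnet",
                          params_overrides='{"trainer": {"num_epochs": 2}}')
# The override wins over whatever `trainer.num_epochs` the file specified.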
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = tarfile.open(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.getnames()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member = archive.getmember(member_path)  # raises exception if not present
    member_file = cast(IO[bytes], archive.extractfile(member))
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path) as atis_file:
        logger.info("Reading ATIS instances from dataset at : %s", file_path)
        for line in _lazy_parse(atis_file.read()):
            utterances = []
            for current_interaction in line['interaction']:
                if not current_interaction['utterance']:
                    continue
                utterances.append(current_interaction['utterance'])
                instance = self.text_to_instance(utterances, current_interaction['sql'])
                if not instance:
                    continue
                yield instance
def _load_projection(self):
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)

    self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
    with h5py.File(cached_path(self._weight_file), 'r') as fin:
        weight = fin['CNN_proj']['W_proj'][...]
        bias = fin['CNN_proj']['b_proj'][...]
        self._projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
        self._projection.bias.data.copy_(torch.FloatTensor(bias))

        self._projection.weight.requires_grad = self.requires_grad
        self._projection.bias.requires_grad = self.requires_grad
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    directory, filename = os.path.split(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)
    for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():
        self._strip_functional_tags(parse)
        # This is un-needed and clutters the label space.
        # All the trees also contain a root S node.
        if parse.label() == "VROOT":
            parse = parse[0]
        pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
        yield self.text_to_instance(parse.leaves(), pos_tags, parse)
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False,
             vocab_to_cache: List[str] = None) -> None:
    super(_ElmoBiLm, self).__init__()

    self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

    self._requires_grad = requires_grad
    if requires_grad and vocab_to_cache:
        logging.warning("You are fine tuning ELMo and caching char CNN word vectors. "
                        "This behaviour is not guaranteed to be well defined, particularly "
                        "if not all of your inputs will occur in the vocabulary cache.")
    # This is an embedding, used to look up cached
    # word vectors built from character level cnn embeddings.
    self._word_embedding = None
    self._bos_embedding: torch.Tensor = None
    self._eos_embedding: torch.Tensor = None
    if vocab_to_cache:
        logging.info("Caching character cnn layers for words in vocabulary.")
        # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
        # They are set in the method so they can be accessed from outside the
        # constructor.
        self.create_cached_cnn_embeddings(vocab_to_cache)

    with open(cached_path(options_file), 'r') as fin:
        options = json.load(fin)
    if not options['lstm'].get('use_skip_connections'):
        raise ConfigurationError('We only support pretrained biLMs with residual connections')
    self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                               hidden_size=options['lstm']['projection_dim'],
                               cell_size=options['lstm']['dim'],
                               num_layers=options['lstm']['n_layers'],
                               memory_cell_clip_value=options['lstm']['cell_clip'],
                               state_projection_clip_value=options['lstm']['proj_clip'],
                               requires_grad=requires_grad)
    self._elmo_lstm.load_weights(weight_file)
    # Number of representation layers including context independent layer
    self.num_layers = options['lstm']['n_layers'] + 1
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line_num, line in enumerate(data_file):
            line = line.strip("\n")

            if not line:
                continue

            line_parts = line.split('\t')
            if len(line_parts) != 2:
                raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
            source_sequence, target_sequence = line_parts
            yield self.text_to_instance(source_sequence, target_sequence)
def _create_sempre_executor(self) -> None:
    """
    Creates a server running SEMPRE that we can send logical forms to for evaluation.
    This uses inter-process communication, because SEMPRE is java code. We also need to
    be careful to clean up the process when our program exits.
    """
    if self._executor_process:
        return

    # It'd be much nicer to just use `cached_path` for these files. However, the SEMPRE jar
    # that we're using expects to find these files in a particular location, so we need to make
    # sure we put the files in that location.
    os.makedirs(SEMPRE_DIR, exist_ok=True)
    abbreviations_path = os.path.join(SEMPRE_DIR, 'abbreviations.tsv')
    if not os.path.exists(abbreviations_path):
        subprocess.run(f'wget {ABBREVIATIONS_FILE}', shell=True)
        subprocess.run(f'mv wikitables-abbreviations.tsv {abbreviations_path}', shell=True)

    grammar_path = os.path.join(SEMPRE_DIR, 'grow.grammar')
    if not os.path.exists(grammar_path):
        subprocess.run(f'wget {GROW_FILE}', shell=True)
        subprocess.run(f'mv wikitables-grow.grammar {grammar_path}', shell=True)

    args = ['java', '-jar', cached_path(SEMPRE_EXECUTOR_JAR), 'serve', self._table_directory]
    self._executor_process = subprocess.Popen(args,
                                              stdin=subprocess.PIPE,
                                              stdout=subprocess.PIPE,
                                              bufsize=1)

    lines = []
    for _ in range(6):
        # SEMPRE outputs six lines of stuff when it loads that I can't disable. So, we clear
        # that here.
        lines.append(str(self._executor_process.stdout.readline()))
    assert 'Parser' in lines[-1], "SEMPRE server output unexpected; the server may have changed"
    logger.info("Started SEMPRE server for evaluating logical forms")

    # This is supposed to ensure that the subprocess gets killed when python exits.
    atexit.register(self._stop_sempre_executor)
def __init__(self,
             encoder: Dict[str, int] = None,
             byte_pairs: List[Tuple[str, str]] = None,
             n_ctx: int = 512,
             model_path: str = None) -> None:
    too_much_information = model_path and (encoder or byte_pairs)
    too_little_information = not model_path and not (encoder and byte_pairs)

    if too_much_information or too_little_information:
        raise ConfigurationError("must specify either model path or (encoder + byte_pairs) but not both")

    if model_path:
        model_path = cached_path(model_path)

        # Load encoder and byte_pairs from tar.gz
        with tarfile.open(model_path) as tmp:
            encoder_name = next(m.name for m in tmp.getmembers() if 'encoder_bpe' in m.name)
            encoder_info = tmp.extractfile(encoder_name)

            if encoder_info:
                encoder = json.loads(encoder_info.read())
            else:
                raise ConfigurationError(f"expected encoder_bpe file in archive {model_path}")

            bpe_name = next(m.name for m in tmp.getmembers() if m.name.endswith('.bpe'))
            bpe_info = tmp.extractfile(bpe_name)

            if bpe_info:
                # First line is "version", last line is blank
                lines = bpe_info.read().decode('utf-8').split('\n')[1:-1]

                # Convert "b1 b2" -> (b1, b2)
                byte_pairs = [tuple(line.split()) for line in lines]  # type: ignore
            else:
                raise ConfigurationError(f"expected .bpe file in archive {model_path}")

    self.encoder = encoder
    self.decoder = {word_id: word for word, word_id in self.encoder.items()}

    # Compute ranks
    self.bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}

    self.cache: Dict[str, List[str]] = {}
    self.n_ctx = n_ctx
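# Sketch (not part of the original source): what `bpe_ranks` encodes. The
# merge list in the .bpe file is ordered by priority, so a lower index means
# that pair is merged earlier during tokenization. The pairs below are made up.
byte_pairs = [('t', 'h'), ('th', 'e</w>'), ('a', 'n')]
bpe_ranks = {pair: idx for idx, pair in enumerate(byte_pairs)}
best = min([('a', 'n'), ('th', 'e</w>')], key=lambda pair: bpe_ranks[pair])
# best == ('th', 'e</w>'): it appears earlier in the merge list, so it wins.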
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            line = line.strip("\n")

            # skip blank lines
            if not line:
                continue

            tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                               for pair in line.split(self._token_delimiter)]
            tokens = [Token(token) for token, tag in tokens_and_tags]
            tags = [tag for token, tag in tokens_and_tags]
            yield self.text_to_instance(tokens, tags)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    logger.info("Reading file at %s", file_path)
    with open(file_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    logger.info("Reading the dataset")
    for article in dataset:
        for paragraph_json in article['paragraphs']:
            paragraph = paragraph_json["context"]
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)
            qas = paragraph_json['qas']
            metadata = {}
            metadata["instance_id"] = [qa['id'] for qa in qas]
            question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
            answer_texts_list = [[answer['text'] for answer in qa['answers']] for qa in qas]
            metadata["question"] = question_text_list
            metadata['answer_texts_list'] = answer_texts_list
            span_starts_list = [[answer['answer_start'] for answer in qa['answers']] for qa in qas]
            span_ends_list = []
            for answer_starts, an_list in zip(span_starts_list, answer_texts_list):
                span_ends = [start + len(answer)
                             for start, answer in zip(answer_starts, an_list)]
                span_ends_list.append(span_ends)
            yesno_list = [str(qa['yesno']) for qa in qas]
            followup_list = [str(qa['followup']) for qa in qas]
            instance = self.text_to_instance(question_text_list,
                                             paragraph,
                                             span_starts_list,
                                             span_ends_list,
                                             tokenized_paragraph,
                                             yesno_list,
                                             followup_list,
                                             metadata)
            yield instance
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as snli_file:
        logger.info("Reading SNLI instances from jsonl dataset at: %s", file_path)
        for line in snli_file:
            example = json.loads(line)

            label = example["gold_label"]
            if label == '-':
                # These were cases where the annotators disagreed; we'll just skip them. It's
                # like 800 out of 500k examples in the training data.
                continue

            premise = example["sentence1"]
            hypothesis = example["sentence2"]
            yield self.text_to_instance(premise, hypothesis, label)
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file.readlines():
            line = line.strip("\n")
            if not line:
                continue
            parsed_line = Tree.fromstring(line)
            if self._use_subtrees:
                for subtree in parsed_line.subtrees():
                    instance = self.text_to_instance(subtree.leaves(), subtree.label())
                    if instance is not None:
                        yield instance
            else:
                instance = self.text_to_instance(parsed_line.leaves(), parsed_line.label())
                if instance is not None:
                    yield instance
def __init__(self,
             options_file: str,
             weight_file: str,
             requires_grad: bool = False) -> None:
    super(_ElmoCharacterEncoder, self).__init__()

    with open(cached_path(options_file), 'r') as fin:
        self._options = json.load(fin)
    self._weight_file = weight_file

    self.output_dim = self._options['lstm']['projection_dim']
    self.requires_grad = requires_grad

    self._load_weights()

    # Cache the arrays for use in forward -- +1 due to masking.
    self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters: DefaultDict[int, List[Tuple[int, int]]] = collections.defaultdict(list)

        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)
    with open(file_path) as input_file:
        for line in input_file:
            if line.startswith("(<"):
                # Each leaf looks like
                # (<L ccg_category modified_pos original_pos token predicate_arg_category>)
                leaves = re.findall("<L (.*?)>", line)

                # Use magic unzipping trick to split into tuples
                tuples = zip(*[leaf.split() for leaf in leaves])

                # Convert to lists and assign to variables.
                ccg_categories, modified_pos_tags, original_pos_tags, tokens, predicate_arg_categories = \
                        [list(result) for result in tuples]

                yield self.text_to_instance(tokens,
                                            ccg_categories,
                                            modified_pos_tags,
                                            original_pos_tags,
                                            predicate_arg_categories)
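# Sketch (not part of the original source): the "magic unzipping trick" used
# above. `zip(*rows)` transposes a list of per-leaf field lists into per-field
# columns. The toy leaves below are made up.
rows = [['N', 'NN', 'NN', 'dog', 'N'],
        ['S\\N', 'VBZ', 'VBZ', 'barks', 'S\\N']]
columns = [list(col) for col in zip(*rows)]
# columns[0] == ['N', 'S\\N']      (ccg categories)
# columns[3] == ['dog', 'barks']   (tokens)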