def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = zipfile.ZipFile(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.namelist()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member_file = archive.open(member_path, 'r')
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = tarfile.open(cached_archive_path, 'r')
    if member_path is None:
        members_list = archive.getnames()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member = archive.getmember(member_path)  # raises an exception if not present
    member_file = cast(IO[bytes], archive.extractfile(member))
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
def __init__(self, file_uri: str, encoding: str = DEFAULT_ENCODING, cache_dir: Optional[str] = None) -> None:
    self.uri = file_uri
    self._encoding = encoding
    self._cache_dir = cache_dir
    self._archive_handle: Any = None  # only set if the file is inside an archive

    main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
    main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

    if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
        self._open_inside_zip(main_file_uri, path_inside_archive)
    elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
        self._open_inside_tar(main_file_uri, path_inside_archive)
    else:  # all the other supported formats, including uncompressed files
        if path_inside_archive:
            raise ValueError('Unsupported archive format: ' + main_file_uri)

        # All the python packages for compressed files share the same interface as io.open
        extension = get_file_extension(main_file_uri)
        package = {
                '.txt': io,
                '.vec': io,
                '.gz': gzip,
                '.bz2': bz2,
                '.lzma': lzma,
        }.get(extension, None)

        if package is None:
            logger.warning('The embeddings file has an unknown file extension "%s". '
                           'We will assume the file is an (uncompressed) text file', extension)
            package = io

        self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

    # To use this with tqdm we'd like to know the number of tokens. It's possible that the
    # first line of the embeddings file contains this: if it does, we want to start iteration
    # from the 2nd line, otherwise we want to start from the 1st.
    # Unfortunately, once we read the first line, we cannot move the file iterator back
    # because the underlying file may be "not seekable"; we use itertools.chain instead.
    first_line = next(self._handle)  # this moves the iterator forward
    self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
    if self.num_tokens:
        # the first line is a header line: start iterating from the 2nd line
        self._iterator = self._handle
    else:
        # the first line is not a header line: start iterating from the 1st line
        self._iterator = itertools.chain([first_line], self._handle)
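# A minimal usage sketch of the class above (hedged: only __init__ and the two
# _open_inside_* helpers are shown here, so we iterate over the `_iterator`
# attribute that __init__ sets up; the public API of the full class presumably
# wraps this. The embeddings URI is hypothetical.)
from tqdm import tqdm

embeddings_file = EmbeddingsTextFile('https://example.com/glove.840B.300d.txt.gz')
for line in tqdm(embeddings_file._iterator, total=embeddings_file.num_tokens):
    token = line.split(' ', 1)[0]  # the token is the first whitespace-separated field
    # ...parse the remaining fields of `line` into a vector here...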
def _read_document(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    if self._file_path is None:
        self._file_path = file_path
    file_path = cached_path(file_path)

    logger.info("Reading GC2012 instances from dataset file at: %s", file_path)
    # See `http://lair.cse.msu.edu/projects/implicit_annotations.html` for details.
    examples = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            examples.append(json.loads(line))

    for example in examples:
        pred_arg_info = defaultdict(list)
        trigger_span = tuple(example["trigger"]["span"])
        for argn in example["arguments"]:
            arguments = example["arguments"][argn]
            # assert len(arguments) == 1
            # expecting just the closest argument to the trigger rather than a whole argument cluster
            for a in arguments:
                argument_span = a["span"]
                pred_arg_info[trigger_span].append((argn, tuple(argument_span)))

        if pred_arg_info == dict():
            # no annotations for this example
            continue

        sentence_start_offsets = []
        sentence_ids = []
        total_tokens = 0
        for s_id, sentence in enumerate(example["sentences"]):
            sentence_start_offsets.append(total_tokens)
            sentence_ids.append(s_id)
            total_tokens += len(sentence)

        doc_id = example["doc_key"]
        genre = 'nw'
        yield self.text_to_instance(example["sentences"], sentence_start_offsets,
                                    pred_arg_info, genre, doc_id)
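# Hedged sketch of one JSON line this reader expects, inferred from the fields it
# accesses above ("trigger", "arguments", "sentences", "doc_key"); the concrete
# values and role names are hypothetical and the real GC2012 export may carry
# additional fields.
_EXAMPLE_GC2012_LINE = {
    "doc_key": "example_doc_0",                      # hypothetical identifier
    "sentences": [["John", "baked", "bread"], ["It", "was", "fresh"]],
    "trigger": {"span": [1, 1]},                     # token span of the predicate
    "arguments": {
        "arg0": [{"span": [0, 0]}],                  # closest mention for each role
        "arg1": [{"span": [2, 2]}],
    },
}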
def _read_document(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    if self._file_path is None:
        self._file_path = file_path
    file_path = cached_path(file_path)

    logger.info("Reading GVDB instances from dataset files at: %s", file_path)
    with open(file_path, "r", encoding="utf-8") as text_file:
        reports = text_file.read().splitlines()

    unique_slots = set()
    reports_skipped = 0
    args_skipped = 0
    input_args_seen = 0
    for i, reportline in enumerate(reports):
        report = json.loads(reportline)
        tokens = []
        token_offset = 0  # word offset wrt the document
        sentence_offsets = []
        text_lens = []
        date = report["date"]
        sentences = report["full_text"]
        spans = report["spans"]

        # During training, we want to skip documents without arguments.
        # During test, we do not. This flag is hard to pass through to
        # the dataset reader, and so instead we directly modify the code.
        # It's possible that it's safe to always append @@UNKNOWN@@, but
        # that will affect model behavior.
        training = False
        if spans == []:
            # No annotations for this report, so skip it
            reports_skipped += 1
            if training:
                continue
            else:
                spans.append([0, 1, "@@UNKNOWN@@", "@@@", ["@@@"]])

        for sentence in sentences:
            sentence_offsets.append(token_offset)
            tokens.extend([word for word in sentence])
            token_offset += len(sentence)
            text_lens.append(len(sentence))

        all_links = []  # elements are (event_type, slot, value) tuples
        # [0, num_tokens) -> [0, num_tokens - 1] to form an inclusive interval
        doc_start, doc_end = 0, len(tokens) - 1
        for link in spans:
            v_start, v_end, slot, gold, _ = link
            v_end -= 1  # right end is exclusive, so make it inclusive
            # the trigger is the entire document
            (t, s, v, g) = ((doc_start, doc_end), slot, (v_start, v_end), gold)
            all_links.append((t, s, v, g))
            unique_slots.add(slot)
            input_args_seen += 1

        text = " ".join(tokens)
        genre = None
        assert len(sentence_offsets) == len(text_lens)
        doc_id = report.get("doc_key", file_path + "_" + str(i))

        doc_link_info = dict()  # elements are {event_type: [(value, slot)]} dicts
        for (t, s, v, g) in all_links:
            if t not in doc_link_info:
                doc_link_info[t] = {t: [(s, v, g)]}
            else:
                doc_link_info[t][t].extend([(s, v, g)])
        doc_link_info = [*doc_link_info.values()]

        # Find out whether the report has any within-sentence value spans (i.e., findable
        # by our model). If it doesn't, skip this report.
        trigger_arg_pairs = [] if doc_link_info is not None else None
        args = [] if doc_link_info is not None else None
        arg_sent_ids = {} if doc_link_info is not None else None
        if doc_link_info is not None:
            trigger_arg_pairs = []
            for frame_data in doc_link_info:
                for trigger_span, argument_data in frame_data.items():
                    for (role, argument_span, gold) in argument_data:
                        trigger_arg_pairs.append(
                            [trigger_span, argument_span, role, gold])
                        arg_sentence_id = self._get_sentence_id(
                            argument_span, sentence_offsets + [float('inf')])
                        if arg_sentence_id == CROSSES_SENTENCE_BOUNDARY:
                            args_skipped += 1
                            continue
                        args.append(argument_span)
                        arg_sent_ids[argument_span] = arg_sentence_id
            if len(args) == 0:
                logger.info("No values for report #{}".format(i))
                reports_skipped += 1
                continue

        yield self.text_to_instance(sentences, sentence_offsets, doc_link_info,
                                    genre, doc_id)

    logger.info("Unique slots: {}".format(sorted(unique_slots)))
    logger.info("Skipped {} reports".format(reports_skipped))
    logger.info("Skipped {} arguments because they crossed a sentence boundary".format(args_skipped))
    logger.info("{} arguments were given in the input documents".format(input_args_seen))
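# Hedged sketch of one GVDB report line as this reader consumes it, inferred from the
# fields accessed above ("date", "full_text", "spans", optional "doc_key"). The span
# layout [value_start, value_end_exclusive, slot, gold, last_field] follows the
# unpacking in the loop above; the slot name, gold string, and the meaning of the
# final list element are hypothetical.
_EXAMPLE_GVDB_LINE = {
    "doc_key": "gvdb_report_0",                       # hypothetical identifier
    "date": "2016-01-01",
    "full_text": [["A", "shooting", "occurred"], ["One", "person", "was", "hurt"]],
    "spans": [[0, 2, "SHOOTER", "unknown", ["unknown"]]],
}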
def _read_from_json(self, file_paths: List[str]) -> Iterator[Instance]:
    for file_path in file_paths:
        # if `file_path` is a URL, redirect to the cache
        self._file_path = file_path
        file_path = cached_path(file_path)

        # NOTE: when not given both gold triggers and gold arguments, set this to True
        # during training and to False during prediction.
        skip_if_no_gold_links = True

        with open(file_path, "r") as f:
            input_ = json.load(f)

        # check whether the document is in the language we expect
        doc_lang = input_["language_id"]
        if doc_lang != self._language:
            logger.info(
                f"Skipping file because it isn't in the target language: {file_path}")
            continue

        sentences = input_["sentences"]
        sentence_start_offsets = []
        total_tokens = 0
        for sentence in sentences:
            sentence_start_offsets.append(total_tokens)
            total_tokens += len(sentence)

        if self._annotation_mode == "events":
            triggers = input_["evt_triggers"]
            # triggers = [[0, total_tokens - 1, t[2]] for t in triggers]  # baseline: full-context trigger

            # Entity mentions and event mentions can be arguments
            arguments = input_["ent_spans"] + input_["evt_triggers"]

            has_gold_links = "gold_evt_links" in input_
            if has_gold_links:
                links = input_["gold_evt_links"]
                # links = [[[0, total_tokens - 1], link[1], link[2]] for link in links]  # baseline: full-context trigger
            else:
                links = []

            if skip_if_no_gold_links and links == []:
                # No links in this training document, so skip it
                logger.info(f"Skipping file because it has no links: {file_path}")
                continue

            recoverable_triggers = [
                [t[0], t[1]] for t in triggers
                if (t[1] - t[0]) + 1 <= self._max_trigger_span_width
            ]
            recoverable_arguments = [
                [a[0], a[1]] for a in arguments
                if (a[1] - a[0]) + 1 <= self._max_arg_span_width
            ]
            recoverable_links = [l for l in links]
            recoverable_links = [
                rl for rl in recoverable_links
                if (self._use_gold_triggers or rl[0] in recoverable_triggers)
                and (self._use_gold_arguments or rl[1] in recoverable_arguments)
            ]
            if skip_if_no_gold_links and len(recoverable_links) == 0:
                logger.info(
                    f"Skipping file because it has no RECOVERABLE links "
                    f"(max trigger width={self._max_trigger_span_width}, "
                    f"max arg width={self._max_arg_span_width}): {file_path}")
                continue
        else:
            raise ValueError(f"unrecognized annotation mode: {self._annotation_mode}")

        # Extract just the spans and not any auxiliary information (e.g., types)
        trigger_spans = tuple((t[0], t[1]) for t in triggers)
        argument_spans = tuple((a[0], a[1]) for a in arguments)

        # Deduplicate spans so that elements of `trigger_spans` and `argument_spans` are unique
        trigger_spans = list(set(trigger_spans))
        argument_spans = list(set(argument_spans))
        links = list(deep_tuple(links))
        assert len(set(tuple(trigger_spans))) == len(trigger_spans)
        assert len(set(tuple(argument_spans))) == len(argument_spans)

        # Annotations might not be in sorted order,
        # so sort by start index, then by end index, then by type
        trigger_spans.sort()
        argument_spans.sort()
        links.sort()

        if self._use_gold_triggers and (not trigger_spans) and (not has_gold_links):
            logger.info(f"No triggers given for this document: {file_path}. Skipping...")
            continue
        if self._use_gold_arguments and (not argument_spans) and (not has_gold_links):
            logger.info(f"No arguments given for this document: {file_path}. Skipping...")
            continue

        doc_trigger_arg_info = defaultdict(list)
        for link in links:
            trigger, argument, role = link
            doc_trigger_arg_info[tuple(trigger)].append(
                (tuple(argument), ROLE_MAP(role)))

        if (not skip_if_no_gold_links) and (len(recoverable_links) == 0):
            for trigger in trigger_spans:
                # Placeholder dummy argument of minimal length so that prediction mode
                # does not crash due to span-width hyperparameter settings.
                # NOTE: this means you should not rely on the automatically reported
                # metrics; rely only on the output of the RAMS scorer.
                doc_trigger_arg_info[tuple(trigger)].append(
                    (tuple([0, 0]), ROLE_MAP("DUMMY_ARGUMENT")))

        genre = input_["doc_key"].split('_')[0]
        doc_id = input_["doc_key"]
        yield self.text_to_instance(sentences, sentence_start_offsets, genre, doc_id,
                                    trigger_spans, argument_spans, doc_trigger_arg_info)
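# Hedged sketch of the per-document JSON this reader expects, inferred from the keys
# it accesses above ("language_id", "sentences", "evt_triggers", "ent_spans",
# "gold_evt_links", "doc_key") and from how it indexes them: triggers and entity
# spans as [start, end, ...] and each gold link as [trigger_span, argument_span, role].
# The trailing type/role annotations inside the spans, the role strings, and all
# concrete values here are hypothetical.
_EXAMPLE_RAMS_STYLE_DOC = {
    "doc_key": "nw_example_0001",          # the prefix before '_' is used as the genre
    "language_id": "eng",
    "sentences": [["Troops", "attacked", "the", "village"], ["Dozens", "fled"]],
    "evt_triggers": [[1, 1, [["conflict.attack", 1.0]]]],
    "ent_spans": [[0, 0, [["attacker", 1.0]]], [2, 3, [["place", 1.0]]]],
    "gold_evt_links": [
        [[1, 1], [0, 0], "evt_attacker"],
        [[1, 1], [2, 3], "evt_place"],
    ],
}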