Example #1
 def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
     archive = zipfile.ZipFile(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.namelist()
         member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
     member_path = cast(str, member_path)
     member_file = archive.open(member_path, 'r')
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
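A minimal standalone sketch of the same pattern, with hypothetical archive and member names (the real code first resolves the archive through cached_path): ZipFile.open returns a binary stream, and io.TextIOWrapper decodes it lazily, so the member is never extracted to disk.

    import io
    import zipfile

    # "embeddings.zip" and "glove.txt" are placeholder names.
    with zipfile.ZipFile("embeddings.zip", "r") as archive:
        member_file = archive.open("glove.txt", "r")                  # binary stream
        text_handle = io.TextIOWrapper(member_file, encoding="utf-8")
        first_line = next(text_handle)                                # read lazily, line by line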
Example #2
 def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
     cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
     archive = tarfile.open(cached_archive_path, 'r')
     if member_path is None:
         members_list = archive.getnames()
         member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
     member_path = cast(str, member_path)
     member = archive.getmember(member_path)   # raises exception if not present
     member_file = cast(IO[bytes], archive.extractfile(member))
     self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
     self._archive_handle = archive
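Both methods defer to _get_the_only_file_in_the_archive, which is not shown above. A plausible standalone sketch, assuming its only job is to reject ambiguous archives (the exact checks and error message are assumptions):

    from typing import Sequence

    def get_the_only_file_in_the_archive(members_list: Sequence[str], archive_path: str) -> str:
        # An archive URI without an explicit member path is only unambiguous
        # when the archive contains a single file.
        if len(members_list) != 1:
            raise ValueError(
                f"The archive {archive_path} contains {len(members_list)} members; "
                "please specify which one to read."
            )
        return members_list[0]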
Example #3
    def __init__(self,
                 file_uri: str,
                 encoding: str = DEFAULT_ENCODING,
                 cache_dir: Optional[str] = None) -> None:

        self.uri = file_uri
        self._encoding = encoding
        self._cache_dir = cache_dir
        self._archive_handle: Any = None   # only if the file is inside an archive

        main_file_uri, path_inside_archive = parse_embeddings_file_uri(file_uri)
        main_file_local_path = cached_path(main_file_uri, cache_dir=cache_dir)

        if zipfile.is_zipfile(main_file_local_path):  # ZIP archive
            self._open_inside_zip(main_file_uri, path_inside_archive)

        elif tarfile.is_tarfile(main_file_local_path):  # TAR archive
            self._open_inside_tar(main_file_uri, path_inside_archive)

        else:  # all the other supported formats, including uncompressed files
            if path_inside_archive:
                raise ValueError('Unsupported archive format: %s' % main_file_uri)

            # All the python packages for compressed files share the same interface of io.open
            extension = get_file_extension(main_file_uri)
            package = {
                    '.txt': io,
                    '.vec': io,
                    '.gz': gzip,
                    '.bz2': bz2,
                    '.lzma': lzma,
                    }.get(extension, None)

            if package is None:
                logger.warning('The embeddings file has an unknown file extension "%s". '
                               'We will assume the file is an (uncompressed) text file', extension)
                package = io

            self._handle = package.open(main_file_local_path, 'rt', encoding=encoding)  # type: ignore

        # To use this with tqdm we'd like to know the number of tokens. It's possible that the
        # first line of the embeddings file contains this: if it does, we want to start iteration
        # from the 2nd line, otherwise we want to start from the 1st.
        # Unfortunately, once we read the first line, we cannot move back the file iterator
        # because the underlying file may be "not seekable"; we use itertools.chain instead.
        first_line = next(self._handle)     # this moves the iterator forward
        self.num_tokens = EmbeddingsTextFile._get_num_tokens_from_first_line(first_line)
        if self.num_tokens:
            # the first line is a header line: start iterating from the 2nd line
            self._iterator = self._handle
        else:
            # the first line is not a header line: start iterating from the 1st line
            self._iterator = itertools.chain([first_line], self._handle)
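The "peek at the first line" trick works for any non-seekable stream. A minimal sketch of the same idea, with a hypothetical header parser that assumes a header of the form "<num_tokens> <dimension>":

    import itertools
    from typing import Iterator, Optional

    def maybe_num_tokens(first_line: str) -> Optional[int]:
        # Hypothetical parser: many embedding files start with "<num_tokens> <dim>".
        fields = first_line.split()
        if 1 <= len(fields) <= 2 and all(field.isdigit() for field in fields):
            return int(fields[0])
        return None  # not a header line

    def iterate_vector_lines(handle: Iterator[str]) -> Iterator[str]:
        first_line = next(handle)                         # consumes one line from the stream
        if maybe_num_tokens(first_line) is not None:
            return handle                                 # header line: skip it
        return itertools.chain([first_line], handle)      # data line: put it back in front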
Example #4
    def _read_document(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        if self._file_path is None:
            self._file_path = file_path
        file_path = cached_path(file_path)
        logger.info("Reading GC2012 instances from dataset file at: %s",
                    file_path)

        # See `http://lair.cse.msu.edu/projects/implicit_annotations.html` for details.

        examples = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                examples.append(json.loads(line))

        for example in examples:
            pred_arg_info = defaultdict(list)
            trigger_span = tuple(example["trigger"]["span"])
            for argn in example["arguments"]:
                arguments = example["arguments"][argn]
                # assert len(arguments) == 1  # expecting just the closest argument to trigger rather than a whole argument cluster
                for a in arguments:
                    argument_span = a["span"]

                    pred_arg_info[trigger_span].append(
                        (argn, tuple(argument_span)))

            if not pred_arg_info:
                # no annotations for this example
                continue

            sentence_start_offsets = []
            sentence_ids = []
            total_tokens = 0
            for s_id, sentence in enumerate(example["sentences"]):
                sentence_start_offsets.append(total_tokens)
                sentence_ids.append(s_id)

                total_tokens += len(sentence)

            doc_id = example["doc_key"]
            genre = 'nw'
            yield self.text_to_instance(example["sentences"],
                                        sentence_start_offsets, pred_arg_info,
                                        genre, doc_id)
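The sentence-offset bookkeeping above (and in Examples #5 and #6) is a running sum of sentence lengths. A small sketch of the same computation using itertools.accumulate, with toy data in the comment for illustration only:

    import itertools
    from typing import List

    def sentence_start_offsets(sentences: List[List[str]]) -> List[int]:
        # The offset of sentence i is the number of tokens in sentences 0..i-1.
        lengths = [len(sentence) for sentence in sentences]
        return [0] + list(itertools.accumulate(lengths))[:-1]

    # sentence_start_offsets([["A", "dog", "barked", "."], ["It", "ran", "."]]) == [0, 4]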
Example #5
    def _read_document(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        if self._file_path is None:
            self._file_path = file_path
        file_path = cached_path(file_path)
        logger.info("Reading GVDB instances from dataset files at: %s",
                    file_path)

        with open(file_path, "r", encoding="utf-8") as text_file:
            reports = text_file.read().splitlines()

        unique_slots = set()
        reports_skipped = 0
        args_skipped = 0
        input_args_seen = 0
        for i, reportline in enumerate(reports):
            report = json.loads(reportline)

            tokens = []
            token_offset = 0  # word offset wrt the document
            sentence_offsets = []
            text_lens = []

            date = report["date"]
            sentences = report["full_text"]
            spans = report["spans"]

            # During training, we want to skip documents without arguments.
            # During test, we do not. This flag is hard to pass through to
            # the dataset reader, and so instead we directly modify the code.
            # It's possible that it's safe to always append @@UNKNOWN@@, but
            # that will affect model behavior.
            training = False

            if spans == []:
                # No annotations for this report, so skip it
                reports_skipped += 1
                if training:
                    continue
                else:
                    spans.append([0, 1, "@@UNKNOWN@@", "@@@", ["@@@"]])

            for sentence in sentences:
                sentence_offsets.append(token_offset)
                tokens.extend(sentence)
                token_offset += len(sentence)
                text_lens.append(len(sentence))

            all_links = []  # elements are (event_type, slot, value) tuples
            # [0, num_tokens) -> [0, num_tokens - 1] to form an inclusive interval
            doc_start, doc_end = 0, len(tokens) - 1
            for link in spans:
                v_start, v_end, slot, gold, _ = link
                v_end -= 1  # right end is exclusive, so make it inclusive
                # the trigger is the entire document
                (t, s, v, g) = ((doc_start, doc_end), slot, (v_start, v_end),
                                gold)
                all_links.append((t, s, v, g))
                unique_slots.add(slot)
                input_args_seen += 1

            text = " ".join(tokens)
            genre = None
            assert len(sentence_offsets) == len(text_lens)
            doc_id = report.get("doc_key", file_path + "_" + str(i))

            doc_link_info = dict()  # elements are {trigger_span: [(slot, value_span, gold)]} dicts
            for (t, s, v, g) in all_links:
                if t not in doc_link_info:
                    doc_link_info[t] = {t: [(s, v, g)]}
                else:
                    doc_link_info[t][t].extend([(s, v, g)])

            doc_link_info = [*doc_link_info.values()]

            # Find if report has any within-sentence value spans (i.e., findable by our model)
            # If it doesn't, skip this report
            trigger_arg_pairs = [] if doc_link_info is not None else None
            args = [] if doc_link_info is not None else None
            arg_sent_ids = {} if doc_link_info is not None else None
            if doc_link_info is not None:
                trigger_arg_pairs = []
                for frame_data in doc_link_info:
                    for trigger_span, argument_data in frame_data.items():
                        for (role, argument_span, gold) in argument_data:
                            trigger_arg_pairs.append(
                                [trigger_span, argument_span, role, gold])
                            arg_sentence_id = self._get_sentence_id(
                                argument_span,
                                sentence_offsets + [float('inf')])
                            if arg_sentence_id == CROSSES_SENTENCE_BOUNDARY:
                                args_skipped += 1
                                continue

                            args.append(argument_span)

                            arg_sent_ids[argument_span] = arg_sentence_id
            if len(args) == 0:
                logger.info("No values for report #{}".format(i))
                reports_skipped += 1
                continue

            yield self.text_to_instance(sentences, sentence_offsets,
                                        doc_link_info, genre, doc_id)
        logger.info("Unique slots: {}".format(sorted(unique_slots)))
        logger.info("Skipped {} reports".format(reports_skipped))
        logger.info("Skipped {} arguments because they crossed a sentence boundary".format(args_skipped))
        logger.info("{} arguments were given in the input documents".format(input_args_seen))
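_get_sentence_id is not shown above. A plausible sketch under the assumptions visible at the call site (the offsets list ends with float('inf'), spans are inclusive, and CROSSES_SENTENCE_BOUNDARY is a sentinel constant; everything else is an assumption):

    import bisect
    from typing import List, Tuple

    CROSSES_SENTENCE_BOUNDARY = -1  # assumed sentinel value

    def get_sentence_id(span: Tuple[int, int], offsets_with_sentinel: List[float]) -> int:
        # offsets_with_sentinel holds the sentence start offsets followed by float('inf').
        # A span belongs to sentence i when both endpoints fall in [offsets[i], offsets[i + 1]).
        start, end = span
        start_sentence = bisect.bisect_right(offsets_with_sentinel, start) - 1
        end_sentence = bisect.bisect_right(offsets_with_sentinel, end) - 1
        if start_sentence != end_sentence:
            return CROSSES_SENTENCE_BOUNDARY
        return start_sentence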
Example #6
    def _read_from_json(self, file_paths: List[str]) -> Iterator[Instance]:
        for file_path in file_paths:
            # if `file_path` is a URL, redirect to the cache
            self._file_path = file_path
            file_path = cached_path(file_path)
            # When not given both gold triggers and gold arguments:
            # set to True during training, False during prediction.
            skip_if_no_gold_links = True

            with open(file_path, "r") as f:
                input_ = json.load(f)

            # check if document is in the language we expect
            doc_lang = input_["language_id"]
            if doc_lang != self._language:
                logger.info(
                    f"Skipping file because it isn't in the target language: {file_path}"
                )
                continue

            sentences = input_["sentences"]

            sentence_start_offsets = []
            total_tokens = 0
            for sentence in sentences:
                sentence_start_offsets.append(total_tokens)
                total_tokens += len(sentence)

            if self._annotation_mode == "events":
                triggers = input_["evt_triggers"]
                #triggers = [[0, total_tokens-1, t[2]] for t in triggers] # baseline: full-context trigger

                # Entity mentions and event mentions can be arguments
                arguments = input_["ent_spans"] + input_["evt_triggers"]

                has_gold_links = "gold_evt_links" in input_

                if has_gold_links:
                    links = input_["gold_evt_links"]
                    #links = [[[0, total_tokens-1], link[1], link[2]] for link in links] # baseline: full-context trigger
                else:
                    links = []

                if skip_if_no_gold_links and links == []:
                    # No links in this training document, so skip it
                    logger.info(
                        f"Skipping file because it has no links: {file_path}")
                    continue

                recoverable_triggers = [
                    [t[0], t[1]] for t in triggers
                    if (t[1] - t[0]) + 1 <= self._max_trigger_span_width
                ]
                recoverable_arguments = [
                    [a[0], a[1]] for a in arguments
                    if (a[1] - a[0]) + 1 <= self._max_arg_span_width
                ]
                recoverable_links = [
                    link for link in links
                    if (self._use_gold_triggers or link[0] in recoverable_triggers)
                    and (self._use_gold_arguments or link[1] in recoverable_arguments)
                ]
                if skip_if_no_gold_links and len(recoverable_links) == 0:
                    logger.info(
                        f"Skipping file because it has no RECOVERABLE links (max trigger width={self._max_trigger_span_width}, max arg width={self._max_arg_span_width}): {file_path}"
                    )
                    continue

            else:
                raise ValueError(
                    f"unrecognized annotation mode: {self._annotation_mode}")

            # Extract just the spans and not any auxiliary information (e.g., types)
            trigger_spans = tuple((t[0], t[1]) for t in triggers)
            argument_spans = tuple((a[0], a[1]) for a in arguments)

            # Deduplicate spans so that elements of `trigger_spans` and `argument_spans` are unique
            trigger_spans = list(set(trigger_spans))
            argument_spans = list(set(argument_spans))
            links = list(deep_tuple(links))

            assert len(set(trigger_spans)) == len(trigger_spans)
            assert len(set(argument_spans)) == len(argument_spans)

            # Annotations might not be in sorted order,
            # so sort by start index, then by end index, then by type
            trigger_spans.sort()
            argument_spans.sort()
            links.sort()

            if self._use_gold_triggers and not trigger_spans and not has_gold_links:
                logger.info(
                    f"No triggers given for this document: {file_path}. Skipping..."
                )
                continue

            if self._use_gold_arguments and not argument_spans and not has_gold_links:
                logger.info(
                    f"No arguments given for this document: {file_path}. Skipping..."
                )
                continue

            doc_trigger_arg_info = defaultdict(list)
            for link in links:
                trigger, argument, role = link
                doc_trigger_arg_info[tuple(trigger)].append(
                    (tuple(argument), ROLE_MAP(role)))

            if (not skip_if_no_gold_links) and (len(recoverable_links) == 0):
                for trigger in trigger_spans:
                    # Placeholder dummy argument of minimal length so that prediction
                    # mode doesn't crash due to span-width hyperparameter settings.
                    # Note: do not rely on the automatically reported metrics here;
                    # rely only on the output of the RAMS scorer.
                    doc_trigger_arg_info[tuple(trigger)].append(
                        (tuple([0, 0]), ROLE_MAP("DUMMY_ARGUMENT")))

            genre = input_["doc_key"].split('_')[0]
            doc_id = input_["doc_key"]

            yield self.text_to_instance(sentences, sentence_start_offsets,
                                        genre, doc_id, trigger_spans,
                                        argument_spans, doc_trigger_arg_info)
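deep_tuple (used above to make the gold links hashable and sortable) is also not shown. A minimal sketch under that assumption:

    from typing import Any

    def deep_tuple(value: Any) -> Any:
        # Recursively convert nested lists into nested tuples so that links can be
        # deduplicated with set(), sorted, and used as dictionary keys.
        if isinstance(value, (list, tuple)):
            return tuple(deep_tuple(item) for item in value)
        return value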