def __init__(self, tokens: Union[SpanArray, Sequence[SpanArray]],
                 begin_tokens: Union[pd.Series, np.ndarray, Sequence[int]] = None,
                 end_tokens: Union[pd.Series, np.ndarray, Sequence[int]] = None):
        """
        Initialize the array from token information and token-based offsets.

        :param tokens: Character-level span information about the underlying
         tokens. Can be a single set of tokens, covering all spans, or a separate
         `SpanArray` pointer for every span (passed as a sequence or as a
         Numpy array of objects).

        :param begin_tokens: Array of begin offsets measured in tokens
        :param end_tokens: Array of end offsets measured in tokens

        :raises TypeError: If `begin_tokens` or `end_tokens` is not a Series,
         ndarray or list (this includes the default value of `None`), if
         `tokens` is of an unsupported type, or if `tokens` is an ndarray whose
         first element is neither `None` nor a `SpanArray`.
        :raises ValueError: If `begin_tokens` and `end_tokens` differ in length,
         if a single `SpanArray` of tokens covers more than one document, or if
         the number of per-span token arrays does not match the number of begin
         offsets.
        """
        # Superclass constructor expects values for things that the subclass doesn't
        # use.
        super().__init__(_NOT_A_DOCUMENT_TEXT, _EMPTY_INT_ARRAY, _EMPTY_INT_ARRAY)

        if not isinstance(begin_tokens, (pd.Series, np.ndarray, list)):
            raise TypeError(f"begin_tokens is of unsupported type {type(begin_tokens)}. "
                            f"Supported types are Series, ndarray and List[int].")
        if not isinstance(end_tokens, (pd.Series, np.ndarray, list)):
            raise TypeError(f"end_tokens is of unsupported type {type(end_tokens)}. "
                            f"Supported types are Series, ndarray and List[int].")
        # Every span needs both a begin and an end offset; without this check,
        # mismatched inputs would silently produce an inconsistent array.
        if len(begin_tokens) != len(end_tokens):
            raise ValueError(f"Received {len(begin_tokens)} begin offsets and "
                             f"{len(end_tokens)} end offsets. "
                             f"Lengths should be equal.")

        num_spans = len(begin_tokens)

        if isinstance(tokens, SpanArray):
            if not tokens.is_single_document:
                raise ValueError("Token spans come from more than one document.")
            # Can't just pass a SpanArray to np.full() or np.array(), because Numpy will
            # interpret it as an array-like of Span values.
            tokens_array = np.empty(num_spans, dtype=object)
            for i in range(num_spans):
                tokens_array[i] = tokens
            tokens = tokens_array
        elif isinstance(tokens, collections.abc.Sequence):
            if len(tokens) != num_spans:
                raise ValueError(f"Received {len(tokens)} arrays of tokens and "
                                 f"{num_spans} begin offsets. "
                                 f"Lengths should be equal.")
            # Can't just pass a SpanArray to np.array(), because Numpy will interpret it
            # as an array-like of Span values.
            tokens_array = np.empty(num_spans, dtype=object)
            for i, row_tokens in enumerate(tokens):
                tokens_array[i] = row_tokens
            tokens = tokens_array
        elif isinstance(tokens, np.ndarray):
            if len(tokens) != num_spans:
                raise ValueError(f"Received {len(tokens)} arrays of tokens and "
                                 f"{num_spans} begin offsets. "
                                 f"Lengths should be equal.")
            # Only the first row is inspected as a cheap sanity check; None is
            # allowed as an element.
            if (len(tokens) > 0
                    and tokens[0] is not None
                    and not isinstance(tokens[0], SpanArray)):
                raise TypeError(f"Tokens object for row 0 is of unexpected type "
                                f"{type(tokens[0])}. Type should be SpanArray.")
        else:
            raise TypeError(f"Expected SpanArray or list of SpanArray as tokens "
                            f"but got {type(tokens)}")

        # One tokens object per span, stored as a Numpy array of objects
        self._tokens = tokens
        self._begin_tokens = to_int_array(begin_tokens)
        self._end_tokens = to_int_array(end_tokens)
# Example #2
# 0
    def __init__(self, text: Union[str, Sequence[str], np.ndarray,
                                   Tuple[StringTable, np.ndarray]],
                 begins: Union[pd.Series, np.ndarray, Sequence[int]],
                 ends: Union[pd.Series, np.ndarray, Sequence[int]]):
        """
        Initialize the array from target text and character offsets.

        :param text: Target text from which the spans of this array are drawn,
         or a sequence of texts if different spans can have different targets.
         A `(string_table, text_ids)` tuple is also accepted, but that form is
         for internal use only.
        :param begins: Begin offsets of spans (closed)
        :param ends: End offsets (open)

        :raises TypeError: If `begins` or `ends` is not a Series, ndarray or
         list, or if `text` is of an unsupported type.
        :raises ValueError: If `begins` and `ends` differ in length, or if a
         sequence of target texts does not have one entry per span.
        """
        if not isinstance(begins, (pd.Series, np.ndarray, list)):
            raise TypeError(
                f"begins is of unsupported type {type(begins)}. "
                f"Supported types are Series, ndarray and List[int].")
        if not isinstance(ends, (pd.Series, np.ndarray, list)):
            raise TypeError(
                f"ends is of unsupported type {type(ends)}. "
                f"Supported types are Series, ndarray and List[int].")
        if len(begins) != len(ends):
            raise ValueError(
                f"Received {len(begins)} begin offsets and {len(ends)} "
                f"end offsets. Lengths should be equal.")
        begins = to_int_array(begins)
        ends = to_int_array(ends)

        if isinstance(text, str):
            # With a single string, every row gets string ID 0
            string_table = StringTable.create_single(text)  # type: StringTable
            text_ids = np.zeros_like(begins)  # type: np.ndarray
        elif isinstance(text, tuple):
            # INTERNAL USE ONLY: String table specified directly.
            # Note that this branch MUST come before the branch that checks for
            # sequences of strings, because tuples are sequences.
            string_table, text_ids = text
        elif isinstance(text, (collections.abc.Sequence, np.ndarray)):
            # Checked len(begins) == len(ends) earlier, so comparing against
            # begins alone is sufficient.
            if len(text) != len(begins):
                raise ValueError(
                    f"Received {len(text)} target text values and "
                    f"{len(begins)} begin offsets. Lengths should be equal.")
            string_table, text_ids = StringTable.merge_things(text)
        else:
            raise TypeError(
                f"Text argument is of unsupported type {type(text)}")

        # Begin and end offsets in characters
        self._begins = begins  # type: np.ndarray
        self._ends = ends  # type: np.ndarray

        self._string_table = string_table  # type: Union[StringTable, None]
        self._text_ids = text_ids

        # Cached list of other SpanArrays that are exactly the same as this
        # one. Each element is the result of calling id()
        self._equivalent_arrays = []  # type: List[int]

        # Version numbers of elements in self._equivalent_arrays, to ensure that
        # a change hasn't made the arrays no longer equal
        self._equiv_array_versions = []  # type: List[int]

        # Monotonically increasing version number for tracking changes and
        # invalidating caches
        self._version = 0

        # Flag that tells whether to display details of offsets in Jupyter notebooks
        self._repr_html_show_offsets = True  # type: bool