Example #1
def test_count():
    assert count((1, 2, 3)) == 3
    assert count([]) == 0
    assert count(iter((1, 2, 3, 4))) == 4

    assert count('hello') == 5
    assert count(iter('hello')) == 5
Example #2
def test_count():
    assert count((1, 2, 3)) == 3
    assert count([]) == 0
    assert count(iter((1, 2, 3, 4))) == 4

    assert count("hello") == 5
    assert count(iter("hello")) == 5
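For reference, the `count` exercised by these tests (presumably `toolz.itertoolz.count`) behaves like the builtin `len` but also works on one-shot iterators. A minimal sketch consistent with the assertions above, not necessarily the library's actual implementation:

def count(seq):
    """Count the items in ``seq``, even when it is a lazy iterator with no length."""
    try:
        return len(seq)  # fast path for sized containers and strings
    except TypeError:
        return sum(1 for _ in seq)  # fall back to consuming the iterator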
Example #5
def load_wili_data(dirpath, iso_lang_map, min_len=25):
    """
    Args:
        dirpath (str)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]

    References:
        https://zenodo.org/record/841984
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    ds = []
    for subset in ("train", "test"):
        text_lines = textacy.io.read_text(
            dirpath.joinpath("x_{}.txt".format(subset)), lines=True)
        lang_lines = textacy.io.read_text(
            dirpath.joinpath("y_{}.txt".format(subset)), lines=True)
        texts = (line.strip() for line in text_lines)
        langs = (line.strip() for line in lang_lines)
        langs_set = set(iso_lang_map.keys())
        ds.extend(
            (text, iso_lang_map[lang])
            for text, lang in zip(texts, langs)
            if lang in langs_set
            and itertoolz.count(char for char in text if char.isalnum()) >= min_len
        )
    return ds
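A hypothetical invocation (the directory path and ISO-639-3 to ISO-639-1 mapping below are placeholders, assuming the WiLI text files have already been downloaded):

iso_lang_map = {"eng": "en", "deu": "de", "fra": "fr"}  # placeholder mapping
wili_examples = load_wili_data("/data/wili-2018", iso_lang_map, min_len=25)
print(len(wili_examples), wili_examples[:2])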
Example #6
    def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
        """
        Args:
            langs
            min_len: Minimum text length in *chars* for a given example to be included.

        Returns:
            Sequence of (text, lang) examples.
        """
        data = []
        fstubs = [
            "dslcc3/train/task1-train.txt",
            "dslcc3/train/task1-dev.txt",
            "dslcc4/DSL-TRAIN.txt",
            "dslcc4/DSL-DEV.txt",
        ]
        for fstub in fstubs:
            filepath = self.data_dir.joinpath(fstub)
            lines = tio.read_text(filepath, mode="rt", encoding="utf-8", lines=True)
            for line in lines:
                if not line.strip():
                    continue
                try:
                    text, lang = line.split("\t")
                    if (
                        lang[:2] in langs
                        and itertoolz.count(c for c in text if c.isalnum()) >= min_len
                    ):
                        data.append((text, lang[:2]))
                except Exception:
                    LOGGER.debug("bad line in data")
        data = sorted(set(data), key=operator.itemgetter(1))
        LOGGER.info("loaded DSLCCDataset data:\n%s ...", data[:3])
        return data
Example #7
    def load(
        self,
        iso_lang_map: Dict[str, str],
        min_len: int = 25,
    ) -> List[Tuple[str, str]]:
        """
        Args:
            iso_lang_map
            min_len: Minimum text length in *chars* for a given example to be included.

        Returns:
            Sequence of (text, lang) examples.
        """
        data = []
        # we'll combine train/test from individual datasets
        # and instead split on the full, aggregated dataset
        for subset in ("train", "test"):
            text_lines = tio.read_text(
                self.data_dir.joinpath(f"x_{subset}.txt"), lines=True
            )
            lang_lines = tio.read_text(
                self.data_dir.joinpath(f"y_{subset}.txt"), lines=True
            )
            texts = (line.strip() for line in text_lines)
            langs = (line.strip() for line in lang_lines)
            data.extend(
                (text, iso_lang_map[lang])
                for text, lang in zip(texts, langs)
                if lang in iso_lang_map
                and itertoolz.count(char for char in text if char.isalnum()) >= min_len
            )
        LOGGER.info("loaded Wili2018Dataset data:\n%s ...", data[:3])
        return data
Example #8
def load_twitter_data(dirpath, langs, min_len=25):
    """
    Args:
        dirpath (str)
        langs (Set[str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    raw_tweets = textacy.io.read_json(dirpath.joinpath("tweets.jsonl"),
                                      mode="rt",
                                      lines=True)
    tweets = []
    for tweet in raw_tweets:
        # totally remove any URLs from tweet text
        for url in tweet.get("urls", []):
            for item in url.values():
                tweet["text"] = tweet["text"].replace(item, "")
        tweets.append(tweet)
    ds = [
        (tweet["text"], tweet["lang"])
        for tweet in tweets
        if tweet["lang"] in langs
        and itertoolz.count(char for char in tweet["text"] if char.isalnum()) >= min_len
    ]
    return ds
Example #9
    def load(
        self,
        iso_lang_map: Dict[str, str],
        min_len: int = 25,
    ) -> List[Tuple[str, str]]:
        """
        Args:
            iso_lang_map
            min_len: Minimum text length in *chars* for a given example to be included.

        Returns:
            Sequence of (text, lang) examples.
        """
        rows = tio.read_csv(
            self.data_dir.joinpath("sentences.csv"),
            fieldnames=["sent_id", "iso-639-3", "text"],
            delimiter="\t",
            quoting=1,
        )
        data = [
            (row["text"], iso_lang_map[row["iso-639-3"]])
            for row in rows
            if row["iso-639-3"] in iso_lang_map
            and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
        ]
        LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
        return data
Example #10
    def _add_valid_doc(self, doc: Doc) -> None:
        self.docs.append(doc)
        self._doc_ids.append(id(doc))
        self.n_docs += 1
        self.n_tokens += len(doc)
        if doc.is_sentenced:
            self.n_sents += itertoolz.count(doc.sents)
Example #11
    def _add_valid_doc(self, doc: Doc) -> None:
        self.docs.append(doc)
        self._doc_ids.append(id(doc))
        self.n_docs += 1
        self.n_tokens += len(doc)
        if doc.has_annotation("SENT_START"):
            self.n_sents += itertoolz.count(doc.sents)
Example #12
    def _remove_one_doc_by_index(self, idx: int) -> None:
        doc = self.docs[idx]
        self.n_docs -= 1
        self.n_tokens -= len(doc)
        if doc.has_annotation("SENT_START"):
            self.n_sents -= itertoolz.count(doc.sents)
        del self.docs[idx]
        del self._doc_ids[idx]
Example #13
    def _remove_one_doc_by_index(self, idx: int) -> None:
        doc = self.docs[idx]
        self.n_docs -= 1
        self.n_tokens -= len(doc)
        if doc.is_sentenced:
            self.n_sents -= itertoolz.count(doc.sents)
        del self.docs[idx]
        del self._doc_ids[idx]
Example #14
def n_monosyllable_words(n_syllables_per_word: Tuple[int, ...]) -> int:
    """
    Compute the number of monosyllabic words in a document.

    Args:
        n_syllables_per_word: Number of syllables per word in a given document,
            as computed by :func:`n_syllables_per_word()`.
    """
    return itertoolz.count(ns for ns in n_syllables_per_word if ns == 1)
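A quick usage sketch with a toy syllables-per-word tuple:

# two of the five words have exactly one syllable
assert n_monosyllable_words((1, 3, 1, 2, 4)) == 2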
Example #15
def n_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are extracted;
            if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    return itertoolz.count(words)
Example #16
def get_n_sents(doc):
    """
    Get the number of sentences in ``Doc``.

    Args:
        doc (:class:`spacy.tokens.Doc`)

    Returns:
        int
    """
    return itertoolz.count(doc.sents)
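A usage sketch, assuming an English spaCy pipeline such as `en_core_web_sm` is installed and performs sentence segmentation:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("This is one sentence. Here is another.")
assert get_n_sents(doc) == 2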
Example #17
def load_and_agg_data(root_dirpath: pathlib.Path, force: bool, min_len: int,
                      min_obs: int) -> List[Tuple[str, str]]:
    """Download, load, and aggregate datasets."""
    iso_lang_resource = textacy.lang_id_._datasets.IsoLangResource(
        root_dirpath.joinpath("iso-639"))
    iso_lang_resource.download(force=force)
    iso_lang_map = iso_lang_resource.load(exclude={"sh"})  # TODO: why exclude sh?
    valid_langs = set(iso_lang_map.values())

    udhr = textacy.datasets.UDHR(root_dirpath.joinpath("udhr"))
    udhr.download(force=force)
    udhr_data = [
        (snippet, meta["lang"])
        for text, meta in udhr.records()
        for snippet in text.split("\n")
        if meta["lang"] in valid_langs
        and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
    ]

    dslcc = textacy.lang_id_._datasets.DSLCCDataset(
        root_dirpath.joinpath("dslcc"))
    dslcc.download(force=force)
    dslcc_data = dslcc.load(valid_langs, min_len=min_len)

    wili = textacy.lang_id_._datasets.Wili2018Dataset(
        root_dirpath.joinpath("wili"))
    wili.download(force=force)
    wili_data = wili.load(iso_lang_map, min_len=min_len)

    tatoeba = textacy.lang_id_._datasets.TatoebaDataset(
        root_dirpath.joinpath("tatoeba"))
    tatoeba.download(force=force)
    tatoeba_data = tatoeba.load(iso_lang_map, min_len=min_len)

    ud = textacy.lang_id_._datasets.UDDataset(root_dirpath.joinpath("ud"))
    ud.download(force=force)
    ud_data = ud.load(valid_langs, min_len=min_len)

    # aggregate and sample datasets
    agg_data = (
        udhr_data
        + wili_data
        + get_random_sample(tatoeba_data, 200000, stratify=True, random_state=42)
        + get_random_sample(ud_data, 200000, stratify=True, random_state=42)
        # add additional examples for hard-to-distinguish language groups
        + get_random_sample(dslcc_data, 50000, stratify=True, random_state=42)
        # add some extra english examples, since there's apparently a fair amount
        # of english sprinkled throughout other languages, causing meh performance
        + get_random_sample(
            [item for item in tatoeba_data if item[1] == "en"],
            10000,
            stratify=False,
            random_state=42,
        )
    )
    agg_data = filter_data_by_lang_count(agg_data, min_obs)
    return agg_data
Example #18
def n_long_words(n_chars_per_word: Tuple[int, ...],
                 min_n_chars: int = 7) -> int:
    """
    Compute the number of long words in a document.

    Args:
        n_chars_per_word: Number of characters per word in a given document,
            as computed by :func:`n_chars_per_word()`.
        min_n_chars: Minimum number of characters required for a word to be
            considered "long".
    """
    return itertoolz.count(nc for nc in n_chars_per_word if nc >= min_n_chars)
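A quick usage sketch with a toy characters-per-word tuple:

# with the default threshold of 7 characters, two of these words count as "long"
assert n_long_words((3, 9, 12, 5)) == 2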
Example #19
def n_unique_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of *unique* words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are extracted;
            if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    # NOTE: this stdlib solution is slower than itertoolz for docs with ~250+ words
    # so let's take a small hit on short docs for the sake of big wins on long docs
    # return len({word.lower for word in words})
    return itertoolz.count(itertoolz.unique(word.lower for word in words))
Example #20
def _compute_word_scores(doc, word_occ_vals, word_freqs, stop_words):
    """
    Aggregate values from per-word occurrence values, compute per-word weights
    of several components, then combine components into per-word scores.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        word_occ_vals (Dict[int, Dict[str, list]])
        word_freqs (Dict[int, int])
        stop_words (Set[str])

    Returns:
        Dict[int, float]
    """
    word_weights = collections.defaultdict(dict)
    # compute summary stats for word frequencies
    freqs_nsw = [
        freq for w_id, freq in word_freqs.items() if w_id not in stop_words
    ]
    freq_max = max(word_freqs.values())
    freq_baseline = statistics.mean(freqs_nsw) + statistics.stdev(freqs_nsw)
    n_sents = itertoolz.count(doc.sents)
    for w_id, vals in word_occ_vals.items():
        freq = word_freqs[w_id]
        word_weights[w_id]["case"] = sum(vals["is_uc"]) / math.log2(1 + freq)
        word_weights[w_id]["pos"] = math.log2(
            math.log2(3 + statistics.mean(vals["sent_idx"])))
        word_weights[w_id]["freq"] = freq / freq_baseline
        word_weights[w_id]["disp"] = len(set(vals["sent_idx"])) / n_sents
        n_unique_lc = len(set(vals["l_context"]))
        n_unique_rc = len(set(vals["r_context"]))
        try:
            wl = n_unique_lc / len(vals["l_context"])
        except ZeroDivisionError:
            wl = 0.0
        try:
            wr = n_unique_rc / len(vals["r_context"])
        except ZeroDivisionError:
            wr = 0.0
        pl = n_unique_lc / freq_max
        pr = n_unique_rc / freq_max
        word_weights[w_id]["rel"] = 1.0 + (wl + wr) * (freq / freq_max) + pl + pr

    # combine individual weights into per-word scores
    word_scores = {
        w_id: (wts["rel"] * wts["pos"]) /
        (wts["case"] + (wts["freq"] / wts["rel"]) + (wts["disp"] / wts["rel"]))
        for w_id, wts in word_weights.items()
    }
    return word_scores
Example #21
def n_sents(doc: Doc) -> int:
    """
    Compute the number of sentences in a document.

    Warning:
        If ``doc`` has not been segmented into sentences, it will be modified in-place
        using spaCy's rule-based ``Sentencizer`` pipeline component before counting.
    """
    if not doc.is_sentenced:
        LOGGER.warning(
            "`doc` has not been segmented into sentences; applying spaCy's rule-based, "
            "`Sentencizer` pipeline component to `doc` before counting...")
        doc = _SENTENCIZER(doc)
    return itertoolz.count(doc.sents)
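The module-level `_SENTENCIZER` is not shown in this snippet; a minimal stand-in, assuming spaCy's rule-based `Sentencizer` is an acceptable fallback, could be:

from spacy.pipeline import Sentencizer

# assumption: a bare rule-based sentencizer that can be applied directly to a Doc
_SENTENCIZER = Sentencizer()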
Example #22
def n_polysyllable_words(
    n_syllables_per_word: Tuple[int, ...],
    min_n_syllables: int = 3,
) -> int:
    """
    Compute the number of polysyllabic words in a document.

    Args:
        n_syllables_per_word: Number of syllables per word in a given document,
            as computed by :func:`n_syllables_per_word()`.
        min_n_syllables: Minimum number of syllables required for a word to be
            considered "polysyllabic".
    """
    return itertoolz.count(ns for ns in n_syllables_per_word
                           if ns >= min_n_syllables)
Example #23
def load_udhr_data(dirpath, langs, min_len=25):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        langs (Set[str])
        min_len (int)

    Returns:
        List[Tuple[str, str]]
    """
    ds = textacy.datasets.UDHR(data_dir=dirpath)
    data = [
        (snippet, meta["lang"])
        for text, meta in ds.records()
        for snippet in text.split("\n")
        if meta["lang"] in langs
        and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
    ]
    return data
Example #24
def load_tatoeba_data(dirpath, iso_lang_map, min_len=25):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    langs = set(iso_lang_map.keys())
    ds = [
        (row["text"], iso_lang_map[row["iso-639-3"]])
        for row in rows
        if row["iso-639-3"] in langs
        and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
    ]
    return ds
Example #25
    def __init__(self, doc):
        self.lang = doc.vocab.lang
        self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
        # get objs for basic count computations
        hyphenator = cache.load_hyphenator(lang=self.lang)
        words = tuple(
            extract.words(
                doc, filter_punct=True, filter_stops=False, filter_nums=False
            )
        )
        syllables_per_word = tuple(
            len(hyphenator.positions(word.lower_)) + 1 for word in words
        )
        chars_per_word = tuple(len(word) for word in words)
        # compute basic counts needed for most readability stats
        self.n_words = len(words)
        self.n_unique_words = len({word.lower for word in words})
        self.n_chars = sum(chars_per_word)
        self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
        self.n_syllables = sum(syllables_per_word)
        self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
        self.n_polysyllable_words = sum(1 for spw in syllables_per_word if spw >= 3)
Example #26
def scake(
    doc: Doc,
    *,
    normalize: Optional[Union[str, Callable[[Token], str]]] = "lemma",
    include_pos: Optional[Union[str, Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the sCAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms. Must be sentence-segmented;
            optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms; if None,
            use the form of terms as they appeared in ``doc``; if a callable,
            must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        include_pos: One or more POS tags with which to filter for good candidate keyterms.
            If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging).
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding scores.

    References:
        Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
        Aware Keyword Extraction. Information Sciences. 477.
        https://arxiv.org/abs/1811.10831v1
    """
    # validate / transform args
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn))

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, edges weighting by adjacent sentence co-occurrence
    cooc_mat: Counter[Tuple[str, str]] = collections.Counter()
    # handle edge case where doc only has 1 sentence
    n_sents = itertoolz.count(doc.sents)
    for window_sents in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        if n_sents == 1:
            window_sents = (window_sents[0], [])
        window_words: Iterable[str] = (
            word
            for word in itertoolz.concat(window_sents)
            if not (word.is_stop or word.is_punct or word.is_space)
            and (not include_pos or word.pos_ in include_pos)
        )
        window_words = ke_utils.normalize_terms(window_words, normalize)
        cooc_mat.update(
            w1_w2 for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1])
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []

    graph = nx.Graph()
    graph.add_edges_from(
        (w1, w2, {"weight": weight}) for (w1, w2), weight in cooc_mat.items()
    )

    word_scores = _compute_word_scores(doc, graph, cooc_mat, normalize)
    if not word_scores:
        return []

    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by aggregating constituent word scores
    candidate_scores = {
        " ".join(candidate): sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    sorted_candidate_scores = sorted(candidate_scores.items(),
                                     key=operator.itemgetter(1, 0),
                                     reverse=True)
    return ke_utils.get_filtered_topn_terms(sorted_candidate_scores,
                                            topn,
                                            match_threshold=0.8)
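A usage sketch for the function above, assuming a spaCy pipeline that provides sentence boundaries and POS tags (e.g. `en_core_web_sm`):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp(
    "Semantic connectivity between words that co-occur in neighboring sentences "
    "drives the ranking. Candidate terms are scored by summing their word scores."
)
print(scake(doc, topn=5))  # list of (term, score) pairs, highest-scoring first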
Example #27
def get_n_sents(doc: Doc) -> int:
    """Get the number of sentences in ``Doc``."""
    return itertoolz.count(doc.sents)