def test_count():
    assert count((1, 2, 3)) == 3
    assert count([]) == 0
    assert count(iter((1, 2, 3, 4))) == 4
    assert count('hello') == 5
    assert count(iter('hello')) == 5
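# These tests exercise count(), which consumes any iterable, including one-shot
# iterators with no len(), and returns the number of items. A minimal sketch
# consistent with the assertions above (not necessarily the library's actual
# implementation):
def count(seq):
    """Count items in seq, even when seq is an iterator without len()."""
    if hasattr(seq, "__len__"):
        return len(seq)  # cheap path for sized containers like tuples and strings
    return sum(1 for _ in seq)  # otherwise exhaust the iterable and tally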
def load_wili_data(dirpath, iso_lang_map, min_len=25):
    """
    Args:
        dirpath (str)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]

    References:
        https://zenodo.org/record/841984
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    ds = []
    for subset in ("train", "test"):
        text_lines = textacy.io.read_text(
            dirpath.joinpath("x_{}.txt".format(subset)), lines=True)
        lang_lines = textacy.io.read_text(
            dirpath.joinpath("y_{}.txt".format(subset)), lines=True)
        texts = (line.strip() for line in text_lines)
        langs = (line.strip() for line in lang_lines)
        langs_set = set(iso_lang_map.keys())
        ds.extend(
            (text, iso_lang_map[lang])
            for text, lang in zip(texts, langs)
            if lang in langs_set
            and itertoolz.count(char for char in text if char.isalnum()) >= min_len
        )
    return ds
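# Usage sketch for load_wili_data; the directory path and the three-entry
# iso_lang_map below are hypothetical stand-ins for illustration only.
wili_dir = "data/wili-2018"  # hypothetical location of the unpacked WiLI archive
iso_lang_map = {"eng": "en", "deu": "de", "fra": "fr"}  # WiLI label -> ISO 639-1 code
examples = load_wili_data(wili_dir, iso_lang_map, min_len=25)
# examples is a list like [("Ein hinreichend langer Beispieltext ...", "de"), ...]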
def load(self, langs: Set[str], min_len: int = 25) -> List[Tuple[str, str]]:
    """
    Args:
        langs
        min_len: Minimum text length in *chars* for a given example to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    data = []
    fstubs = [
        "dslcc3/train/task1-train.txt",
        "dslcc3/train/task1-dev.txt",
        "dslcc4/DSL-TRAIN.txt",
        "dslcc4/DSL-DEV.txt",
    ]
    for fstub in fstubs:
        filepath = self.data_dir.joinpath(fstub)
        lines = tio.read_text(filepath, mode="rt", encoding="utf-8", lines=True)
        for line in lines:
            if not line.strip():
                continue
            try:
                text, lang = line.split("\t")
                if (
                    lang[:2] in langs
                    and itertoolz.count(c for c in text if c.isalnum()) >= min_len
                ):
                    data.append((text, lang[:2]))
            except Exception:
                LOGGER.debug("bad line in data")
    data = sorted(set(data), key=operator.itemgetter(1))
    LOGGER.info("loaded DSLCCDataset data:\n%s ...", data[:3])
    return data
def load(
    self,
    iso_lang_map: Dict[str, str],
    min_len: int = 25,
) -> List[Tuple[str, str]]:
    """
    Args:
        iso_lang_map
        min_len: Minimum text length in *chars* for a given example to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    data = []
    # we'll combine train/test from individual datasets
    # and instead split on the full, aggregated dataset
    for subset in ("train", "test"):
        text_lines = tio.read_text(
            self.data_dir.joinpath(f"x_{subset}.txt"), lines=True
        )
        lang_lines = tio.read_text(
            self.data_dir.joinpath(f"y_{subset}.txt"), lines=True
        )
        texts = (line.strip() for line in text_lines)
        langs = (line.strip() for line in lang_lines)
        data.extend(
            (text, iso_lang_map[lang])
            for text, lang in zip(texts, langs)
            if lang in iso_lang_map
            and itertoolz.count(char for char in text if char.isalnum()) >= min_len
        )
    LOGGER.info("loaded Wili2018Dataset data:\n%s ...", data[:3])
    return data
def load_twitter_data(dirpath, langs, min_len=25):
    """
    Args:
        dirpath (str)
        langs (Set[str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    raw_tweets = textacy.io.read_json(
        dirpath.joinpath("tweets.jsonl"), mode="rt", lines=True)
    tweets = []
    for tweet in raw_tweets:
        # totally remove any URLs from tweet text
        for url in tweet.get("urls", []):
            for item in url.values():
                tweet["text"] = tweet["text"].replace(item, "")
        tweets.append(tweet)
    ds = [
        (tweet["text"], tweet["lang"])
        for tweet in tweets
        if tweet["lang"] in langs
        and itertoolz.count(char for char in tweet["text"] if char.isalnum()) >= min_len
    ]
    return ds
def load(
    self,
    iso_lang_map: Dict[str, str],
    min_len: int = 25,
) -> List[Tuple[str, str]]:
    """
    Args:
        iso_lang_map
        min_len: Minimum text length in *chars* for a given example to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    rows = tio.read_csv(
        self.data_dir.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    data = [
        (row["text"], iso_lang_map[row["iso-639-3"]])
        for row in rows
        if row["iso-639-3"] in iso_lang_map
        and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
    ]
    LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
    return data
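# Usage sketch, mirroring how load_and_agg_data (below) drives these dataset
# classes; the directory and the two-entry map are hypothetical stand-ins.
tatoeba = textacy.lang_id_._datasets.TatoebaDataset(pathlib.Path("data/tatoeba"))
tatoeba.download(force=False)
data = tatoeba.load({"eng": "en", "spa": "es"}, min_len=25)
# -> [("Some sufficiently long Tatoeba sentence.", "en"), ...]  (contents illustrative)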
def _add_valid_doc(self, doc: Doc) -> None:
    self.docs.append(doc)
    self._doc_ids.append(id(doc))
    self.n_docs += 1
    self.n_tokens += len(doc)
    if doc.is_sentenced:
        self.n_sents += itertoolz.count(doc.sents)
def _add_valid_doc(self, doc: Doc) -> None:
    self.docs.append(doc)
    self._doc_ids.append(id(doc))
    self.n_docs += 1
    self.n_tokens += len(doc)
    if doc.has_annotation("SENT_START"):
        self.n_sents += itertoolz.count(doc.sents)
def _remove_one_doc_by_index(self, idx: int) -> None:
    doc = self.docs[idx]
    self.n_docs -= 1
    self.n_tokens -= len(doc)
    if doc.has_annotation("SENT_START"):
        self.n_sents -= itertoolz.count(doc.sents)
    del self.docs[idx]
    del self._doc_ids[idx]
def _remove_one_doc_by_index(self, idx: int) -> None:
    doc = self.docs[idx]
    self.n_docs -= 1
    self.n_tokens -= len(doc)
    if doc.is_sentenced:
        self.n_sents -= itertoolz.count(doc.sents)
    del self.docs[idx]
    del self._doc_ids[idx]
def n_monosyllable_words(n_syllables_per_word: Tuple[int, ...]) -> int:
    """
    Compute the number of monosyllabic words in a document.

    Args:
        n_syllables_per_word: Number of syllables per word in a given document,
            as computed by :func:`n_syllables_per_word()`.
    """
    return itertoolz.count(ns for ns in n_syllables_per_word if ns == 1)
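# Quick check of the counting behavior, with illustrative inputs: of five words
# with these syllable counts, exactly three are monosyllabic.
assert n_monosyllable_words((1, 3, 1, 2, 1)) == 3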
def n_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are
            extracted; if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    return itertoolz.count(words)
def get_n_sents(doc):
    """
    Get the number of sentences in ``Doc``.

    Args:
        doc (:class:`spacy.tokens.Doc`)

    Returns:
        int
    """
    return itertoolz.count(doc.sents)
def load_and_agg_data(
    root_dirpath: pathlib.Path, force: bool, min_len: int, min_obs: int
) -> List[Tuple[str, str]]:
    """Download, load, and aggregate datasets."""
    iso_lang_resource = textacy.lang_id_._datasets.IsoLangResource(
        root_dirpath.joinpath("iso-639"))
    iso_lang_resource.download(force=force)
    iso_lang_map = iso_lang_resource.load(exclude={"sh"})  # TODO: why exclude sh?
    valid_langs = set(iso_lang_map.values())

    udhr = textacy.datasets.UDHR(root_dirpath.joinpath("udhr"))
    udhr.download(force=force)
    udhr_data = [
        (snippet, meta["lang"])
        for text, meta in udhr.records()
        for snippet in text.split("\n")
        if meta["lang"] in valid_langs
        and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
    ]

    dslcc = textacy.lang_id_._datasets.DSLCCDataset(root_dirpath.joinpath("dslcc"))
    dslcc.download(force=force)
    dslcc_data = dslcc.load(valid_langs, min_len=min_len)

    wili = textacy.lang_id_._datasets.Wili2018Dataset(root_dirpath.joinpath("wili"))
    wili.download(force=force)
    wili_data = wili.load(iso_lang_map, min_len=min_len)

    tatoeba = textacy.lang_id_._datasets.TatoebaDataset(root_dirpath.joinpath("tatoeba"))
    tatoeba.download(force=force)
    tatoeba_data = tatoeba.load(iso_lang_map, min_len=min_len)

    ud = textacy.lang_id_._datasets.UDDataset(root_dirpath.joinpath("ud"))
    ud.download(force=force)
    ud_data = ud.load(valid_langs, min_len=min_len)

    # aggregate and sample datasets
    agg_data = (
        udhr_data
        + wili_data
        + get_random_sample(tatoeba_data, 200000, stratify=True, random_state=42)
        + get_random_sample(ud_data, 200000, stratify=True, random_state=42)
        # add additional examples for hard-to-distinguish language groups
        + get_random_sample(dslcc_data, 50000, stratify=True, random_state=42)
        # add some extra english examples, since there's apparently a fair amount
        # of english sprinkled throughout other languages, causing meh performance
        + get_random_sample(
            [item for item in tatoeba_data if item[1] == "en"],
            10000,
            stratify=False,
            random_state=42,
        )
    )
    agg_data = filter_data_by_lang_count(agg_data, min_obs)
    return agg_data
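# get_random_sample is referenced above but not shown; a plausible sketch of the
# behavior this script expects, assuming stratification groups examples by their
# lang label (item[1]). The name, signature, and per-group sizing are assumptions.
import operator
import random

from cytoolz import itertoolz  # or: from toolz import itertoolz

def get_random_sample(seq, n, stratify=True, random_state=None):
    """Randomly sample up to ``n`` items from ``seq``, optionally per-lang."""
    random.seed(a=random_state)
    if stratify is True:
        # sample (roughly) evenly from each language's pool of examples
        grped = itertoolz.groupby(operator.itemgetter(1), seq)
        n_per_grp = max(int(round(n / len(grped))), 1)
        return list(
            itertoolz.concat(
                random.sample(examples, min(len(examples), n_per_grp))
                for examples in grped.values()
            )
        )
    return random.sample(seq, min(len(seq), n))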
def n_long_words(n_chars_per_word: Tuple[int, ...], min_n_chars: int = 7) -> int:
    """
    Compute the number of long words in a document.

    Args:
        n_chars_per_word: Number of characters per word in a given document,
            as computed by :func:`n_chars_per_word()`.
        min_n_chars: Minimum number of characters required for a word to be
            considered "long".
    """
    return itertoolz.count(nc for nc in n_chars_per_word if nc >= min_n_chars)
def n_unique_words(doc_or_words: Union[Doc, Iterable[Token]]) -> int:
    """
    Compute the number of *unique* words in a document.

    Args:
        doc_or_words: If a spaCy ``Doc``, non-punctuation tokens (words) are
            extracted; if an iterable of spaCy ``Token`` s, all are included as-is.
    """
    words = _get_words(doc_or_words)
    # NOTE: this stdlib solution is slower than itertoolz for docs with ~250+ words
    # so let's take a small hit on short docs for the sake of big wins on long docs
    # return len({word.lower for word in words})
    return itertoolz.count(itertoolz.unique(word.lower for word in words))
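# Equivalence check on plain values (illustrative): itertoolz.unique lazily
# yields only first occurrences, so counting its output matches len(set(...)).
vals = [3, 1, 3, 2, 1]
assert itertoolz.count(itertoolz.unique(vals)) == len(set(vals)) == 3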
def _compute_word_scores(doc, word_occ_vals, word_freqs, stop_words):
    """
    Aggregate values from per-word occurrence values, compute per-word weights
    of several components, then combine components into per-word scores.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        word_occ_vals (Dict[int, Dict[str, list]])
        word_freqs (Dict[int, int])
        stop_words (Set[str])

    Returns:
        Dict[int, float]
    """
    word_weights = collections.defaultdict(dict)
    # compute summary stats for word frequencies
    freqs_nsw = [freq for w_id, freq in word_freqs.items() if w_id not in stop_words]
    freq_max = max(word_freqs.values())
    freq_baseline = statistics.mean(freqs_nsw) + statistics.stdev(freqs_nsw)
    n_sents = itertoolz.count(doc.sents)
    for w_id, vals in word_occ_vals.items():
        freq = word_freqs[w_id]
        word_weights[w_id]["case"] = sum(vals["is_uc"]) / math.log2(1 + freq)
        word_weights[w_id]["pos"] = math.log2(
            math.log2(3 + statistics.mean(vals["sent_idx"])))
        word_weights[w_id]["freq"] = freq / freq_baseline
        word_weights[w_id]["disp"] = len(set(vals["sent_idx"])) / n_sents
        n_unique_lc = len(set(vals["l_context"]))
        n_unique_rc = len(set(vals["r_context"]))
        try:
            wl = n_unique_lc / len(vals["l_context"])
        except ZeroDivisionError:
            wl = 0.0
        try:
            wr = n_unique_rc / len(vals["r_context"])
        except ZeroDivisionError:
            wr = 0.0
        pl = n_unique_lc / freq_max
        pr = n_unique_rc / freq_max
        word_weights[w_id]["rel"] = 1.0 + (wl + wr) * (freq / freq_max) + pl + pr
    # combine individual weights into per-word scores
    word_scores = {
        w_id: (wts["rel"] * wts["pos"])
        / (wts["case"] + (wts["freq"] / wts["rel"]) + (wts["disp"] / wts["rel"]))
        for w_id, wts in word_weights.items()
    }
    return word_scores
def n_sents(doc: Doc) -> int:
    """
    Compute the number of sentences in a document.

    Warning:
        If ``doc`` has not been segmented into sentences, it will be modified
        in-place using spaCy's rule-based ``Sentencizer`` pipeline component
        before counting.
    """
    if not doc.is_sentenced:
        LOGGER.warning(
            "`doc` has not been segmented into sentences; applying spaCy's rule-based, "
            "`Sentencizer` pipeline component to `doc` before counting..."
        )
        doc = _SENTENCIZER(doc)
    return itertoolz.count(doc.sents)
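# Usage sketch; assumes a bare spacy.blank pipeline, which performs no sentence
# segmentation, so the Sentencizer fallback above should kick in.
import spacy

nlp = spacy.blank("en")
doc = nlp("One sentence here. And a second one.")
print(n_sents(doc))  # logs the warning, then likely prints 2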
def n_polysyllable_words(
    n_syllables_per_word: Tuple[int, ...],
    min_n_syllables: int = 3,
) -> int:
    """
    Compute the number of polysyllabic words in a document.

    Args:
        n_syllables_per_word: Number of syllables per word in a given document,
            as computed by :func:`n_syllables_per_word()`.
        min_n_syllables: Minimum number of syllables required for a word to be
            considered "polysyllabic".
    """
    return itertoolz.count(ns for ns in n_syllables_per_word if ns >= min_n_syllables)
def load_udhr_data(dirpath, langs, min_len=25):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        langs (Set[str])
        min_len (int)

    Returns:
        List[Tuple[str, str]]
    """
    ds = textacy.datasets.UDHR(data_dir=dirpath)
    data = [
        (snippet, meta["lang"])
        for text, meta in ds.records()
        for snippet in text.split("\n")
        if meta["lang"] in langs
        and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
    ]
    return data
def load_tatoeba_data(dirpath, iso_lang_map, min_len=25):
    """
    Args:
        dirpath (str or :class:`pathlib.Path`)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *chars*

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    langs = set(iso_lang_map.keys())
    ds = [
        (row["text"], iso_lang_map[row["iso-639-3"]])
        for row in rows
        if row["iso-639-3"] in langs
        and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
    ]
    return ds
def __init__(self, doc):
    self.lang = doc.vocab.lang
    self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
    # get objs for basic count computations
    hyphenator = cache.load_hyphenator(lang=self.lang)
    words = tuple(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    syllables_per_word = tuple(
        len(hyphenator.positions(word.lower_)) + 1 for word in words)
    chars_per_word = tuple(len(word) for word in words)
    # compute basic counts needed for most readability stats
    self.n_words = len(words)
    self.n_unique_words = len({word.lower for word in words})
    self.n_chars = sum(chars_per_word)
    self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
    self.n_syllables = sum(syllables_per_word)
    self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
    self.n_polysyllable_words = sum(1 for spw in syllables_per_word if spw >= 3)
def scake(
    doc: Doc,
    *,
    normalize: Optional[Union[str, Callable[[Token], str]]] = "lemma",
    include_pos: Optional[Union[str, Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the sCAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms. Must be
            sentence-segmented; optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if None, use the form of terms as they appeared in ``doc``;
            if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        include_pos: One or more POS tags with which to filter for good candidate
            keyterms. If None, include tokens of all POS tags (which also allows
            keyterm extraction from docs without POS-tagging).
        topn: Number of top-ranked terms to return as key terms. If an integer,
            represents the absolute number; if a float, value must be in the
            interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding scores.

    References:
        Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
        Aware Keyword Extraction. Information Sciences. 477.
        https://arxiv.org/abs/1811.10831v1
    """
    # validate / transform args
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )
    # bail out on empty docs
    if not doc:
        return []
    # build up a graph of good words, edges weighted by adjacent sentence co-occurrence
    cooc_mat: Counter[Tuple[str, str]] = collections.Counter()
    # handle edge case where doc only has 1 sentence
    n_sents = itertoolz.count(doc.sents)
    for window_sents in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        if n_sents == 1:
            window_sents = (window_sents[0], [])
        window_words: Iterable[str] = (
            word
            for word in itertoolz.concat(window_sents)
            if not (word.is_stop or word.is_punct or word.is_space)
            and (not include_pos or word.pos_ in include_pos)
        )
        window_words = ke_utils.normalize_terms(window_words, normalize)
        cooc_mat.update(
            w1_w2
            for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1]
        )
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []
    graph = nx.Graph()
    graph.add_edges_from(
        (w1, w2, {"weight": weight}) for (w1, w2), weight in cooc_mat.items()
    )
    word_scores = _compute_word_scores(doc, graph, cooc_mat, normalize)
    if not word_scores:
        return []
    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
    if isinstance(topn, float):
        topn = int(round(len(set(candidates)) * topn))
    # rank candidates by aggregating constituent word scores
    candidate_scores = {
        " ".join(candidate): sum(word_scores.get(word, 0.0) for word in candidate)
        for candidate in candidates
    }
    sorted_candidate_scores = sorted(
        candidate_scores.items(), key=operator.itemgetter(1, 0), reverse=True
    )
    return ke_utils.get_filtered_topn_terms(
        sorted_candidate_scores, topn, match_threshold=0.8
    )
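# Usage sketch for scake; the model name and text are illustrative, and the
# scores shown are made-up placeholders, not real output.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Semantic connectivity aware keyword extraction builds a word co-occurrence "
    "graph over adjacent sentences, scores words on that graph, and ranks "
    "candidate terms by their aggregated word scores."
)
print(scake(doc, normalize="lemma", topn=5))
# -> e.g. [("keyword extraction", 0.42), ("word co-occurrence graph", 0.38), ...]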
def get_n_sents(doc: Doc) -> int:
    """Get the number of sentences in ``Doc``."""
    return itertoolz.count(doc.sents)