def map_irregular_forms(word_str_ser, can_replace_mask):
    """Map known irregular word forms onto their canonical replacement.

    Every row of ``word_str_ser`` that is exactly equal to one of the listed
    forms (and is still replaceable according to ``can_replace_mask``) is
    rewritten to the dictionary key the form is listed under.  Every row that
    equals a listed form is then cleared in ``can_replace_mask``, whether or
    not it was rewritten.

    Returns
    -------
    tuple
        ``(word_str_ser, can_replace_mask)`` after all forms were processed.
    """
    # canonical replacement -> exact word forms that map onto it
    irregular_forms = {
        "sky": ["sky", "skies"],
        "die": ["dying"],
        "lie": ["lying"],
        "tie": ["tying"],
        "news": ["news"],
        "inning": ["innings", "inning"],
        "outing": ["outings", "outing"],
        "canning": ["cannings", "canning"],
        "howe": ["howe"],
        "proceed": ["proceed"],
        "exceed": ["exceed"],
        "succeed": ["succeed"],
    }

    # Flatten to (form, replacement) pairs, keeping the dictionary order.
    form_pairs = [
        (form, replacement)
        for replacement, form_ls in irregular_forms.items()
        for form in form_ls
    ]

    for form, replacement in form_pairs:
        is_form = word_str_ser == form
        swap_mask = can_replace_mask & is_form

        # NOTE(review): presumably get_stem_series drops the trailing
        # len(form) characters on masked rows and get_str_replacement_series
        # yields `replacement` on masked rows (empty elsewhere), so the
        # concatenation below swaps the whole word -- confirm against helpers.
        head_ser = get_stem_series(word_str_ser, len(form), swap_mask)
        tail_ser = get_str_replacement_series(replacement, swap_mask)
        word_str_ser = head_ser.str.cat(tail_ser)

        # Rows equal to this form are excluded from any later replacement.
        can_replace_mask = can_replace_mask & cudf.logical_not(is_form)

    return word_str_ser, can_replace_mask
def apply_rule(word_str_ser, rule, w_in_c_flag):
    """Apply a single suffix-removal rule to a series of words.

    Parameters
    ----------
    word_str_ser
        String series holding the words to process.
    rule
        3-tuple ``(suffix, replacement, condition)``: the suffix to remove,
        what to put in its place, and the condition that is handed to
        ``get_condition_flag`` together with the candidate stem.  The special
        suffix ``"*d"`` stands for "ends with a double consonant".
    w_in_c_flag
        Boolean mask of the rows that are still eligible for a rule.

    Returns
    -------
    tuple
        ``(word_str_ser, w_in_c_flag)``.  Rows whose suffix matched are
        removed from the returned mask even when the condition failed, so
        that no further rule is tried on them.
    """
    suffix, replacement, condition = rule

    if suffix == "*d":
        doubled_mask = ends_with_double_constant(word_str_ser)

        # For "*d" the condition is evaluated on the word minus its last
        # character (not on a fully stripped stem); the NLTK lambda this
        # mirrors inspects the last character of that intermediate stem.
        intermediate_stem = word_str_ser.str.slice(stop=-1)
        condition_ok = get_condition_flag(intermediate_stem, condition)

        # rows where the replacement actually happens
        apply_mask = doubled_mask & condition_ok & w_in_c_flag
        word_str_ser = replace_suffix(
            word_str_ser, suffix, replacement, apply_mask
        )

        remaining_flag = w_in_c_flag & cudf.logical_not(doubled_mask)
        return word_str_ser, remaining_flag

    has_suffix = ends_with_suffix(word_str_ser, suffix)

    # Strip the suffix on eligible rows only to obtain the stem that the
    # condition is evaluated on.
    candidate_stem = replace_suffix(
        word_str_ser, suffix, "", has_suffix & w_in_c_flag
    )
    condition_ok = get_condition_flag(candidate_stem, condition)

    # rows where the replacement actually happens
    apply_mask = condition_ok & has_suffix & w_in_c_flag
    word_str_ser = replace_suffix(word_str_ser, suffix, replacement, apply_mask)

    # A matching suffix stops further rules for that row.
    remaining_flag = w_in_c_flag & cudf.logical_not(has_suffix)
    return word_str_ser, remaining_flag
def test_series_not(dtype):
    """Check ``cudf.logical_not`` and ``~`` on a Series built from host data.

    Both results are compared with the same operation applied to the host
    (pandas) data the Series was constructed from.
    """
    import pandas as pd

    n_rows = 1000

    # Random booleans cast to the dtype under test.
    host_values = pd.Series(
        np.random.choice([True, False], n_rows)
    ).astype(dtype)

    if dtype is not np.bool_:
        # Scale the 0/1 values so non-boolean dtypes see a range of values.
        scale = (np.random.random(n_rows) * 100).astype(dtype)
        host_values = host_values * scale

    device_series = Series(host_values)

    # logical negation
    got_logical = cudf.logical_not(device_series).to_array()
    want_logical = np.logical_not(host_values)
    np.testing.assert_equal(got_logical, want_logical)

    # invert operator
    got_inverted = (~device_series).to_array()
    np.testing.assert_equal(got_inverted, ~host_values)
def ends_cvc(string_ser, mode="NLTK_EXTENSIONS"):
    """Implements condition *o from the paper

    From the paper:

        *o  - the stem ends cvc, where the second c is not W, X or Y
              (e.g. -WIL, -HOP).

    Parameters
    ----------
    string_ser
        String series holding the stems to test.
    mode
        Stemmer mode.  Only ``"NLTK_EXTENSIONS"`` is implemented.

    Returns
    -------
    Boolean mask, true where the stem satisfies rule 1 or rule 2 below.

    Raises
    ------
    NotImplementedError
        If ``mode`` is anything other than ``"NLTK_EXTENSIONS"``.
    """
    if mode != "NLTK_EXTENSIONS":
        # The original `assert NotImplementedError` was a no-op (the class
        # object is truthy) and made the function silently return None.
        raise NotImplementedError(
            "ends_cvc only supports mode 'NLTK_EXTENSIONS', got {!r}".format(
                mode
            )
        )

    # rule_1, equivalent to NLTK's
    #   len(word) >= 3
    #   and self._is_consonant(word, len(word) - 3)
    #   and not self._is_consonant(word, len(word) - 2)
    #   and self._is_consonant(word, len(word) - 1)
    #   and word[-1] not in ("w", "x", "y")
    len_flag = len_gt_n(string_ser, 2)
    first_consonant = is_consonant(string_ser, -3)
    middle_vowel = is_vowel(string_ser, -2)
    last_consonant = is_consonant(string_ser, -1)

    # Wrap the last characters in a Series so they can be compared
    # element-wise against each excluded letter.
    last_char_ser = cudf.Series(string_ser.str.get(-1))
    last_char_flag = (
        (last_char_ser != "w")
        & (last_char_ser != "x")
        & (last_char_ser != "y")
    )

    rule_1 = (
        len_flag
        & first_consonant
        & middle_vowel
        & last_consonant
        & last_char_flag
    )

    # rule_2 (NLTK extension), equivalent to
    #   len(word) == 2
    #   and not self._is_consonant(word, 0)
    #   and self._is_consonant(word, 1)
    len_flag = len_eq_n(string_ser, 2)
    first_char = cudf.logical_not(is_consonant(string_ser, 0))
    second_char = is_consonant(string_ser, 1)
    rule_2 = len_flag & first_char & second_char

    return rule_1 | rule_2
def _step1a(self, word_str_ser, can_replace_mask=None):
    """Implements Step 1a from "An algorithm for suffix stripping"

    From the paper:

        SSES -> SS                         caresses  ->  caress
        IES  -> I                          ponies    ->  poni
                                           ties      ->  ti
                                           (original implementation only)
        SS   -> SS                         caress    ->  caress
        S    ->                            cats      ->  cat

    Parameters
    ----------
    word_str_ser
        String series holding the words to stem.
    can_replace_mask
        Optional mask of rows that may still be modified; it is passed to
        ``build_can_replace_mask`` together with the series length.

    Returns
    -------
    The first element of the ``apply_rule_list`` result for this step.
    """
    can_replace_mask = build_can_replace_mask(
        len_mask=len(word_str_ser), mask=can_replace_mask
    )

    if self.mode == "NLTK_EXTENSIONS":
        # NLTK-only extension of the original algorithm, so that
        # 'flies' -> 'fli' but 'dies' -> 'die', 'ties' -> 'tie'.
        # Equivalent to: word.endswith('ies') and len(word) == 4
        short_ies_mask = ends_with_suffix(word_str_ser, "ies") & len_eq_n(
            word_str_ser, 4
        )
        word_str_ser = replace_suffix(
            word_str_ser, "ies", "ie", can_replace_mask & short_ies_mask
        )
        # Those rows are done; keep the generic rules below off them.
        can_replace_mask = can_replace_mask & cudf.logical_not(short_ies_mask)

    step_rules = [
        ("sses", "ss", None),  # SSES -> SS
        ("ies", "i", None),  # IES  -> I
        ("ss", "ss", None),  # SS   -> SS
        ("s", "", None),  # S    ->
    ]
    rule_result = apply_rule_list(word_str_ser, step_rules, can_replace_mask)
    return rule_result[0]
def _step5a(self, word_str_ser, can_replace_mask=None):
    """Implements Step 5a from "An algorithm for suffix stripping"

    From the paper:

        Step 5a

        (m>1) E     ->                  probate   ->  probat
                                        rate      ->  rate
        (m=1 and not *o) E ->           cease     ->  ceas

    Parameters
    ----------
    word_str_ser
        String series holding the words to stem.
    can_replace_mask
        Optional mask of rows that may still be modified; it is passed to
        ``build_can_replace_mask`` together with the series length.

    Returns
    -------
    The series with a trailing "e" removed on the rows where either rule
    applies.
    """
    can_replace_mask = build_can_replace_mask(
        len_mask=len(word_str_ser), mask=can_replace_mask
    )

    # Martin's test vocabulary and reference implementations disagree on
    # what to do when two rules match the same suffix but only the second
    # rule's condition holds.  In Step 1b the rules
    #     (m>0) EED -> EE
    #     (*v*) ED  ->
    # come with the example "feed" -> "feed", although (*v*) holds for "fe"
    # and the second rule alone would give "fe".  Here, however, both
    # conditions must be tried (the second rule would otherwise be
    # redundant).  The paper never states this; it has to be inferred from
    # its examples.  That is why the generic rule-list helper is not used.
    #
    # Scalar equivalent:
    #     if word.endswith('e'):
    #         stem = self._replace_suffix(word, 'e', '')
    #         if self._measure(stem) > 1:
    #             return stem                                  # rule 1
    #         if self._measure(stem) == 1 and not self._ends_cvc(stem):
    #             return stem                                  # rule 2
    has_e_mask = ends_with_suffix(word_str_ser, "e")
    candidate_stem = replace_suffix(
        word_str_ser, "e", "", has_e_mask & can_replace_mask
    )

    # rule 1: measure(stem) > 1
    long_measure = measure_gt_n(candidate_stem, 1)

    # rule 2: measure(stem) == 1 and the stem does not end cvc
    unit_measure = measure_eq_n(candidate_stem, 1)
    not_cvc = cudf.logical_not(ends_cvc(candidate_stem))
    unit_measure_not_cvc = unit_measure & not_cvc

    drop_e_mask = (
        (long_measure | unit_measure_not_cvc) & has_e_mask & can_replace_mask
    )
    return replace_suffix(word_str_ser, "e", "", drop_e_mask)
def _step1b(self, word_str_ser, can_replace_mask=None):
    """Implements Step 1b from "An algorithm for suffix stripping"

    From the paper:

        (m>0) EED -> EE                    feed      ->  feed
                                           agreed    ->  agree
        (*v*) ED  ->                       plastered ->  plaster
                                           bled      ->  bled
        (*v*) ING ->                       motoring  ->  motor
                                           sing      ->  sing

    If the second or third of the rules in Step 1b is successful,
    the following is done:

        AT -> ATE                       conflat(ed)  ->  conflate
        BL -> BLE                       troubl(ed)   ->  trouble
        IZ -> IZE                       siz(ed)      ->  size
        (*d and not (*L or *S or *Z))
           -> single letter
                                        hopp(ing)    ->  hop
                                        tann(ed)     ->  tan
                                        fall(ing)    ->  fall
                                        hiss(ing)    ->  hiss
                                        fizz(ed)     ->  fizz
        (m=1 and *o) -> E               fail(ing)    ->  fail
                                        fil(ing)     ->  file

    The rule to map to a single letter causes the removal of one of
    the double letter pair. The -E is put back on -AT, -BL and -IZ,
    so that the suffixes -ATE, -BLE and -IZE can be recognised
    later. This E may be removed in step 4.

    Parameters
    ----------
    word_str_ser
        String series holding the words to stem.
    can_replace_mask
        Optional mask of rows that may still be modified; it is passed to
        ``build_can_replace_mask`` together with the series length.

    Returns
    -------
    The first element of the ``apply_rule_list`` result for the follow-up
    rules.
    """
    can_replace_mask = build_can_replace_mask(
        len_mask=len(word_str_ser), mask=can_replace_mask
    )

    if self.mode == "NLTK_EXTENSIONS":
        # NLTK-only extension of the original algorithm, so that
        # 'spied' -> 'spi' but 'died' -> 'die' etc.
        ied_mask = ends_with_suffix(word_str_ser, "ied")
        short_ied_mask = ied_mask & len_eq_n(word_str_ser, 4)

        # four-letter words: IED -> IE
        word_str_ser = replace_suffix(
            word_str_ser, "ied", "ie", can_replace_mask & short_ied_mask
        )
        can_replace_mask = can_replace_mask & cudf.logical_not(short_ied_mask)

        # every other word ending in "ied": IED -> I
        # (ied_mask was taken before the rewrite above; the rows already
        # handled are excluded through can_replace_mask)
        word_str_ser = replace_suffix(
            word_str_ser, "ied", "i", can_replace_mask & ied_mask
        )
        can_replace_mask = can_replace_mask & cudf.logical_not(ied_mask)

    # (m>0) EED -> EE
    eed_mask = ends_with_suffix(word_str_ser, "eed")
    eed_stem = replace_suffix(
        word_str_ser, "eed", "", eed_mask & can_replace_mask
    )
    eed_apply_mask = measure_gt_n(eed_stem, 0) & eed_mask & can_replace_mask
    word_str_ser = replace_suffix(word_str_ser, "eed", "ee", eed_apply_mask)
    # To stay consistent with NLTK, any word ending in "eed" stops here,
    # whether or not it was rewritten.
    can_replace_mask = can_replace_mask & cudf.logical_not(eed_mask)

    # rule 2: (*v*) ED ->        plastered -> plaster, bled -> bled
    ed_mask = ends_with_suffix(word_str_ser, "ed")
    ed_stem = replace_suffix(word_str_ser, "ed", "", ed_mask & can_replace_mask)
    strip_ed_mask = contains_vowel(ed_stem) & ed_mask & can_replace_mask

    # rule 3: (*v*) ING ->       motoring -> motor, sing -> sing
    ing_mask = ends_with_suffix(word_str_ser, "ing")
    ing_stem = replace_suffix(
        word_str_ser, "ing", "", ing_mask & can_replace_mask
    )
    strip_ing_mask = contains_vowel(ing_stem) & ing_mask & can_replace_mask

    stripped_mask = strip_ed_mask | strip_ing_mask

    # Strip the suffixes only where rule 2 / rule 3 succeeded.
    without_ed = replace_suffix(word_str_ser, "ed", "", strip_ed_mask)
    without_ed_ing = replace_suffix(without_ed, "ing", "", strip_ing_mask)

    # The follow-up rules run only on rows where rule 2 or rule 3 fired.
    can_replace_mask = can_replace_mask & stripped_mask

    def _not_l_s_z(stem):
        # (*d and not (*L or *S or *Z))
        return last_char_not_in(stem, characters=["l", "s", "z"])

    def _measure_one_and_cvc(stem):
        # (m=1 and *o)
        return measure_eq_n(stem, n=1) & ends_cvc(stem)

    follow_up_rules = [
        ("at", "ate", None),  # AT -> ATE
        ("bl", "ble", None),  # BL -> BLE
        ("iz", "ize", None),  # IZ -> IZE
        # double consonant -> single letter; the -1 stands in for NLTK's
        # intermediate_stem[-1]
        ("*d", -1, _not_l_s_z),
        # (m=1 and *o) -> E
        ("", "e", _measure_one_and_cvc),
    ]
    rule_result = apply_rule_list(
        without_ed_ing, follow_up_rules, can_replace_mask
    )
    return rule_result[0]