def sub_split(phrase1):
    """ split u"I/We/They… always/usually/often/sometimes…" into sub strs,
        e.g. "I... always...". """
    source_group = SplitBlockGroup.extract(phrase1)
    result_groups = [SplitBlockGroup()]
    skipped_idxes = []
    for idx, block in enumerate(source_group):
        # Blocks consumed by a previous slash-expansion are skipped here.
        if idx in skipped_idxes:
            continue
        skipped_idxes = [idx]
        # `check_slash` may extend `skipped_idxes` with the alternative
        # blocks around a "/" (project helper; mutates its third argument).
        check_slash(block, idx, skipped_idxes)
        expanded_groups = []
        has_alternatives = len(skipped_idxes) > 1
        for base_group in result_groups:
            for alt_idx in skipped_idxes:
                candidate = SplitBlockGroup(base_group)  # make a dup
                alt_block = source_group[alt_idx]
                # When splitting on "/", only letter blocks count as alternatives.
                if has_alternatives and (not alt_block.is_letter):
                    continue
                candidate.append(alt_block)
                # strip blank into 1
                for member in candidate:
                    if z(member) and member.is_blank and (len(member) > 1):
                        member.string = u' '
                expanded_groups.append(candidate.concat_items())
        if len(expanded_groups):
            result_groups = expanded_groups
    return result_groups
def recursive_match(self, matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next=None):
    """ Walk the split-block chain from `sb1_current`, matching it against one
        level of the phrase tree (`key1_dict_current`).

        matched_strs__to__phrase -- dict, filled in place: Phrase -> SplitBlockGroup.
        sb1_current              -- current split block; advanced via its `.n_sb` link.
        sb1_list_current         -- blocks consumed so far on this match path (mutated).
        key1_current / key1_next -- current and next lemmatized keys in the tree;
                                    `dots` is the wildcard key.

        NOTE(review): recursion happens through `self.recursive_match_sub`
        (not visible here) — presumably it re-enters this method one tree
        level deeper; confirm against its definition.
    """
    if self.inspect:
        print "#" * 30, "[key1_current]", key1_current, "[key1_next]", key1_next
    key1_dict_next = None if key1_next is None else key1_dict_current[key1_next]
    phrase_current = ([p1 for p1 in key1_dict_current if isinstance(p1, Phrase)] or [None])[0]  # every level has only one phrase.

    def params_with(k1):
        """ encapsulate parameters in the current scope. """
        # Copies `sb1_list_current` so deeper levels don't mutate this path.
        return [key1_dict_next, key1_next, "placeholder", matched_strs__to__phrase] + \
               [k1] + \
               [SplitBlockGroup(sb1_list_current), key1_dict_current, key1_current]

    # Wildcard edge: descend immediately without consuming a block first.
    if key1_next == dots:
        self.recursive_match_sub(*params_with(sb1_current))
    while sb1_current.n_sb:
        sb1_current = sb1_current.n_sb  # direct to next, cause current is appended to `sb1_list_current`
        # sb1_current_str = sb1_current.utf8low()
        sb1_list_current.append(sb1_current)
        if self.inspect:
            print "[sb1_current]", "\"%s\"" % sb1_current
        if self.inspect:
            print "len [sb1_list_current]", len(sb1_list_current)
        if phrase_current:
            # A wildcard match only completes on punctuation (other than "'")
            # or at the end of the chain; a literal match completes anywhere.
            is_ender = ((sb1_current.is_other and (sb1_current.utf8low() not in ["'"])) or (sb1_current.n_sb is None))
            if ((key1_current == dots) and is_ender) or \
               (key1_current != dots):
                matched_strs__to__phrase[phrase_current] = SplitBlockGroup(sb1_list_current)  # make a copy
                if self.inspect:
                    print
                    print "[end candidate_split_block_s loop : sb1_list_current]", sb1_list_current
                    print
        if key1_next is None:
            break
        if key1_current == dots:
            # Inside a wildcard: keep consuming blocks until the lookahead
            # (`n_sb`) lemmatizes to the next literal key, then descend.
            if not z(sb1_current.n_sb):
                break
            if key1_next:
                if key1_next == ld.lemmatize(sb1_current.n_sb.utf8low()):
                    self.recursive_match_sub(*params_with(sb1_current))
                    break
                else:
                    continue
        else:
            # Literal key: the current block itself must lemmatize to the
            # next key, otherwise this match path is abandoned.
            if key1_next:
                if key1_next == ld.lemmatize(sb1_current.utf8low()):
                    self.recursive_match_sub(*params_with(sb1_current))
                    continue
                else:
                    break
def process(self, sentence, inspect=False, replace=False): self.sentence = sentence # TODO remove me inspect = self.inspect or inspect if self.inspect: print "#|" * 80 print "processing \"%s\"" % self.sentence split_block_group = SplitBlockGroup.extract(self.sentence) candidate_split_block_s = [] # generate candidate_split_block_s for letter1 in split_block_group.letters(): # First string must be chars if ld.lemmatize(letter1.utf8low()) in self.first_strs_dict: candidate_split_block_s.append(letter1) if dots in self.tree: candidate_split_block_s.append(dots) # TODO # generate letter1_sb_list matched_strs__to__phrase = dict() for letter1 in candidate_split_block_s: # iterate each matched letter1 sb1_list_current = SplitBlockGroup([letter1]) # actually we append it before the current loop here. if letter1 == dots: key1_current = dots sb1_current = split_block_group[0] else: key1_current = ld.lemmatize(letter1.utf8low()) sb1_current = letter1 key1_dict_current = self.tree[key1_current] for key1_next in key1_dict_current: self.recursive_match(matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next) letter1_sb_list = sorted(matched_strs__to__phrase.values(), key=lambda i1: -len(i1)) if inspect: print print "[letter1_sb_list]", letter1_sb_list if replace: self.sentence = self.generate_replaced_sentence(letter1_sb_list, split_block_group) return [self.sentence, sorted(matched_strs__to__phrase.keys())]
def article_segment(self, sentence, inspect=False):
    """ Re-segment "chapped" (letter-spaced) runs in `sentence`,
        e.g. "no s e" -> "nose", and return the repaired sentence string.

        sentence -- raw byte string; non-breaking spaces are normalized first.
        inspect  -- enable debug printing.
    """
    # "\xc2\xa0" is a UTF-8 encoded non-breaking space.
    sentence = re.sub("\xc2\xa0", " ", sentence)
    split_block_group = SplitBlockGroup.extract(sentence)
    index_block__to__fixed_words = dict()
    # Generate fixed words and their indexes.
    for chapped_group1 in split_block_group.maybe_chapped_groups():
        chapped_group1 = SplitBlockGroup(chapped_group1)
        # Reject upper words
        # Iterate to remove continuous upper items
        rejected_items = set([])
        letters = chapped_group1.letters()
        for idx1, letter1 in enumerate(letters):
            # Stop before the last letter: the check below peeks at idx1 + 1.
            if (idx1 + 1) == len(letters):
                break
            if inspect:
                print letters
            # Two consecutive uppercase items are treated as an acronym/label
            # (e.g. "A." choice markers) and excluded from re-segmentation.
            if self.isupper(letter1.string) and self.isupper(
                    letters[idx1 + 1].string, 1):
                rejected_items.add(letter1)
                rejected_items.add(letters[idx1 + 1])
        for rejected_item1 in rejected_items:
            chapped_group1.remove(rejected_item1)
        # Collapse all blanks, then let the segmenter re-insert word breaks.
        chapped_strs = "".join(chapped_group1.concat_items().split(" "))
        fixed_words = " ".join(self.segment(chapped_strs))
        if inspect:
            print fixed_words
        # Keyed by the (begin, end) character positions of the original span.
        index_block__to__fixed_words[(
            chapped_group1[0].pos_begin,
            chapped_group1[-1].pos_end,
        )] = fixed_words
    if inspect:
        print
        print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups(
        )
        print "[index_block__to__fixed_words]", index_block__to__fixed_words
        print "\n" * 5
    # Fill fixed words by their indexes.
    for begin_end_pos in index_block__to__fixed_words:
        begin_idx1, end_idx1 = None, None
        # Locate the group-slice whose positions match this span; items that
        # are plain strings were inserted by earlier replacements and are skipped.
        for idx2, sb2 in enumerate(split_block_group):
            if isinstance(sb2, str):
                continue
            if begin_end_pos[0] == sb2.pos_begin:
                begin_idx1 = idx2
            if begin_end_pos[1] == sb2.pos_end:
                end_idx1 = idx2
        # NOTE(review): slice-assigning a str splices it in character by
        # character (each char becomes one item) — the str-skip above and the
        # blank-fixing loop below appear to rely on that; confirm intended.
        split_block_group[begin_idx1:end_idx1 + 1] = index_block__to__fixed_words[begin_end_pos]
        if inspect:
            print split_block_group
            print
    # Fix blanks
    for idx1, item1 in enumerate(split_block_group[:]):
        if not isinstance(item1, str):
            continue
        if (idx1 + 1) == len(split_block_group) - 1:
            break
        # NOTE(review): iterates a copy while `fix_blanks` presumably mutates
        # the live group; indexes may drift if it inserts/removes — verify.
        self.fix_blanks(split_block_group, item1, idx1, 1)
        self.fix_blanks(split_block_group, item1, idx1, -1)
    if inspect:
        print split_block_group.concat_items()
        print
    return split_block_group.concat_items()
def params_with(k1):
    """ encapsulate parameters in the current scope.

        k1 -- the split block to thread into the parameter list.

        NOTE(review): this is a free-standing copy of the closure nested in
        `recursive_match`; every name below except `k1` (`key1_dict_next`,
        `key1_next`, `matched_strs__to__phrase`, `sb1_list_current`,
        `key1_dict_current`, `key1_current`) comes from that enclosing scope
        and is undefined here — confirm whether this duplicate is dead code.
    """
    return [key1_dict_next, key1_next, "placeholder", matched_strs__to__phrase] + \
        [k1] + \
        [SplitBlockGroup(sb1_list_current), key1_dict_current, key1_current]
def test_maybe_chapped_groups(self):
    """ Letter-spaced choice text should split into one chapped group per
        choice marker, with the "A." style markers themselves excluded. """
    raw = "A. s un B.no s e C.fa c e D.ri c e"
    extracted = SplitBlockGroup.extract(raw).maybe_chapped_groups()
    joined = [SplitBlockGroup(group1).concat_items() for group1 in extracted]
    self.assertEqual(joined, ['s un ', 'no s e ', 'fa c e ', 'ri c e'])