def sub_split(phrase1):
    """
    Split u"I/We/They... always/usually/often/sometimes..." into sub strings,
    e.g. "I... always...".
    """
    old_sbg = SplitBlockGroup.extract(phrase1)
    new_sbg_list = [SplitBlockGroup()]
    current_skiped_idxes = []
    for idx1, sb1 in enumerate(old_sbg):
        if idx1 in current_skiped_idxes:
            continue
        current_skiped_idxes = [idx1]
        check_slash(sb1, idx1, current_skiped_idxes)

        new_sbg_list_dup = []
        is_split = len(current_skiped_idxes) > 1
        for sbg1 in new_sbg_list:
            for skip_idx1 in current_skiped_idxes:
                current_sbg1 = SplitBlockGroup(sbg1)  # make a duplicate
                sb2 = old_sbg[skip_idx1]
                if is_split and (not sb2.is_letter):
                    continue
                current_sbg1.append(sb2)
                # Collapse multi-character blanks into a single space.
                for sb3 in current_sbg1:
                    if z(sb3) and sb3.is_blank and (len(sb3) > 1):
                        sb3.string = u' '
                new_sbg_list_dup.append(current_sbg1.concat_items())
        if len(new_sbg_list_dup):
            new_sbg_list = new_sbg_list_dup
    return new_sbg_list

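# A minimal standalone sketch of the expansion sub_split aims at, assuming the
# intent is to expand slash-separated alternatives into every combination.
# expand_slashes and its regex tokenization are hypothetical stand-ins, not
# part of SplitBlockGroup or check_slash.
import itertools
import re

def expand_slashes(phrase):
    # u"I/We always/usually play." -> [[u"I", u"We"], [u" "],
    #                                  [u"always", u"usually"], [u" play."]]
    parts = [p.split(u"/") for p in re.split(ur"(\S+/\S+)", phrase) if p]
    return [u"".join(choice) for choice in itertools.product(*parts)]

# expand_slashes(u"I/We always/usually play.")
# -> [u'I always play.', u'I usually play.',
#     u'We always play.', u'We usually play.']
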
def generate_SplitBlock_list(self, env):
    # 1. generate SplitBlock list
    env.split_block_group = SplitBlockGroup.extract(env.sentence)
    if env.inspect:
        print "[split_block_group]", env.split_block_group
        print

def __init__(self, phrasal_collocation_s):
    # Build the two data structures self.first_strs_dict and self.tree.
    phrase_to_strs_list = dict()
    for phrasal_collocation1 in phrasal_collocation_s:
        assert isinstance(phrasal_collocation1, unicode)
        phrasal_collocation2 = Phrase(phrasal_collocation1)
        for pc2 in PhrasalRecognizer.split(phrasal_collocation1):
            split_block_group = SplitBlockGroup.extract(str(pc2).lower())
            # Lowercase both words and spaces; skip fake sb1 (no position).
            strs_list = [sb1.utf8low() for sb1 in split_block_group
                         if sb1.pos_begin is not None]
            if phrasal_collocation2 not in phrase_to_strs_list:
                phrase_to_strs_list[phrasal_collocation2] = []
            phrase_to_strs_list[phrasal_collocation2].append(strs_list)

    self.first_strs_dict = {i2[0]: True
                            for i1 in phrase_to_strs_list.values()
                            for i2 in i1}

    self.tree = dict()
    for phrase1 in phrase_to_strs_list:
        for strs1 in phrase_to_strs_list[phrase1]:
            current_dict = self.tree
            for idx2, s2 in enumerate(strs1):
                if not s2:
                    continue  # ignore spaces; the search side ignores them too.
                # Every node is a dict.
                if s2 not in current_dict:
                    current_dict[s2] = dict()
                current_dict = current_dict[s2]
                # Mark an end node with the phrase itself.
                if idx2 == (len(strs1) - 1):
                    current_dict[phrase1] = True
    self.inspect = False

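# A minimal sketch of the nested-dict trie built above, with plain word lists
# standing in for Phrase/SplitBlockGroup. The terminal key being the phrase
# itself mirrors the `current_dict[phrase1] = True` ender above.
def build_phrase_tree(phrases):
    tree = dict()
    for phrase in phrases:
        node = tree
        words = phrase.split()
        for idx, word in enumerate(words):
            node = node.setdefault(word, dict())
            if idx == len(words) - 1:
                node[phrase] = True  # mark the end of a complete phrase
    return tree

# build_phrase_tree([u"look forward to", u"look up"])
# -> {u'look': {u'forward': {u'to': {u'look forward to': True}},
#               u'up': {u'look up': True}}}
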
@classmethod
def unnormal_words_count(cls, sentence, inspect=False):
    if inspect:
        print "[unnormal_words_count]", sentence
    c = 0
    for sb1 in SplitBlockGroup.extract(sentence):
        if inspect:
            print [sb1]
        if sb1.is_letter and (not sb1.is_regular):
            c += 1
    return c

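# A standalone sketch of the same idea: count tokens that fail a "regular"
# check. A toy vocabulary set stands in for SplitBlock.is_regular, whose real
# test is not shown in this module.
def toy_unnormal_count(sentence, vocab):
    return sum(1 for w in sentence.split()
               if w.isalpha() and w.lower() not in vocab)

# toy_unnormal_count(u"I like appples", set([u'i', u'like', u'apples']))  # -> 1
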
def process(self, sentence, inspect=False, replace=False):
    self.sentence = sentence  # TODO remove me
    inspect = self.inspect or inspect
    if self.inspect:
        print "#|" * 80
        print "processing \"%s\"" % self.sentence

    split_block_group = SplitBlockGroup.extract(self.sentence)

    # Generate candidate_split_block_s; the first string must be letters.
    candidate_split_block_s = []
    for letter1 in split_block_group.letters():
        if ld.lemmatize(letter1.utf8low()) in self.first_strs_dict:
            candidate_split_block_s.append(letter1)
    if dots in self.tree:
        candidate_split_block_s.append(dots)  # TODO

    # Generate letter1_sb_list.
    matched_strs__to__phrase = dict()
    for letter1 in candidate_split_block_s:  # iterate each matched letter1
        # letter1 is appended here, before the recursion below starts.
        sb1_list_current = SplitBlockGroup([letter1])
        if letter1 == dots:
            key1_current = dots
            sb1_current = split_block_group[0]
        else:
            key1_current = ld.lemmatize(letter1.utf8low())
            sb1_current = letter1
        key1_dict_current = self.tree[key1_current]
        for key1_next in key1_dict_current:
            self.recursive_match(matched_strs__to__phrase, sb1_current,
                                 sb1_list_current, key1_dict_current,
                                 key1_current, key1_next)

    letter1_sb_list = sorted(matched_strs__to__phrase.values(),
                             key=lambda i1: -len(i1))
    if inspect:
        print
        print "[letter1_sb_list]", letter1_sb_list

    if replace:
        self.sentence = self.generate_replaced_sentence(letter1_sb_list,
                                                        split_block_group)
    return [self.sentence, sorted(matched_strs__to__phrase.keys())]

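# A standalone sketch of the trie walk that process() and recursive_match
# perform: scan the sentence word by word through the nested-dict tree (see
# build_phrase_tree above). Lemmatization, SplitBlock positions, and the dots
# wildcard are omitted; only the matching idea is shown.
def toy_match_phrases(words, tree):
    matched = []
    for start in range(len(words)):
        node = tree
        for word in words[start:]:
            if word not in node:
                break
            node = node[word]
            # Phrase keys map to True; word keys map to deeper dicts.
            matched.extend(k for k, v in node.items() if v is True)
    return matched

# tree = build_phrase_tree([u"look forward to", u"look up"])
# toy_match_phrases(u"we look forward to it".split(), tree)
# -> [u'look forward to']
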
@classmethod
def is_normal_sentence(cls, sentence, inspect=False):
    """ Only check English words. """
    if inspect:
        print "[is_normal_sentence]", sentence
    for sb1 in SplitBlockGroup.extract(sentence):
        if inspect:
            print [sb1]
        if sb1.is_letter and (not sb1.is_regular):
            # Tolerate common two-letter words, e.g. is, am, of, ...
            if unicode(sb1.string) in ld.two_length_words:
                continue
            # Tolerate names, e.g. Liu Xing
            if sb1.p_sb and regexp.upper.match(sb1.string[0]):
                continue
            # Tolerate letter lists like "(p,r,i,g,n,s)"
            if (sb1.p_sb and sb1.p_sb.is_other) and (sb1.n_sb and sb1.n_sb.is_other):
                continue
            # TODO remove am, is
            if sb1.string not in ['s', 'm', 'am', 'is']:
                return False
            # Tolerate split plurals, e.g. ["it is 5 s (pounds)", "pound"]
            if sb1.p_sb and sb1.p_sb.can_fill:
                continue
            # Tolerate contractions, e.g. I'm.
            if sb1.p_sb and sb1.p_sb.string == "'":
                continue
            # Tolerate " is "
            if (sb1.p_sb and sb1.p_sb.is_blank) and (sb1.n_sb and sb1.n_sb.is_blank):
                continue
            # Tolerate "I am ..."
            if (sb1.p_sb and sb1.p_sb.is_blank) and \
                    (sb1.relative_to_current(-2) and
                     sb1.relative_to_current(-2).string == 'I'):
                continue
    return True

def lemmatize_sentence(sentence):
    sbg = SplitBlockGroup.extract(sentence)
    for sb1 in sbg:
        if sb1.is_letter:
            sb1.string = ld.lemmatize(sb1.string)
    return sbg.concat_items()

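# A standalone sketch of the same transformation with a toy lemma table
# standing in for ld.lemmatize; the real lemmatizer is not shown here.
TOY_LEMMAS = {u'cats': u'cat', u'ran': u'run'}

def toy_lemmatize_sentence(sentence):
    return u' '.join(TOY_LEMMAS.get(w, w) for w in sentence.split())

# toy_lemmatize_sentence(u"cats ran away")  # -> u'cat run away'
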
def article_segment(self, sentence, inspect=False):
    sentence = re.sub("\xc2\xa0", " ", sentence)  # replace non-breaking spaces
    split_block_group = SplitBlockGroup.extract(sentence)
    index_block__to__fixed_words = dict()

    # Generate fixed words and their indexes.
    for chapped_group1 in split_block_group.maybe_chapped_groups():
        chapped_group1 = SplitBlockGroup(chapped_group1)

        # Reject upper-case words: iterate to remove runs of consecutive
        # upper-case items.
        rejected_items = set([])
        letters = chapped_group1.letters()
        for idx1, letter1 in enumerate(letters):
            if (idx1 + 1) == len(letters):
                break
            if inspect:
                print letters
            if self.isupper(letter1.string) and \
                    self.isupper(letters[idx1 + 1].string, 1):
                rejected_items.add(letter1)
                rejected_items.add(letters[idx1 + 1])
        for rejected_item1 in rejected_items:
            chapped_group1.remove(rejected_item1)

        chapped_strs = "".join(chapped_group1.concat_items().split(" "))
        fixed_words = " ".join(self.segment(chapped_strs))
        if inspect:
            print fixed_words
        index_block__to__fixed_words[(chapped_group1[0].pos_begin,
                                      chapped_group1[-1].pos_end,)] = fixed_words

    if inspect:
        print
        print "[split_block_group.maybe_chapped_groups()]", \
            split_block_group.maybe_chapped_groups()
        print "[index_block__to__fixed_words]", index_block__to__fixed_words
        print "\n" * 5

    # Fill fixed words back in by their indexes.
    for begin_end_pos in index_block__to__fixed_words:
        begin_idx1, end_idx1 = None, None
        for idx2, sb2 in enumerate(split_block_group):
            if isinstance(sb2, str):
                continue
            if begin_end_pos[0] == sb2.pos_begin:
                begin_idx1 = idx2
            if begin_end_pos[1] == sb2.pos_end:
                end_idx1 = idx2
        split_block_group[begin_idx1:end_idx1 + 1] = \
            index_block__to__fixed_words[begin_end_pos]
        if inspect:
            print split_block_group
            print

    # Fix blanks.
    for idx1, item1 in enumerate(split_block_group[:]):
        if not isinstance(item1, str):
            continue
        if (idx1 + 1) == len(split_block_group) - 1:
            break
        self.fix_blanks(split_block_group, item1, idx1, 1)
        self.fix_blanks(split_block_group, item1, idx1, -1)

    if inspect:
        print split_block_group.concat_items()
        print
    return split_block_group.concat_items()

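# A standalone sketch of the repair article_segment performs on "chapped"
# text such as "no s e" (see test_maybe_chapped_groups below): strip the stray
# spaces, then re-segment against a dictionary. A greedy longest-match stands
# in for self.segment, whose real algorithm is not shown here.
def toy_segment(chapped, vocab):
    text = u"".join(chapped.split(u" "))  # "no s e" -> "nose"
    words, pos = [], 0
    while pos < len(text):
        # Take the longest dictionary word starting at pos, else one char.
        for end in range(len(text), pos, -1):
            if text[pos:end] in vocab or end == pos + 1:
                words.append(text[pos:end])
                pos = end
                break
    return u" ".join(words)

# toy_segment(u"no s e", set([u'nose']))  # -> u'nose'
# toy_segment(u"fa c e", set([u'face']))  # -> u'face'
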
def test_is_a_SplitBlock(self):
    sb1 = SplitBlockGroup.extract("hello")[0]
    self.assertTrue(SplitBlockGroup.z(sb1))

def test_maybe_chapped_groups(self):
    groups = SplitBlockGroup.extract(
        "A. s un B.no s e C.fa c e D.ri c e").maybe_chapped_groups()
    groups = [SplitBlockGroup(g1).concat_items() for g1 in groups]
    self.assertEqual(groups, ['s un ', 'no s e ', 'fa c e ', 'ri c e'])