def sub_split(phrase1):
    """Expand slash-separated alternatives in a phrase into separate variants.

    Example input: u"I/We/They... always/usually/often/sometimes...", which
    is meant to produce sub strings such as "I... always...".

    The phrase is parsed with ``SplitBlockGroup.extract``.  Each block is
    handed to ``check_slash`` together with a one-element index list; the
    list is then read back, so ``check_slash`` (defined elsewhere) is
    presumably expected to append the indexes of the sibling alternatives
    to it in place.  Every variant collected so far is combined with every
    index in that list.

    :param phrase1: the phrase to expand.
    :return: ``[SplitBlockGroup()]`` if nothing was ever appended, otherwise
        a list holding the ``concat_items()`` result of each built variant.
    """
    source_blocks = SplitBlockGroup.extract(phrase1)
    variants = [SplitBlockGroup()]
    pending_idxes = []

    for position, block in enumerate(source_blocks):
        # Indexes gathered for the previous block were already consumed
        # as alternatives, so they must not start a new step.
        if position in pending_idxes:
            continue

        pending_idxes = [position]
        check_slash(block, position, pending_idxes)
        has_alternatives = len(pending_idxes) > 1

        expanded = []
        for variant in variants:
            for alt_position in pending_idxes:
                candidate = SplitBlockGroup(variant)  # work on a duplicate
                alt_block = source_blocks[alt_position]

                # Inside a run of alternatives only letter blocks are kept.
                if has_alternatives and not alt_block.is_letter:
                    continue
                candidate.append(alt_block)

                # Collapse any multi-character blank block to one space.
                for member in candidate:
                    if z(member) and member.is_blank and len(member) > 1:
                        member.string = u' '

                expanded.append(candidate.concat_items())

        # Keep the previous variants when this step produced nothing.
        if expanded:
            variants = expanded

    return variants
def article_segment(self, sentence, inspect=False):
    """Re-segment the possibly run-together parts of ``sentence``.

    Steps, in order:

    1. Replace every ``"\\xc2\\xa0"`` sequence (the UTF-8 bytes of a
       non-breaking space in a Python 2 byte string) with a plain space and
       parse the result with ``SplitBlockGroup.extract``.
    2. For each group from ``maybe_chapped_groups()``: drop adjacent letter
       items flagged by ``self.isupper``, strip the spaces from what is
       left, run ``self.segment`` on it and remember the space-joined
       result under the ``(pos_begin, pos_end)`` span of the group.
    3. Splice each remembered result over the blocks whose positions match
       its span.
    4. Call ``self.fix_blanks`` in both directions for every ``str`` item.

    :param sentence: the text to repair.
    :param inspect: when true, print intermediate state for debugging.
    :return: ``concat_items()`` of the repaired block group.
    """
    # NOTE: every debug print below uses the single-argument print(...)
    # form, which outputs exactly what the Python 2 print statement does.
    sentence = re.sub("\xc2\xa0", " ", sentence)
    blocks = SplitBlockGroup.extract(sentence)
    span_to_words = {}

    # Generate fixed words and the spans they belong to.
    for raw_group in blocks.maybe_chapped_groups():
        group = SplitBlockGroup(raw_group)

        # Reject upper-case words: collect each pair of neighbouring letter
        # items that self.isupper accepts (the meaning of the extra ``1``
        # argument is defined by self.isupper, not visible here).
        letter_items = group.letters()
        final_pos = len(letter_items) - 1
        rejected = set()
        for pos, current in enumerate(letter_items):
            if pos == final_pos:
                break
            if inspect:
                print(letter_items)
            following = letter_items[pos + 1]
            if self.isupper(current.string) and self.isupper(following.string, 1):
                rejected.add(current)
                rejected.add(following)
        for item in rejected:
            group.remove(item)

        pieces = group.concat_items().split(" ")
        chapped_strs = "".join(pieces)
        fixed_words = " ".join(self.segment(chapped_strs))
        if inspect:
            print(fixed_words)

        span = (group[0].pos_begin, group[-1].pos_end)
        span_to_words[span] = fixed_words

    if inspect:
        print("")
        print("%s %s" % ("[split_block_group.maybe_chapped_groups()]",
                         blocks.maybe_chapped_groups()))
        print("%s %s" % ("[index_block__to__fixed_words]", span_to_words))
        print("\n" * 5)

    # Fill fixed words in by their spans.
    for span in span_to_words:
        first_idx = None
        last_idx = None
        for pos, item in enumerate(blocks):
            # Plain strings carry no position information.
            if isinstance(item, str):
                continue
            if span[0] == item.pos_begin:
                first_idx = pos
            if span[1] == item.pos_end:
                last_idx = pos
        blocks[first_idx:last_idx + 1] = span_to_words[span]

    if inspect:
        print(blocks)
        print("")

    # Fix blanks.  Iterate over a snapshot; the length check reads the live
    # group on every pass because fix_blanks receives it and may change it.
    for pos, item in enumerate(blocks[:]):
        if not isinstance(item, str):
            continue
        if (pos + 1) == len(blocks) - 1:
            break
        for direction in (1, -1):
            self.fix_blanks(blocks, item, pos, direction)

    if inspect:
        print(blocks.concat_items())
        print("")

    return blocks.concat_items()
def article_segment(self, sentence, inspect=False):
    """Repair word segmentation inside ``sentence`` and return the new text.

    ``"\\xc2\\xa0"`` sequences (a UTF-8 encoded non-breaking space when the
    input is a Python 2 byte string) are first turned into spaces.  The
    text is parsed by ``SplitBlockGroup.extract``; every group reported by
    ``maybe_chapped_groups()`` is cleaned of neighbouring letter items
    accepted by ``self.isupper``, joined without spaces and passed to
    ``self.segment``.  The space-joined segments replace the blocks covering
    the same ``(pos_begin, pos_end)`` span, after which ``self.fix_blanks``
    is run to both sides of each ``str`` item.

    :param sentence: the text to repair.
    :param inspect: when true, print intermediate state for debugging.
    :return: the value of ``concat_items()`` on the repaired block group.
    """
    # NOTE: debug output uses single-argument print(...) calls; under
    # Python 2 these print the same text as the print statement would.
    sentence = re.sub("\xc2\xa0", " ", sentence)
    sbg = SplitBlockGroup.extract(sentence)
    fixed_by_span = dict()

    # Pass 1: compute the fixed words for each candidate group.
    for candidate in sbg.maybe_chapped_groups():
        candidate = SplitBlockGroup(candidate)
        letters = candidate.letters()

        # Gather consecutive letter items that self.isupper accepts (the
        # second call passes an extra ``1`` whose meaning is defined by
        # self.isupper itself).
        doomed = set()
        for here, letter in enumerate(letters):
            there = here + 1
            if there == len(letters):
                break
            if inspect:
                print(letters)
            if not self.isupper(letter.string):
                continue
            if self.isupper(letters[there].string, 1):
                doomed.add(letter)
                doomed.add(letters[there])
        for victim in doomed:
            candidate.remove(victim)

        chapped_strs = "".join(candidate.concat_items().split(" "))
        segments = self.segment(chapped_strs)
        fixed_words = " ".join(segments)
        if inspect:
            print(fixed_words)

        key = (candidate[0].pos_begin, candidate[-1].pos_end)
        fixed_by_span[key] = fixed_words

    if inspect:
        groups_label = "[split_block_group.maybe_chapped_groups()]"
        mapping_label = "[index_block__to__fixed_words]"
        print("")
        print("%s %s" % (groups_label, sbg.maybe_chapped_groups()))
        print("%s %s" % (mapping_label, fixed_by_span))
        print("\n" * 5)

    # Pass 2: write the fixed words back over the matching block range.
    for begin_end in fixed_by_span:
        start_at, stop_at = None, None
        for where, entry in enumerate(sbg):
            if not isinstance(entry, str):
                # Only non-str entries expose pos_begin / pos_end.
                if begin_end[0] == entry.pos_begin:
                    start_at = where
                if begin_end[1] == entry.pos_end:
                    stop_at = where
        sbg[start_at:stop_at + 1] = fixed_by_span[begin_end]

    if inspect:
        print(sbg)
        print("")

    # Pass 3: fix blanks around the str items.  A snapshot is enumerated
    # while the live group is handed to fix_blanks, so the length used in
    # the stop condition is re-read on each iteration.
    for where, entry in enumerate(sbg[:]):
        if isinstance(entry, str):
            if (where + 1) == len(sbg) - 1:
                break
            self.fix_blanks(sbg, entry, where, 1)
            self.fix_blanks(sbg, entry, where, -1)

    if inspect:
        print(sbg.concat_items())
        print("")

    return sbg.concat_items()