def sub_split(phrase1):
    """ Expand a phrase with slash-separated alternatives into sub strs.

    E.g. u"I/We/They... always/usually/often/sometimes..." is split into
    sub strs such as "I... always...".

    `phrase1` is parsed with SplitBlockGroup.extract() and walked block by
    block.  For each block that was not already consumed as an alternative,
    every variant built so far is extended once per alternative.

    Returns a list.  Once at least one block has been appended, its elements
    are the values returned by concat_items(); before that it is the initial
    [SplitBlockGroup()].
    """
    source_blocks = SplitBlockGroup.extract(phrase1)
    variants = [SplitBlockGroup()]

    # Indexes handled together with the block currently being processed.
    grouped_idxes = []
    for position, block in enumerate(source_blocks):
        # Already emitted as an alternative of the previous processed block.
        if position in grouped_idxes:
            continue

        grouped_idxes = [position]
        # Presumably check_slash() appends, in place, the indexes of the
        # blocks that are "/"-separated alternatives of `block` -- TODO
        # confirm against its definition (it is not visible here).
        check_slash(block, position, grouped_idxes)
        has_alternatives = len(grouped_idxes) > 1

        expanded = []
        for variant in variants:
            for alt_idx in grouped_idxes:
                # Work on a copy so that sibling alternatives do not share
                # the appended block.  From the second round on `variant`
                # is a concat_items() result, not a SplitBlockGroup.
                candidate = SplitBlockGroup(variant)
                alt_block = source_blocks[alt_idx]
                # When the block has alternatives only letter blocks are
                # kept; everything else in the group is dropped.
                if (not has_alternatives) or alt_block.is_letter:
                    candidate.append(alt_block)
                    # Collapse every multi-character blank block into one
                    # single space.
                    for member in candidate:
                        if z(member) and member.is_blank and (len(member) > 1):
                            member.string = u' '
                    expanded.append(candidate.concat_items())

        # Keep the previous variants when this round produced nothing.
        if expanded:
            variants = expanded

    return variants
예제 #2
0
def sub_split(phrase1):
    """ Expand a phrase with slash-separated alternatives into sub strs.

    E.g. u"I/We/They... always/usually/often/sometimes..." is split into
    sub strs such as "I... always...".

    `phrase1` is parsed with SplitBlockGroup.extract() and walked block by
    block.  For each block that was not already consumed as an alternative,
    every variant built so far is extended once per alternative.

    Returns a list.  Once at least one block has been appended, its elements
    are the values returned by concat_items(); before that it is the initial
    [SplitBlockGroup()].
    """
    source_blocks = SplitBlockGroup.extract(phrase1)
    variants = [SplitBlockGroup()]

    # Indexes handled together with the block currently being processed.
    grouped_idxes = []
    for position, block in enumerate(source_blocks):
        # Already emitted as an alternative of the previous processed block.
        if position in grouped_idxes:
            continue

        grouped_idxes = [position]
        # Presumably check_slash() appends, in place, the indexes of the
        # blocks that are "/"-separated alternatives of `block` -- TODO
        # confirm against its definition (it is not visible here).
        check_slash(block, position, grouped_idxes)
        has_alternatives = len(grouped_idxes) > 1

        expanded = []
        for variant in variants:
            for alt_idx in grouped_idxes:
                # Work on a copy so that sibling alternatives do not share
                # the appended block.  From the second round on `variant`
                # is a concat_items() result, not a SplitBlockGroup.
                candidate = SplitBlockGroup(variant)
                alt_block = source_blocks[alt_idx]
                # When the block has alternatives only letter blocks are
                # kept; everything else in the group is dropped.
                if (not has_alternatives) or alt_block.is_letter:
                    candidate.append(alt_block)
                    # Collapse every multi-character blank block into one
                    # single space.
                    for member in candidate:
                        if z(member) and member.is_blank and (len(member) > 1):
                            member.string = u' '
                    expanded.append(candidate.concat_items())

        # Keep the previous variants when this round produced nothing.
        if expanded:
            variants = expanded

    return variants
예제 #3
0
    def article_segment(self, sentence, inspect=False):
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        index_block__to__fixed_words = dict()

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words
            # Iterate to remove continuous upper items
            rejected_items = set([])
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print letters
                if self.isupper(letter1.string) and self.isupper(letters[idx1 + 1].string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(letters[idx1 + 1])
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            chapped_strs   = "".join(chapped_group1.concat_items().split(" "))
            fixed_words    = " ".join(self.segment(chapped_strs))
            if inspect:
                print fixed_words

            index_block__to__fixed_words[(chapped_group1[0].pos_begin, chapped_group1[-1].pos_end,)] = fixed_words
        if inspect:
            print
            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups()
            print "[index_block__to__fixed_words]", index_block__to__fixed_words
            print "\n" * 5

        # Fill fixed words by their indexes.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                if isinstance(sb2, str):
                    continue
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1   = idx2
            split_block_group[begin_idx1:end_idx1 + 1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print split_block_group
                print

        # Fix blanks
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print split_block_group.concat_items()
            print
        return split_block_group.concat_items()
예제 #4
0
    def article_segment(self, sentence, inspect=False):
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        index_block__to__fixed_words = dict()

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words
            # Iterate to remove continuous upper items
            rejected_items = set([])
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print letters
                if self.isupper(letter1.string) and self.isupper(
                        letters[idx1 + 1].string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(letters[idx1 + 1])
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            chapped_strs = "".join(chapped_group1.concat_items().split(" "))
            fixed_words = " ".join(self.segment(chapped_strs))
            if inspect:
                print fixed_words

            index_block__to__fixed_words[(
                chapped_group1[0].pos_begin,
                chapped_group1[-1].pos_end,
            )] = fixed_words
        if inspect:
            print
            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups(
            )
            print "[index_block__to__fixed_words]", index_block__to__fixed_words
            print "\n" * 5

        # Fill fixed words by their indexes.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                if isinstance(sb2, str):
                    continue
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1 = idx2
            split_block_group[begin_idx1:end_idx1 +
                              1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print split_block_group
                print

        # Fix blanks
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print split_block_group.concat_items()
            print
        return split_block_group.concat_items()