示例#1
0
def sub_split(phrase1):
    """ Expand a phrase with slash alternatives, e.g. u"I/We/They... always/usually...", into one entry per combination.

    The phrase is turned into blocks with `SplitBlockGroup.extract`. For every
    block that has not been consumed yet, `check_slash` is given the block, its
    index and a one-element index list; presumably it appends the indexes of
    the slash-separated alternatives to that list -- TODO confirm against
    `check_slash`. Every variant built so far is then extended once per
    collected index.

    Returns a list holding the `concat_items()` result of every variant, or
    the initial `[SplitBlockGroup()]` when no block was ever appended.
    """
    source_blocks = SplitBlockGroup.extract(phrase1)
    variants = [SplitBlockGroup()]

    consumed_idxes = []
    for position, block in enumerate(source_blocks):
        # Indexes gathered in the previous round were already emitted.
        if position in consumed_idxes:
            continue

        consumed_idxes = [position]
        check_slash(block, position, consumed_idxes)

        expanded = []
        has_alternatives = len(consumed_idxes) > 1
        for variant in variants:
            for alt_idx in consumed_idxes:
                candidate = SplitBlockGroup(variant)  # work on a duplicate
                alt_block = source_blocks[alt_idx]
                # Within a group of alternatives only letter blocks are kept.
                if has_alternatives and (not alt_block.is_letter):
                    continue
                candidate.append(alt_block)
                # Collapse every multi-character blank block to a single space.
                for item in candidate:
                    if z(item) and item.is_blank and (len(item) > 1):
                        item.string = u' '
                expanded.append(candidate.concat_items())
        # Keep the previous variants when this round produced nothing.
        if expanded:
            variants = expanded

    return variants
    def recursive_match(self, matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next=None):
        """ Walk forward from `sb1_current` and record the blocks that match the phrase of the current level.

        Arguments, as they are used in this method:
          matched_strs__to__phrase -- dict updated in place; maps the Phrase key found in
                                      `key1_dict_current` to a SplitBlockGroup copy of
                                      `sb1_list_current` at the moment of the match.
          sb1_current              -- block to start from; the walk follows its `n_sb` links.
          sb1_list_current         -- blocks collected so far; visited blocks are appended in place.
          key1_dict_current        -- dict of the current level; indexed with `key1_next` and
                                      scanned for a key that is a `Phrase` instance.
          key1_current             -- key of the current level, compared against `dots`.
          key1_next                -- key of the next level to look for, or None.

        Going one level deeper is delegated to `self.recursive_match_sub`, which is
        defined outside this view. Nothing is returned; results are delivered through
        `matched_strs__to__phrase`.
        """
        # Debug trace, only when the instance-level inspect flag is set.
        if self.inspect:
            print "#" * 30, "[key1_current]", key1_current, "[key1_next]", key1_next
        key1_dict_next = None if key1_next is None else key1_dict_current[key1_next]
        # First key of the current level that is a Phrase, or None when there is none.
        phrase_current = ([p1 for p1 in key1_dict_current if isinstance(p1, Phrase)] or [None])[0]  # every level has only one phrase.

        def params_with(k1):
            """ Build the argument list for `recursive_match_sub` from the enclosing scope; `k1` is the only varying value. """
            return [key1_dict_next, key1_next, "placeholder", matched_strs__to__phrase] + \
                   [k1] + \
                   [SplitBlockGroup(sb1_list_current), key1_dict_current, key1_current]

        # When the next key is `dots`, descend right away from the starting block.
        # NOTE(review): `dots` presumably stands for the "..." gap of a phrase pattern -- confirm.
        if key1_next == dots:
            self.recursive_match_sub(*params_with(sb1_current))

        # Follow the `n_sb` chain until a block without a successor is reached.
        while sb1_current.n_sb:
            sb1_current     = sb1_current.n_sb  # direct to next, cause current is appended to `sb1_list_current`
            # sb1_current_str = sb1_current.utf8low()
            sb1_list_current.append(sb1_current)
            if self.inspect:
                print "[sb1_current]", "\"%s\"" % sb1_current
            if self.inspect:
                print "len [sb1_list_current]", len(sb1_list_current)

            if phrase_current:
                # An "ender" is an `is_other` block other than an apostrophe, or the last block of the chain.
                is_ender = ((sb1_current.is_other and (sb1_current.utf8low() not in ["'"])) or (sb1_current.n_sb is None))
                # Under a `dots` key a match is only recorded at an ender; under any other key it is recorded at every step.
                if ((key1_current == dots) and is_ender) or \
                   (key1_current != dots):
                        matched_strs__to__phrase[phrase_current] = SplitBlockGroup(sb1_list_current)  # make a copy
                        if self.inspect:
                            print
                            print "[end candidate_split_block_s loop : sb1_list_current]", sb1_list_current
                            print
                        # Nothing further to look for once the match is stored.
                        if key1_next is None:
                            break

            if key1_current == dots:
                # `z` is defined elsewhere; the walk stops when it rejects the following block.
                if not z(sb1_current.n_sb):
                    break
                if key1_next:
                    # Look one block ahead: descend and stop as soon as the lemma of the following block equals the next key.
                    if key1_next == ld.lemmatize(sb1_current.n_sb.utf8low()):
                        self.recursive_match_sub(*params_with(sb1_current))
                        break
                    else:
                        continue
                # With a falsy `key1_next` the loop simply moves on to the next block.
            else:
                if key1_next:
                    # Compare the lemma of the current block itself; the walk continues whether or not it matched.
                    if key1_next == ld.lemmatize(sb1_current.utf8low()):
                        self.recursive_match_sub(*params_with(sb1_current))
                    continue
                else:
                    break
    def process(self, sentence, inspect=False, replace=False):
        self.sentence = sentence  # TODO remove me

        inspect = self.inspect or inspect
        if self.inspect:
            print "#|" * 80
            print "processing \"%s\"" % self.sentence

        split_block_group = SplitBlockGroup.extract(self.sentence)
        candidate_split_block_s = []

        # generate candidate_split_block_s
        for letter1 in split_block_group.letters():
            # First string must be chars
            if ld.lemmatize(letter1.utf8low()) in self.first_strs_dict:
                candidate_split_block_s.append(letter1)

        if dots in self.tree:
            candidate_split_block_s.append(dots)  # TODO

        # generate letter1_sb_list
        matched_strs__to__phrase = dict()
        for letter1 in candidate_split_block_s:  # iterate each matched letter1
            sb1_list_current = SplitBlockGroup([letter1])  # actually we append it before the current loop here.

            if letter1 == dots:
                key1_current = dots
                sb1_current = split_block_group[0]
            else:
                key1_current = ld.lemmatize(letter1.utf8low())
                sb1_current = letter1

            key1_dict_current = self.tree[key1_current]

            for key1_next in key1_dict_current:
                self.recursive_match(matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next)

        letter1_sb_list = sorted(matched_strs__to__phrase.values(), key=lambda i1: -len(i1))
        if inspect:
            print
            print "[letter1_sb_list]", letter1_sb_list

        if replace:
            self.sentence = self.generate_replaced_sentence(letter1_sb_list, split_block_group)

        return [self.sentence, sorted(matched_strs__to__phrase.keys())]
示例#4
0
    def article_segment(self, sentence, inspect=False):
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        index_block__to__fixed_words = dict()

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words
            # Iterate to remove continuous upper items
            rejected_items = set([])
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print letters
                if self.isupper(letter1.string) and self.isupper(
                        letters[idx1 + 1].string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(letters[idx1 + 1])
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            chapped_strs = "".join(chapped_group1.concat_items().split(" "))
            fixed_words = " ".join(self.segment(chapped_strs))
            if inspect:
                print fixed_words

            index_block__to__fixed_words[(
                chapped_group1[0].pos_begin,
                chapped_group1[-1].pos_end,
            )] = fixed_words
        if inspect:
            print
            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups(
            )
            print "[index_block__to__fixed_words]", index_block__to__fixed_words
            print "\n" * 5

        # Fill fixed words by their indexes.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                if isinstance(sb2, str):
                    continue
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1 = idx2
            split_block_group[begin_idx1:end_idx1 +
                              1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print split_block_group
                print

        # Fix blanks
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print split_block_group.concat_items()
            print
        return split_block_group.concat_items()
 def params_with(k1):
     """ Collect the values of the enclosing scope into one argument list; `k1` is the only value supplied by the caller. """
     return [
         key1_dict_next,
         key1_next,
         "placeholder",
         matched_strs__to__phrase,
         k1,
         SplitBlockGroup(sb1_list_current),  # a fresh group built from the current list
         key1_dict_current,
         key1_current,
     ]
示例#6
0
 def test_maybe_chapped_groups(self):
     """ Each group from `maybe_chapped_groups()` concatenates back to the option text, spacing included. """
     sentence = "A. s un  B.no s e C.fa c e  D.ri c e"
     expected = ['s un  ', 'no s e ', 'fa c e  ', 'ri c e']

     raw_groups = SplitBlockGroup.extract(sentence).maybe_chapped_groups()
     concatenated = []
     for raw_group in raw_groups:
         concatenated.append(SplitBlockGroup(raw_group).concat_items())

     self.assertEqual(concatenated, expected)