def sub_split(phrase1):
    """ split u"I/We/They… always/usually/often/sometimes…" into sub strs, e.g. "I... always...". """
    # Tokenize the phrase into SplitBlock items (letters / blanks / punctuation).
    old_sbg = SplitBlockGroup.extract(phrase1)
    # Each entry is one candidate expansion of the phrase; start from one empty group.
    new_sbg_list = [SplitBlockGroup()]

    current_skiped_idxes = []
    for idx1, sb1 in enumerate(old_sbg):
        # Skip blocks already consumed as part of a previous slash group.
        if idx1 in current_skiped_idxes:
            continue

        current_skiped_idxes = [idx1]

        # NOTE(review): check_slash appears to extend current_skiped_idxes in
        # place with the indexes of "a/b/c" alternatives — confirm against its
        # definition (not visible here).
        check_slash(sb1, idx1, current_skiped_idxes)

        new_sbg_list_dup = []
        # More than one collected index means sb1 started a slash-separated set,
        # so each alternative branches every accumulated group.
        is_split = len(current_skiped_idxes) > 1
        for sbg1 in new_sbg_list:
            for skip_idx1 in current_skiped_idxes:
                current_sbg1 = SplitBlockGroup(sbg1)  # make a dup
                sb2 = old_sbg[skip_idx1]
                # When branching alternatives, only letter blocks are kept; the
                # "/" separators themselves are dropped.
                if is_split and (not sb2.is_letter):
                    continue
                current_sbg1.append(sb2)
                # strip blank into 1: collapse any multi-char blank block to a single space
                for sb1 in current_sbg1:
                    if z(sb1) and sb1.is_blank and (len(sb1) > 1):
                        sb1.string = u' '
                new_sbg_list_dup.append(current_sbg1.concat_items())
        # Only replace the accumulated list when this round produced output.
        if len(new_sbg_list_dup):
            new_sbg_list = new_sbg_list_dup

    return new_sbg_list
    def __init__(self, phrasal_collocation_s):
        # generate self.first_strs_dict and self.tree these two data structure
        #
        # Phrase -> list of token-string lists (one list per sub-phrase split).
        phrase_to_strs_list = dict()
        for phrasal_collocation1 in phrasal_collocation_s:
            assert isinstance(phrasal_collocation1, unicode)
            phrasal_collocation2 = Phrase(phrasal_collocation1)
            # NOTE(review): PhrasalRecognizer.split presumably yields sub-phrases
            # of the collocation; each is re-tokenized lowercased — confirm.
            for pc2 in PhrasalRecognizer.split(phrasal_collocation1):
                split_block_group = SplitBlockGroup.extract(str(pc2).lower())
                # strip both words and spaces
                strs_list = [sb1.utf8low() for sb1 in split_block_group if sb1.pos_begin is not None]  # skip fake sb1
                if phrasal_collocation2 not in phrase_to_strs_list:
                    phrase_to_strs_list[phrasal_collocation2] = []
                phrase_to_strs_list[phrasal_collocation2].append(strs_list)

        # First token of every tokenization — used as a fast pre-filter at match time.
        self.first_strs_dict = {i2[0]: True for i1 in phrase_to_strs_list.values() for i2 in i1}

        # Trie of tokens; a Phrase instance stored as a key marks a complete phrase.
        self.tree = dict()
        for phrase1 in phrase_to_strs_list:
            for strs1 in phrase_to_strs_list[phrase1]:
                current_dict = self.tree
                for idx2, s2 in enumerate(strs1):
                    if not s2:
                        continue  # ignore spaces, and will ignore at search by the way.
                    # always a dict
                    if s2 not in current_dict:
                        current_dict[s2] = dict()
                    current_dict = current_dict[s2]
                    # mark a ender
                    if idx2 == (len(strs1) - 1):
                        current_dict[phrase1] = True

        # Debug-print switch read by the matching methods.
        self.inspect = False
示例#3
0
    def generate_SplitBlock_list(self, env):
        # 1. generate SplitBlock list
        env.split_block_group = SplitBlockGroup.extract(env.sentence)

        if env.inspect:
            print "[split_block_group]", env.split_block_group
            print
    def process(self, sentence, inspect=False, replace=False):
        """Find known phrases inside `sentence` via the token trie.

        Returns [sentence, sorted matched phrases]; when `replace` is true the
        returned sentence has the matched spans rewritten first.
        """
        self.sentence = sentence  # TODO remove me

        inspect = self.inspect or inspect
        if self.inspect:
            print "#|" * 80
            print "processing \"%s\"" % self.sentence

        split_block_group = SplitBlockGroup.extract(self.sentence)
        candidate_split_block_s = []

        # generate candidate_split_block_s
        for letter1 in split_block_group.letters():
            # First string must be chars
            # Lemmatize so inflected forms still hit the pre-filter dict.
            if ld.lemmatize(letter1.utf8low()) in self.first_strs_dict:
                candidate_split_block_s.append(letter1)

        # `dots` is a special wildcard key ("…") — match from sentence start.
        if dots in self.tree:
            candidate_split_block_s.append(dots)  # TODO

        # generate letter1_sb_list
        matched_strs__to__phrase = dict()
        for letter1 in candidate_split_block_s:  # iterate each matched letter1
            sb1_list_current = SplitBlockGroup([letter1])  # actually we append it before the current loop here.

            if letter1 == dots:
                # Wildcard start: begin matching at the first block of the sentence.
                key1_current = dots
                sb1_current = split_block_group[0]
            else:
                key1_current = ld.lemmatize(letter1.utf8low())
                sb1_current = letter1

            key1_dict_current = self.tree[key1_current]

            # Walk every trie branch reachable from this first token.
            for key1_next in key1_dict_current:
                self.recursive_match(matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next)

        # Longest matches first.
        letter1_sb_list = sorted(matched_strs__to__phrase.values(), key=lambda i1: -len(i1))
        if inspect:
            print
            print "[letter1_sb_list]", letter1_sb_list

        if replace:
            self.sentence = self.generate_replaced_sentence(letter1_sb_list, split_block_group)

        return [self.sentence, sorted(matched_strs__to__phrase.keys())]
示例#5
0
def sub_split(phrase1):
    """ split u"I/We/They… always/usually/often/sometimes…" into sub strs, e.g. "I... always...". """
    # Tokenize the phrase into SplitBlock items (letters / blanks / punctuation).
    old_sbg = SplitBlockGroup.extract(phrase1)
    # Each entry is one candidate expansion of the phrase; start from one empty group.
    new_sbg_list = [SplitBlockGroup()]

    current_skiped_idxes = []
    for idx1, sb1 in enumerate(old_sbg):
        # Skip blocks already consumed as part of a previous slash group.
        if idx1 in current_skiped_idxes:
            continue

        current_skiped_idxes = [idx1]

        # NOTE(review): check_slash appears to extend current_skiped_idxes in
        # place with the indexes of "a/b/c" alternatives — confirm against its
        # definition (not visible here).
        check_slash(sb1, idx1, current_skiped_idxes)

        new_sbg_list_dup = []
        # More than one collected index means sb1 started a slash-separated set,
        # so each alternative branches every accumulated group.
        is_split = len(current_skiped_idxes) > 1
        for sbg1 in new_sbg_list:
            for skip_idx1 in current_skiped_idxes:
                current_sbg1 = SplitBlockGroup(sbg1)  # make a dup
                sb2 = old_sbg[skip_idx1]
                # When branching alternatives, only letter blocks are kept; the
                # "/" separators themselves are dropped.
                if is_split and (not sb2.is_letter):
                    continue
                current_sbg1.append(sb2)
                # strip blank into 1: collapse any multi-char blank block to a single space
                for sb1 in current_sbg1:
                    if z(sb1) and sb1.is_blank and (len(sb1) > 1):
                        sb1.string = u' '
                new_sbg_list_dup.append(current_sbg1.concat_items())
        # Only replace the accumulated list when this round produced output.
        if len(new_sbg_list_dup):
            new_sbg_list = new_sbg_list_dup

    return new_sbg_list
示例#6
0
    def unnormal_words_count(cls, sentence, inspect=False):
        if inspect:
            print "[unnormal_words_count]", sentence

        c = 0
        for sb1 in SplitBlockGroup.extract(sentence):
            if inspect:
                print [sb1]
            if sb1.is_letter and (not sb1.is_regular):
                c += 1
        return c
    def recursive_match(self, matched_strs__to__phrase, sb1_current, sb1_list_current, key1_dict_current, key1_current, key1_next=None):
        """Walk the sentence blocks against one trie edge (key1_current -> key1_next).

        Completed matches are recorded into matched_strs__to__phrase as
        Phrase -> SplitBlockGroup of the matched span.
        """
        if self.inspect:
            print "#" * 30, "[key1_current]", key1_current, "[key1_next]", key1_next
        key1_dict_next = None if key1_next is None else key1_dict_current[key1_next]
        phrase_current = ([p1 for p1 in key1_dict_current if isinstance(p1, Phrase)] or [None])[0]  # every level has only one phrase.

        def params_with(k1):
            """ encapsulate parameters in the current scope. """
            return [key1_dict_next, key1_next, "placeholder", matched_strs__to__phrase] + \
                   [k1] + \
                   [SplitBlockGroup(sb1_list_current), key1_dict_current, key1_current]

        # `dots` edge matches immediately (wildcard), before consuming blocks.
        if key1_next == dots:
            self.recursive_match_sub(*params_with(sb1_current))

        # n_sb chains to the next block in the sentence.
        while sb1_current.n_sb:
            sb1_current     = sb1_current.n_sb  # direct to next, cause current is appended to `sb1_list_current`
            # sb1_current_str = sb1_current.utf8low()
            sb1_list_current.append(sb1_current)
            if self.inspect:
                print "[sb1_current]", "\"%s\"" % sb1_current
            if self.inspect:
                print "len [sb1_list_current]", len(sb1_list_current)

            # The current trie level already completes a phrase — decide whether
            # to record it at this point in the sentence.
            if phrase_current:
                # For a wildcard level, only end on punctuation (except "'") or
                # at the last block of the sentence; otherwise end anywhere.
                is_ender = ((sb1_current.is_other and (sb1_current.utf8low() not in ["'"])) or (sb1_current.n_sb is None))
                if ((key1_current == dots) and is_ender) or \
                   (key1_current != dots):
                        matched_strs__to__phrase[phrase_current] = SplitBlockGroup(sb1_list_current)  # make a copy
                        if self.inspect:
                            print
                            print "[end candidate_split_block_s loop : sb1_list_current]", sb1_list_current
                            print
                        # No deeper edge to follow — this branch is done.
                        if key1_next is None:
                            break

            if key1_current == dots:
                # Wildcard level: keep swallowing blocks until the next token matches.
                if not z(sb1_current.n_sb):
                    break
                if key1_next:
                    if key1_next == ld.lemmatize(sb1_current.n_sb.utf8low()):
                        self.recursive_match_sub(*params_with(sb1_current))
                        break
                    else:
                        continue
            else:
                # Literal level: recurse when the current block matches the next token.
                if key1_next:
                    if key1_next == ld.lemmatize(sb1_current.utf8low()):
                        self.recursive_match_sub(*params_with(sb1_current))
                    continue
                else:
                    break
示例#8
0
    def is_normal_sentence(cls, sentence, inspect=False):
        """ only check english words. """
        if inspect:
            print "[is_normal_sentence]", sentence

        for sb1 in SplitBlockGroup.extract(sentence):
            if inspect:
                print [sb1]

            #if ("Liu Xing" in sentence) and (sb1.string == 'it'): import pdb; pdb.set_trace()
            # Only irregular letter blocks can disqualify the sentence.
            if sb1.is_letter and (not sb1.is_regular):
                # compact with name. e.g. is, am, of, ...
                if unicode(sb1.string) in ld.two_length_words:
                    continue
                # compact with name. e.g. Liu Xing
                if sb1.p_sb and regexp.upper.match(sb1.string[0]):
                    continue
                # compact with "(p,r,i,g,n,s)"
                if (sb1.p_sb and sb1.p_sb.is_other) and (sb1.n_sb and sb1.n_sb.is_other):
                    continue

                # TODO remove am, is
                # Anything not in this small whitelist is immediately unnormal.
                if sb1.string not in ['s', 'm', 'am', 'is']:
                    return False
                # compact with plural. e.g. ["it is 5         s (英镑)", "pound"]
                if sb1.p_sb and sb1.p_sb.can_fill:
                    continue
                # compact with I'm.
                if sb1.p_sb and sb1.p_sb.string == "'":
                    continue
                # compact with " is "
                if (sb1.p_sb and sb1.p_sb.is_blank) and (sb1.n_sb and sb1.n_sb.is_blank):
                    continue
                # compact with "I am ..."
                if (sb1.p_sb and sb1.p_sb.is_blank) and (sb1.relative_to_current(-2) and sb1.relative_to_current(-2).string == 'I'):
                    continue
                # NOTE(review): if none of the checks above fire, control falls
                # through and the token is ACCEPTED — confirm that a lone
                # 's'/'m'/'am'/'is' with no matching context is meant to pass.
        return True
def lemmatize_sentence(sentence):
    """Return `sentence` with every letter block replaced by its lemma."""
    block_group = SplitBlockGroup.extract(sentence)
    for block1 in block_group:
        if not block1.is_letter:
            continue
        block1.string = ld.lemmatize(block1.string)
    return block_group.concat_items()
示例#10
0
    def article_segment(self, sentence, inspect=False):
        """Repair "chapped" text (words broken by stray spaces) in `sentence`.

        Detects chapped runs, re-segments them into proper words, and splices
        the fixed text back into the block group by character position.
        """
        # Normalize UTF-8 non-breaking spaces to plain spaces.
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        # (pos_begin, pos_end) of a chapped run -> its re-segmented replacement text.
        index_block__to__fixed_words = dict()

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words
            # Iterate to remove continuous upper items
            rejected_items = set([])
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                # Pairwise check needs a successor; stop at the last letter.
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print letters
                # Two uppercase items in a row look like an acronym/heading, not
                # a chapped word — drop both from the group.
                if self.isupper(letter1.string) and self.isupper(letters[idx1 + 1].string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(letters[idx1 + 1])
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            # Glue the fragments together, then let the segmenter re-split them.
            chapped_strs   = "".join(chapped_group1.concat_items().split(" "))
            fixed_words    = " ".join(self.segment(chapped_strs))
            if inspect:
                print fixed_words

            index_block__to__fixed_words[(chapped_group1[0].pos_begin, chapped_group1[-1].pos_end,)] = fixed_words
        if inspect:
            print
            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups()
            print "[index_block__to__fixed_words]", index_block__to__fixed_words
            print "\n" * 5

        # Fill fixed words by their indexes.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                # Already-replaced spans are plain strings — skip them.
                if isinstance(sb2, str):
                    continue
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1   = idx2
            # Replace the matched block span with the single fixed-words string.
            split_block_group[begin_idx1:end_idx1 + 1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print split_block_group
                print

        # Fix blanks
        # Iterate over a snapshot since fix_blanks may mutate the group.
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            # NOTE(review): this stops one element before the end; looks like an
            # off-by-one relative to the usual `== len(...)` guard — confirm.
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print split_block_group.concat_items()
            print
        return split_block_group.concat_items()
示例#11
0
def lemmatize_sentence(sentence):
    """Return `sentence` with each letter block replaced by its lemma (in place on the group)."""
    sbg = SplitBlockGroup.extract(sentence)
    for sb1 in sbg:
        if sb1.is_letter:
            sb1.string = ld.lemmatize(sb1.string)
    # Re-join the (mutated) blocks back into a single string.
    return sbg.concat_items()
示例#12
0
    def article_segment(self, sentence, inspect=False):
        """Repair "chapped" text (words broken by stray spaces) in `sentence`.

        Detects chapped runs, re-segments them into proper words, and splices
        the fixed text back into the block group by character position.
        """
        # Normalize UTF-8 non-breaking spaces to plain spaces.
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        # (pos_begin, pos_end) of a chapped run -> its re-segmented replacement text.
        index_block__to__fixed_words = dict()

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words
            # Iterate to remove continuous upper items
            rejected_items = set([])
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                # Pairwise check needs a successor; stop at the last letter.
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print letters
                # Two uppercase items in a row look like an acronym/heading, not
                # a chapped word — drop both from the group.
                if self.isupper(letter1.string) and self.isupper(
                        letters[idx1 + 1].string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(letters[idx1 + 1])
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            # Glue the fragments together, then let the segmenter re-split them.
            chapped_strs = "".join(chapped_group1.concat_items().split(" "))
            fixed_words = " ".join(self.segment(chapped_strs))
            if inspect:
                print fixed_words

            index_block__to__fixed_words[(
                chapped_group1[0].pos_begin,
                chapped_group1[-1].pos_end,
            )] = fixed_words
        if inspect:
            print
            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups(
            )
            print "[index_block__to__fixed_words]", index_block__to__fixed_words
            print "\n" * 5

        # Fill fixed words by their indexes.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                # Already-replaced spans are plain strings — skip them.
                if isinstance(sb2, str):
                    continue
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1 = idx2
            # Replace the matched block span with the single fixed-words string.
            split_block_group[begin_idx1:end_idx1 +
                              1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print split_block_group
                print

        # Fix blanks
        # Iterate over a snapshot since fix_blanks may mutate the group.
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            # NOTE(review): this stops one element before the end; looks like an
            # off-by-one relative to the usual `== len(...)` guard — confirm.
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print split_block_group.concat_items()
            print
        return split_block_group.concat_items()
 def params_with(k1):
     """ encapsulate parameters in the current scope. """
     # Builds the positional-argument list for recursive_match_sub from the
     # closure variables of the enclosing match routine, substituting `k1` as
     # the current block. NOTE(review): "placeholder" occupies an unused slot —
     # confirm against recursive_match_sub's signature (not visible here).
     return [key1_dict_next, key1_next, "placeholder", matched_strs__to__phrase] + \
            [k1] + \
            [SplitBlockGroup(sb1_list_current), key1_dict_current, key1_current]
示例#14
0
 def test_is_a_SplitBlock(self):
     """extract() should yield items that SplitBlockGroup.z recognizes as SplitBlocks."""
     first_block = SplitBlockGroup.extract("hello")[0]
     self.assertTrue(SplitBlockGroup.z(first_block))
示例#15
0
 def test_maybe_chapped_groups(self):
     """maybe_chapped_groups() should isolate the letter-spaced ("chapped") runs."""
     raw_groups = SplitBlockGroup.extract("A. s un  B.no s e C.fa c e  D.ri c e").maybe_chapped_groups()
     joined_groups = [SplitBlockGroup(g1).concat_items() for g1 in raw_groups]
     self.assertEqual(joined_groups, ['s un  ', 'no s e ', 'fa c e  ', 'ri c e'])