def sub_split(phrase1):
    """
    Split u"I/We/They... always/usually/often/sometimes..." into sub strings,
    e.g. "I... always...".
    """
    old_sbg = SplitBlockGroup.extract(phrase1)
    new_sbg_list = [SplitBlockGroup()]
    current_skiped_idxes = []
    for idx1, sb1 in enumerate(old_sbg):
        if idx1 in current_skiped_idxes:
            continue
        current_skiped_idxes = [idx1]
        check_slash(sb1, idx1, current_skiped_idxes)

        new_sbg_list_dup = []
        is_split = len(current_skiped_idxes) > 1
        for sbg1 in new_sbg_list:
            for skip_idx1 in current_skiped_idxes:
                current_sbg1 = SplitBlockGroup(sbg1)  # make a duplicate
                sb2 = old_sbg[skip_idx1]
                if is_split and (not sb2.is_letter):
                    continue
                current_sbg1.append(sb2)
                # Collapse multi-character blanks into a single space.
                for sb3 in current_sbg1:
                    if z(sb3) and sb3.is_blank and (len(sb3) > 1):
                        sb3.string = u' '
                new_sbg_list_dup.append(current_sbg1.concat_items())
        if len(new_sbg_list_dup):
            new_sbg_list = new_sbg_list_dup
    return new_sbg_list

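# A minimal standalone sketch of the expansion sub_split aims at, assuming the
# intent is to expand slash-separated alternatives into every combination.
# expand_slashes and its regex tokenization are hypothetical stand-ins, not
# part of SplitBlockGroup or check_slash.
import itertools
import re

def expand_slashes(phrase):
    # u"I/We always/usually play." -> [[u"I", u"We"], [u" "],
    #                                  [u"always", u"usually"], [u" play."]]
    parts = [p.split(u"/") for p in re.split(ur"(\S+/\S+)", phrase) if p]
    return [u"".join(choice) for choice in itertools.product(*parts)]

# expand_slashes(u"I/We always/usually play.")
# -> [u'I always play.', u'I usually play.',
#     u'We always play.', u'We usually play.']
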
def generate_SplitBlock_list(self, env):
    # 1. generate SplitBlock list
    env.split_block_group = SplitBlockGroup.extract(env.sentence)
    if env.inspect:
        print "[split_block_group]", env.split_block_group
        print

def __init__(self, phrasal_collocation_s):
    # Build the two data structures self.first_strs_dict and self.tree.
    phrase_to_strs_list = dict()
    for phrasal_collocation1 in phrasal_collocation_s:
        assert isinstance(phrasal_collocation1, unicode)
        phrasal_collocation2 = Phrase(phrasal_collocation1)
        for pc2 in PhrasalRecognizer.split(phrasal_collocation1):
            split_block_group = SplitBlockGroup.extract(str(pc2).lower())
            # Lowercase both words and spaces; skip fake sb1 (no position).
            strs_list = [sb1.utf8low() for sb1 in split_block_group
                         if sb1.pos_begin is not None]
            if phrasal_collocation2 not in phrase_to_strs_list:
                phrase_to_strs_list[phrasal_collocation2] = []
            phrase_to_strs_list[phrasal_collocation2].append(strs_list)

    self.first_strs_dict = {i2[0]: True
                            for i1 in phrase_to_strs_list.values()
                            for i2 in i1}

    self.tree = dict()
    for phrase1 in phrase_to_strs_list:
        for strs1 in phrase_to_strs_list[phrase1]:
            current_dict = self.tree
            for idx2, s2 in enumerate(strs1):
                if not s2:
                    continue  # ignore spaces; the search side ignores them too.
                # Every node is a dict.
                if s2 not in current_dict:
                    current_dict[s2] = dict()
                current_dict = current_dict[s2]
                # Mark an end node with the phrase itself.
                if idx2 == (len(strs1) - 1):
                    current_dict[phrase1] = True
    self.inspect = False

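# A minimal sketch of the nested-dict trie built above, with plain word lists
# standing in for Phrase/SplitBlockGroup. The terminal key being the phrase
# itself mirrors the `current_dict[phrase1] = True` ender above.
def build_phrase_tree(phrases):
    tree = dict()
    for phrase in phrases:
        node = tree
        words = phrase.split()
        for idx, word in enumerate(words):
            node = node.setdefault(word, dict())
            if idx == len(words) - 1:
                node[phrase] = True  # mark the end of a complete phrase
    return tree

# build_phrase_tree([u"look forward to", u"look up"])
# -> {u'look': {u'forward': {u'to': {u'look forward to': True}},
#               u'up': {u'look up': True}}}
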
@classmethod
def unnormal_words_count(cls, sentence, inspect=False):
    if inspect:
        print "[unnormal_words_count]", sentence
    c = 0
    for sb1 in SplitBlockGroup.extract(sentence):
        if inspect:
            print [sb1]
        if sb1.is_letter and (not sb1.is_regular):
            c += 1
    return c

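# A standalone sketch of the same idea: count tokens that fail a "regular"
# check. A toy vocabulary set stands in for SplitBlock.is_regular, whose real
# test is not shown in this module.
def toy_unnormal_count(sentence, vocab):
    return sum(1 for w in sentence.split()
               if w.isalpha() and w.lower() not in vocab)

# toy_unnormal_count(u"I like appples", set([u'i', u'like', u'apples']))  # -> 1
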
def process(self, sentence, inspect=False, replace=False):
    self.sentence = sentence  # TODO remove me
    inspect = self.inspect or inspect
    if self.inspect:
        print "#|" * 80
        print "processing \"%s\"" % self.sentence

    split_block_group = SplitBlockGroup.extract(self.sentence)

    # Generate candidate_split_block_s; the first string must be letters.
    candidate_split_block_s = []
    for letter1 in split_block_group.letters():
        if ld.lemmatize(letter1.utf8low()) in self.first_strs_dict:
            candidate_split_block_s.append(letter1)
    if dots in self.tree:
        candidate_split_block_s.append(dots)  # TODO

    # Generate letter1_sb_list.
    matched_strs__to__phrase = dict()
    for letter1 in candidate_split_block_s:  # iterate each matched letter1
        # letter1 is appended here, before the recursion below starts.
        sb1_list_current = SplitBlockGroup([letter1])
        if letter1 == dots:
            key1_current = dots
            sb1_current = split_block_group[0]
        else:
            key1_current = ld.lemmatize(letter1.utf8low())
            sb1_current = letter1
        key1_dict_current = self.tree[key1_current]
        for key1_next in key1_dict_current:
            self.recursive_match(matched_strs__to__phrase, sb1_current,
                                 sb1_list_current, key1_dict_current,
                                 key1_current, key1_next)

    letter1_sb_list = sorted(matched_strs__to__phrase.values(),
                             key=lambda i1: -len(i1))
    if inspect:
        print
        print "[letter1_sb_list]", letter1_sb_list

    if replace:
        self.sentence = self.generate_replaced_sentence(letter1_sb_list,
                                                        split_block_group)
    return [self.sentence, sorted(matched_strs__to__phrase.keys())]

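# A standalone sketch of the trie walk that process() and recursive_match
# perform: scan the sentence word by word through the nested-dict tree (see
# build_phrase_tree above). Lemmatization, SplitBlock positions, and the dots
# wildcard are omitted; only the matching idea is shown.
def toy_match_phrases(words, tree):
    matched = []
    for start in range(len(words)):
        node = tree
        for word in words[start:]:
            if word not in node:
                break
            node = node[word]
            # Phrase keys map to True; word keys map to deeper dicts.
            matched.extend(k for k, v in node.items() if v is True)
    return matched

# tree = build_phrase_tree([u"look forward to", u"look up"])
# toy_match_phrases(u"we look forward to it".split(), tree)
# -> [u'look forward to']
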
@classmethod
def is_normal_sentence(cls, sentence, inspect=False):
    """ Only check English words. """
    if inspect:
        print "[is_normal_sentence]", sentence
    for sb1 in SplitBlockGroup.extract(sentence):
        if inspect:
            print [sb1]
        if sb1.is_letter and (not sb1.is_regular):
            # Tolerate common two-letter words, e.g. is, am, of, ...
            if unicode(sb1.string) in ld.two_length_words:
                continue
            # Tolerate names, e.g. Liu Xing
            if sb1.p_sb and regexp.upper.match(sb1.string[0]):
                continue
            # Tolerate letter lists like "(p,r,i,g,n,s)"
            if (sb1.p_sb and sb1.p_sb.is_other) and (sb1.n_sb and sb1.n_sb.is_other):
                continue
            # TODO remove am, is
            if sb1.string not in ['s', 'm', 'am', 'is']:
                return False
            # Tolerate split plurals, e.g. ["it is 5 s (pounds)", "pound"]
            if sb1.p_sb and sb1.p_sb.can_fill:
                continue
            # Tolerate contractions, e.g. I'm.
            if sb1.p_sb and sb1.p_sb.string == "'":
                continue
            # Tolerate " is "
            if (sb1.p_sb and sb1.p_sb.is_blank) and (sb1.n_sb and sb1.n_sb.is_blank):
                continue
            # Tolerate "I am ..."
            if (sb1.p_sb and sb1.p_sb.is_blank) and \
                    (sb1.relative_to_current(-2) and
                     sb1.relative_to_current(-2).string == 'I'):
                continue
    return True

def lemmatize_sentence(sentence):
    sbg = SplitBlockGroup.extract(sentence)
    for sb1 in sbg:
        if sb1.is_letter:
            sb1.string = ld.lemmatize(sb1.string)
    return sbg.concat_items()

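# A standalone sketch of the same transformation with a toy lemma table
# standing in for ld.lemmatize; the real lemmatizer is not shown here.
TOY_LEMMAS = {u'cats': u'cat', u'ran': u'run'}

def toy_lemmatize_sentence(sentence):
    return u' '.join(TOY_LEMMAS.get(w, w) for w in sentence.split())

# toy_lemmatize_sentence(u"cats ran away")  # -> u'cat run away'
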
def article_segment(self, sentence, inspect=False):
    sentence = re.sub("\xc2\xa0", " ", sentence)  # replace non-breaking spaces
    split_block_group = SplitBlockGroup.extract(sentence)
    index_block__to__fixed_words = dict()

    # Generate fixed words and their indexes.
    for chapped_group1 in split_block_group.maybe_chapped_groups():
        chapped_group1 = SplitBlockGroup(chapped_group1)

        # Reject upper-case words: iterate to remove runs of consecutive
        # upper-case items.
        rejected_items = set([])
        letters = chapped_group1.letters()
        for idx1, letter1 in enumerate(letters):
            if (idx1 + 1) == len(letters):
                break
            if inspect:
                print letters
            if self.isupper(letter1.string) and \
                    self.isupper(letters[idx1 + 1].string, 1):
                rejected_items.add(letter1)
                rejected_items.add(letters[idx1 + 1])
        for rejected_item1 in rejected_items:
            chapped_group1.remove(rejected_item1)

        chapped_strs = "".join(chapped_group1.concat_items().split(" "))
        fixed_words = " ".join(self.segment(chapped_strs))
        if inspect:
            print fixed_words
        index_block__to__fixed_words[(chapped_group1[0].pos_begin,
                                      chapped_group1[-1].pos_end,)] = fixed_words

    if inspect:
        print
        print "[split_block_group.maybe_chapped_groups()]", \
            split_block_group.maybe_chapped_groups()
        print "[index_block__to__fixed_words]", index_block__to__fixed_words
        print "\n" * 5

    # Fill fixed words back in by their indexes.
    for begin_end_pos in index_block__to__fixed_words:
        begin_idx1, end_idx1 = None, None
        for idx2, sb2 in enumerate(split_block_group):
            if isinstance(sb2, str):
                continue
            if begin_end_pos[0] == sb2.pos_begin:
                begin_idx1 = idx2
            if begin_end_pos[1] == sb2.pos_end:
                end_idx1 = idx2
        split_block_group[begin_idx1:end_idx1 + 1] = \
            index_block__to__fixed_words[begin_end_pos]
        if inspect:
            print split_block_group
            print

    # Fix blanks.
    for idx1, item1 in enumerate(split_block_group[:]):
        if not isinstance(item1, str):
            continue
        if (idx1 + 1) == len(split_block_group) - 1:
            break
        self.fix_blanks(split_block_group, item1, idx1, 1)
        self.fix_blanks(split_block_group, item1, idx1, -1)

    if inspect:
        print split_block_group.concat_items()
        print
    return split_block_group.concat_items()

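# A standalone sketch of the repair article_segment performs on "chapped"
# text such as "no s e" (see test_maybe_chapped_groups below): strip the stray
# spaces, then re-segment against a dictionary. A greedy longest-match stands
# in for self.segment, whose real algorithm is not shown here.
def toy_segment(chapped, vocab):
    text = u"".join(chapped.split(u" "))  # "no s e" -> "nose"
    words, pos = [], 0
    while pos < len(text):
        # Take the longest dictionary word starting at pos, else one char.
        for end in range(len(text), pos, -1):
            if text[pos:end] in vocab or end == pos + 1:
                words.append(text[pos:end])
                pos = end
                break
    return u" ".join(words)

# toy_segment(u"no s e", set([u'nose']))  # -> u'nose'
# toy_segment(u"fa c e", set([u'face']))  # -> u'face'
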
def test_is_a_SplitBlock(self):
    sb1 = SplitBlockGroup.extract("hello")[0]
    self.assertTrue(SplitBlockGroup.z(sb1))

def test_maybe_chapped_groups(self):
    groups = SplitBlockGroup.extract(
        "A. s un B.no s e C.fa c e D.ri c e").maybe_chapped_groups()
    groups = [SplitBlockGroup(g1).concat_items() for g1 in groups]
    self.assertEqual(groups, ['s un ', 'no s e ', 'fa c e ', 'ri c e'])