Python TMUtilsMatching.check_stopwords示例

编程语言: Python

命名空间/包名称: TMMatching.TMUtilsMatching

类/类型: TMUtilsMatching

方法/功能: check_stopwords

hotexamples.com的示例: 6

Python TMUtilsMatching.check_stopwords - 共找到6个示例。以下是从开源项目中提取的、评分最高的TMMatching.TMUtilsMatching.TMUtilsMatching.check_stopwords真实Python示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

pre_process(12)

check_stopwords(6)

segment_2_universal(4)

_edit_distance(3)

_ter_score(3)

len_compare(2)

pos_bool(2)

position_distance(2)

strip_tags(2)

transfer_tags(2)

empty_list(1)

reduce_tags(1)

ter_distance(1)

un_match_distance(1)

示例#1

显示文件

文件： TMSplit.py 项目： MittagQI/nectm

    def __init__(self):
        """Configure the Spanish ('ES') splitter.

        Stores the rule names in ``self.order``, the 'ES' stopword data in
        ``self.sw``, the 'ES' universal POS tagger in ``self.ut`` and one
        ``RulesPattern`` per rule name in ``self.rules``.
        """
        # Clause pattern reused as the second argument of every rule below.
        left_clause = '<.*>*<V.*><.*>*'

        # 'compose_sub' and 'last' are the subordinate-clause rules.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']
        self.sw = TMUtilsMatching.check_stopwords('ES')
        self.ut = TMTextProcessors.univ_pos_tagger('ES')

        # The three RulesPattern arguments appear to be (full pattern, left
        # part, right part) -- confirm against the RulesPattern definition.
        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
            left_clause,
            '<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
        )
        comma_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
            left_clause,
            '<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
        )
        compose_sub_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
            left_clause,
            '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
        )
        # NOTE(review): the full pattern below omits CQ while the right-hand
        # pattern includes it -- confirm whether that difference is intended.
        last_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CQUE|CSUBF|CSUBI|CSUBX><.*>*<V.*><.*>*',
            left_clause,
            '<CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
        )

        self.rules = {
            'conjunction': conjunction_rule,
            'comma': comma_rule,
            'compose_sub': compose_sub_rule,
            'last': last_rule,
        }

示例#2

显示文件

文件： TMSplit.py 项目： MittagQI/nectm

    def __init__(self):
        """Configure the English ('EN') splitter.

        Stores the rule names in ``self.order``, the 'EN' stopword data in
        ``self.sw``, the 'EN' universal POS tagger in ``self.ut`` and one
        ``RulesPattern`` per rule name in ``self.rules``.
        """
        # 'last' is the subordinate-clause rule.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # Patterns holding backslash-escaped punctuation are raw strings: in a
        # normal literal those escapes are invalid escape sequences, which
        # Python reports as a warning (and plans to reject). The resulting
        # string values are unchanged.
        self.rules = {
            # RulesPattern arguments: pattern ---> left ---> right
            'conjunction':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<CC><.*>*<V.*><.*>*'),
            'last':
            RulesPattern('<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            # Punctuation or conjunction followed by a wh-word / 'that'.
            'compose_sub':
            RulesPattern(
                r'<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                r'<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            'comma':
            RulesPattern(
                r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }

示例#3

显示文件

  def _only_word_sequence(text, lang):
    """Tokenize *text* and separate stopwords from the remaining tokens.

    Tokens are runs of ASCII letters and digits, CJK ideographs, hiragana
    and katakana characters found in the original sequence *text*.

    Returns a pair ``(only_word, only_st)``:
      * ``only_word`` -- the tokens that are not in the stopword list;
      * ``only_st``   -- one entry per token: the token itself when it is a
        stopword, otherwise the placeholder ``'P'``.

    NOTE(review): when ``check_stopwords(lang)`` yields a falsy value (no
    stopword list for that language) every token is skipped and both lists
    come back empty -- confirm that callers expect this.
    """
    stopwords = TMUtilsMatching.check_stopwords(lang)
    content_tokens, stopword_skeleton = [], []

    for found in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text):
      if not stopwords:
        # No stopword list available: nothing is recorded for this token.
        continue

      token = found.group()
      if token in stopwords:
        stopword_skeleton.append(token)
        continue

      # Non-stopword: keep the word and mark its slot with a placeholder.
      stopword_skeleton.append('P')
      content_tokens.append(token)

    return content_tokens, stopword_skeleton

示例#4

显示文件

文件： TMSplit.py 项目： MittagQI/nectm

    def __init__(self):
        """Configure the French ('FR') splitter.

        Stores the rule names in ``self.order``, the 'FR' stopword data in
        ``self.sw``, the 'FR' universal POS tagger in ``self.ut`` and one
        ``RulesPattern`` per rule name in ``self.rules``.
        """
        # Clause pattern reused as the second argument of both rules.
        left_clause = '<.*>*<V.*><.*>*'

        # Here 'last' is the conjunction rule.
        self.order = ['comma', 'last']
        self.sw = TMUtilsMatching.check_stopwords('FR')
        self.ut = TMTextProcessors.univ_pos_tagger('FR')

        # Conjunction rule (KON tag).
        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<KON><.*>*<V.*><.*>*',
            left_clause,
            '<KON><.*>*<V.*><.*>*',
        )
        # Punctuation-mark rule (PUN / SENT tags).
        punctuation_rule = RulesPattern(
            '<.*>*<V.*><.*>*<PUN|SENT><.*>*<V.*><.*>*',
            left_clause,
            '<PUN|SENT><.*>*<V.*><.*>*',
        )

        self.rules = {
            'last': conjunction_rule,
            'comma': punctuation_rule,
        }

示例#5

显示文件

文件： TMSplit.py 项目： MittagQI/nectm

    def __init__(self):
        """Configure a two-rule English ('EN') splitter.

        Stores the rule names in ``self.order``, the 'EN' stopword data in
        ``self.sw``, the 'EN' universal POS tagger in ``self.ut`` and one
        ``RulesPattern`` per rule name in ``self.rules``.
        """
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # NOTE(review): the key 'comma' is bound to the <CC> pattern and
        # 'last' to the punctuation pattern -- confirm this naming is intended.
        self.rules = {
            # RulesPattern arguments: pattern ---> left ---> right
            'comma':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', '<CC><.*>*<V.*><.*>*'),
            # Raw strings: the backslash-escaped punctuation would otherwise be
            # invalid escape sequences in a normal literal, which Python
            # reports as a warning. The string values are unchanged.
            'last':
            RulesPattern(r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }

示例#6

显示文件

文件： TMSplit.py 项目： MittagQI/nectm

    def __init__(self, lang):
        """Configure a splitter for the language given by *lang*.

        *lang* is passed unchanged to ``TMUtilsMatching.check_stopwords`` and
        ``TMTextProcessors.univ_pos_tagger``; their results are stored in
        ``self.sw`` and ``self.ut``. ``self.order`` holds the rule names and
        ``self.rules`` one ``RulesPattern`` per rule name.
        """
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords(lang)
        self.ut = TMTextProcessors.univ_pos_tagger(lang)

        self.rules = {
            # Conjunction rule.
            # NOTE(review): the full pattern accepts CONJ or SCONJ but the
            # right-hand pattern only CONJ -- confirm this is intended.
            'last':
            RulesPattern(
                '<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',
                '<.*>*<VERB><.*>*',
                '<CONJ><.*>*<VERB><.*>*'),
            # Punctuation rule. Raw strings: the escaped dot would otherwise
            # be an invalid escape sequence in a normal literal, which Python
            # reports as a warning. The string values are unchanged.
            'comma':
            RulesPattern(r'<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',
                         '<.*>*<VERB><.*>*', r'<\.><.*>*<VERB><.*>*'),
        }