def __init__(self):
    """Set up the Spanish ('ES') stop words, POS tagger and split rules.

    Populates four attributes:

    * ``self.order`` -- the rule names, as a list.
    * ``self.sw``    -- whatever ``TMUtilsMatching.check_stopwords('ES')``
      returns.
    * ``self.ut``    -- whatever ``TMTextProcessors.univ_pos_tagger('ES')``
      returns.
    * ``self.rules`` -- one ``RulesPattern`` per rule name.
    """
    self.sw = TMUtilsMatching.check_stopwords('ES')
    self.ut = TMTextProcessors.univ_pos_tagger('ES')

    # Second argument of every rule: any tags, one tag starting with "V",
    # any tags.
    verb_clause = '<.*>*<V.*><.*>*'

    # RulesPattern takes three positional tag patterns:
    # whole pattern, left part, right part.
    by_conjunction = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<CC|CCNEG|CCAD>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<CC|CCNEG|CCAD>'
         '<.*>*<V.*><.*>*'),
    )
    by_comma = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<CM|COLON|DASH|SEMICOLON>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<CM|COLON|DASH|SEMICOLON>'
         '<.*>*<V.*><.*>*'),
    )
    # Subordinate clause introduced by a connector tag followed by a
    # subordinating tag.
    by_compose_sub = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL>'
         '<CQUE|CSUBF|CSUBI|CSUBX|CQ>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL>'
         '<CQUE|CSUBF|CSUBI|CSUBX|CQ>'
         '<.*>*<V.*><.*>*'),
    )
    # Plain subordinate clause.
    # NOTE(review): the whole pattern omits CQ while the right part accepts
    # it -- confirm whether that asymmetry is intended.
    by_subordinate = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<CQUE|CSUBF|CSUBI|CSUBX>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<CQUE|CSUBF|CSUBI|CSUBX|CQ>'
         '<.*>*<V.*><.*>*'),
    )

    self.rules = {
        'conjunction': by_conjunction,
        'comma': by_comma,
        'compose_sub': by_compose_sub,
        'last': by_subordinate,
    }
    # 'last' is the subordinate rule.
    self.order = ['comma', 'conjunction', 'compose_sub', 'last']
def __init__(self):
    """Set up the English ('EN') stop words, POS tagger and split rules.

    Populates four attributes:

    * ``self.order`` -- the rule names, as a list ('last' is the
      subordinate rule).
    * ``self.sw``    -- whatever ``TMUtilsMatching.check_stopwords('EN')``
      returns.
    * ``self.ut``    -- whatever ``TMTextProcessors.univ_pos_tagger('EN')``
      returns.
    * ``self.rules`` -- one ``RulesPattern`` per rule name; its three
      positional arguments are the whole pattern, the left part and the
      right part.
    """
    self.order = ['comma', 'conjunction', 'compose_sub', 'last']
    self.sw = TMUtilsMatching.check_stopwords('EN')
    self.ut = TMTextProcessors.univ_pos_tagger('EN')

    # Raw strings are required here: "\,", "\;", "\:" and "\-" are not valid
    # Python escape sequences, so non-raw literals trigger a
    # DeprecationWarning/SyntaxWarning. The resulting string values are
    # unchanged (the backslashes were already kept verbatim).
    self.rules = {
        # pattern ---> left ---> right
        'conjunction': RulesPattern(
            r'<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<CC><.*>*<V.*><.*>*',
        ),
        'last': RulesPattern(
            r'<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
        ),
        # Connector tag immediately followed by a wh-word / "that" tag.
        'compose_sub': RulesPattern(
            r'<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
        ),
        'comma': RulesPattern(
            r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<\,|\;|\:|\-><.*>*<V.*><.*>*',
        ),
    }
def _only_word_sequence(text, lang):
    """Extract the word/number tokens of *text* and a stop-word mask.

    Tokens are the maximal runs matched by the character class below
    (ASCII letters and digits plus the listed CJK/kana ranges).

    Returns a ``(tokens, mask)`` tuple of lists. ``mask`` has one entry per
    token: the token itself when it is found in the stop-word collection
    returned by ``TMUtilsMatching.check_stopwords(lang)``, otherwise the
    placeholder ``'P'``. When that collection is falsy (no stop words for
    the language) ``mask`` is an empty list.
    """
    stop_words = TMUtilsMatching.check_stopwords(lang)

    tokens = [
        found.group()
        for found in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text)
    ]

    if not stop_words:
        # No stop-word list for this language: the mask stays empty.
        return tokens, []

    mask = [token if token in stop_words else 'P' for token in tokens]
    return tokens, mask
def __init__(self):
    """Set up the French ('FR') stop words, POS tagger and split rules.

    Populates four attributes:

    * ``self.order`` -- the rule names, as a list ('last' is the
      conjunction rule).
    * ``self.sw``    -- whatever ``TMUtilsMatching.check_stopwords('FR')``
      returns.
    * ``self.ut``    -- whatever ``TMTextProcessors.univ_pos_tagger('FR')``
      returns.
    * ``self.rules`` -- one ``RulesPattern`` per rule name.
    """
    self.sw = TMUtilsMatching.check_stopwords('FR')
    self.ut = TMTextProcessors.univ_pos_tagger('FR')

    # Second argument of both rules: any tags, one tag starting with "V",
    # any tags.
    verb_clause = '<.*>*<V.*><.*>*'

    # RulesPattern takes three positional tag patterns:
    # whole pattern, left part, right part.
    by_conjunction = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<KON>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<KON>'
         '<.*>*<V.*><.*>*'),
    )
    # Punctuation marks.
    by_punctuation = RulesPattern(
        ('<.*>*<V.*><.*>*'
         '<PUN|SENT>'
         '<.*>*<V.*><.*>*'),
        verb_clause,
        ('<PUN|SENT>'
         '<.*>*<V.*><.*>*'),
    )

    self.rules = {
        'last': by_conjunction,
        'comma': by_punctuation,
    }
    self.order = ['comma', 'last']
def __init__(self):
    """Set up the English ('EN') stop words, POS tagger and two split rules.

    Populates four attributes:

    * ``self.order`` -- the rule names, as a list.
    * ``self.sw``    -- whatever ``TMUtilsMatching.check_stopwords('EN')``
      returns.
    * ``self.ut``    -- whatever ``TMTextProcessors.univ_pos_tagger('EN')``
      returns.
    * ``self.rules`` -- one ``RulesPattern`` per rule name; its three
      positional arguments are the whole pattern, the left part and the
      right part.
    """
    self.order = ['comma', 'last']
    self.sw = TMUtilsMatching.check_stopwords('EN')
    self.ut = TMTextProcessors.univ_pos_tagger('EN')

    # Raw strings are required here: "\,", "\;", "\:" and "\-" are not valid
    # Python escape sequences, so non-raw literals trigger a
    # DeprecationWarning/SyntaxWarning. The resulting string values are
    # unchanged (the backslashes were already kept verbatim).
    #
    # NOTE(review): the 'comma' key holds the <CC> pattern and the 'last' key
    # holds the punctuation pattern, so the key names look swapped relative
    # to the tags they match. Behaviour is kept as-is -- confirm intent.
    self.rules = {
        # pattern ---> left ---> right
        'comma': RulesPattern(
            r'<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<CC><.*>*<V.*><.*>*',
        ),
        'last': RulesPattern(
            r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
            r'<.*>*<V.*><.*>*',
            r'<\,|\;|\:|\-><.*>*<V.*><.*>*',
        ),
    }
def __init__(self, lang):
    """Set up stop words, POS tagger and split rules for language *lang*.

    The rules use the tags ``VERB``, ``CONJ``, ``SCONJ`` and ``.``
    (presumably a universal POS tag set -- confirm against the tagger).

    Args:
        lang: Language code, passed unchanged to
            ``TMUtilsMatching.check_stopwords`` and
            ``TMTextProcessors.univ_pos_tagger``.

    Populates four attributes:

    * ``self.order`` -- the rule names, as a list.
    * ``self.sw``    -- the result of ``check_stopwords(lang)``.
    * ``self.ut``    -- the result of ``univ_pos_tagger(lang)``.
    * ``self.rules`` -- one ``RulesPattern`` per rule name; its three
      positional arguments are the whole pattern, the left part and the
      right part.
    """
    self.order = ['comma', 'last']
    self.sw = TMUtilsMatching.check_stopwords(lang)
    self.ut = TMTextProcessors.univ_pos_tagger(lang)

    # Raw strings are required here: "\." is not a valid Python escape
    # sequence, so a non-raw literal triggers a
    # DeprecationWarning/SyntaxWarning. The resulting string values are
    # unchanged (the backslash was already kept verbatim).
    self.rules = {
        # Conjunctions.
        # NOTE(review): the whole pattern accepts CONJ or SCONJ, but the
        # right part only accepts CONJ -- confirm whether SCONJ is missing.
        'last': RulesPattern(
            r'<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',
            r'<.*>*<VERB><.*>*',
            r'<CONJ><.*>*<VERB><.*>*',
        ),
        # Punctuation: matches the literal "." tag.
        'comma': RulesPattern(
            r'<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',
            r'<.*>*<VERB><.*>*',
            r'<\.><.*>*<VERB><.*>*',
        ),
    }