Пример #1
0
    def calc_terms(self):
        """Compute terms by delegating to ``Util.calc_local_regexp``.

        The pattern comes from ``Util.get_the_regexp(self.kwargs)``; the
        ``"string"`` and ``"local_k"`` entries of ``self.kwargs`` are passed
        through unchanged.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        text = self.kwargs["string"]
        local_k = self.kwargs["local_k"]
        return Util.calc_local_regexp(text, pattern, local_k)
Пример #2
0
    def calc_terms(self):
        """Remove ignored terms from the stored string, then extract terms.

        Every line of the file at ``self.kwargs["path_ignored_terms"]`` is
        stripped and lower-cased, and each occurrence of that text is removed
        from ``self.kwargs["string"]`` (the stored string is overwritten).
        Each ignored term is also echoed to stdout, as before.

        Returns:
            The result of ``Util.calc_regexp`` applied to the cleaned string
            and the pattern from ``Util.get_the_regexp(self.kwargs)``.
        """
        # ``with`` closes the file even if a replacement raises.
        with open(self.kwargs["path_ignored_terms"]) as f_ignored_terms:
            for line in f_ignored_terms:
                ignored_term = line.strip().lower()
                self.kwargs["string"] = self.kwargs["string"].replace(ignored_term, "")
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is valid syntax on Python 3.
                print(ignored_term)

        the_regexp = Util.get_the_regexp(self.kwargs)

        return Util.calc_regexp(self.kwargs["string"], the_regexp)
Пример #3
0
 def calc_terms(self):
     """Build the ambiguous-word set once, otherwise tokenize against it.

     When ``self.kwargs["boolBuildSetGlobal"]`` is truthy, the value of
     ``Util.calc_ambiguous_words_set`` for the stored string is saved under
     ``"setAmbiguous"``, the flag is reset to ``False``, ``"mode"`` is set to
     ``EnumModes.MODE_CORPUS_POS`` and the stored set is returned.
     Otherwise the result of ``Util.calc_ambiguous_words`` for the stored
     string and set is returned. Debug banners are printed on every call.
     """
     hash_banner = "#####################"
     plus_banner = "++++++++++++++++++++++#####################"

     print(hash_banner)
     print(self.name)
     print(self.kwargs["boolBuildSetGlobal"])
     print(hash_banner)

     # Guard clause: the set already exists, just look the words up.
     if not self.kwargs["boolBuildSetGlobal"]:
         return Util.calc_ambiguous_words(self.kwargs["string"],
                                          self.kwargs["setAmbiguous"])

     print(plus_banner)
     print(self.name)
     print(self.kwargs["boolBuildSetGlobal"])
     print(plus_banner)

     self.kwargs["setAmbiguous"] = Util.calc_ambiguous_words_set(
         self.kwargs["string"])
     self.kwargs["boolBuildSetGlobal"] = False
     self.kwargs["mode"] = EnumModes.MODE_CORPUS_POS
     return self.kwargs["setAmbiguous"]
Пример #4
0
    def calc_terms(self):
        """Collect the first capture group of every matching token.

        ``self.kwargs["string"]`` is split with ``Util.calc_split`` and each
        token is matched (anchored at its start) against a pattern. The
        default pattern ``".+_(.+)"`` is greedy, so group 1 is normally the
        text after the last underscore. If ``self.kwargs`` has a ``"regexp"``
        key, the pattern from ``Util.get_the_regexp(self.kwargs)`` is used
        instead; it must define at least one capture group.

        Returns:
            list: group 1 of each matching token, in token order. Tokens that
            do not match are skipped silently.
        """
        the_regexp = ".+_(.+)"
        if "regexp" in self.kwargs:
            the_regexp = Util.get_the_regexp(self.kwargs)

        tags = []
        for token in Util.calc_split(self.kwargs["string"]):
            match = re.match(the_regexp, token)
            # Identity test is the correct way to detect "no match".
            if match is not None:
                tags.append(match.group(1))

        return tags
Пример #5
0
 def calc_terms(self):
     """Return the ambiguous-word set on first use, tokens afterwards.

     If ``self.kwargs["boolBuildSetGlobal"]`` is truthy the set from
     ``Util.calc_ambiguous_words_set`` is stored as ``"setAmbiguous"``, the
     flag becomes ``False``, ``"mode"`` becomes ``EnumModes.MODE_CORPUS_POS``
     and the stored set is returned. Otherwise the stored string and set are
     handed to ``Util.calc_ambiguous_words`` and its result is returned.
     Debug banners are written to stdout in both cases.
     """
     separator = "#####################"
     build_separator = "++++++++++++++++++++++#####################"

     print(separator)
     print(self.name)
     print(self.kwargs["boolBuildSetGlobal"])
     print(separator)

     must_build = self.kwargs["boolBuildSetGlobal"]
     if must_build:
         print(build_separator)
         print(self.name)
         print(self.kwargs["boolBuildSetGlobal"])
         print(build_separator)

         self.kwargs["setAmbiguous"] = Util.calc_ambiguous_words_set(
             self.kwargs['string'])
         self.kwargs["boolBuildSetGlobal"] = False
         self.kwargs["mode"] = EnumModes.MODE_CORPUS_POS
         return self.kwargs["setAmbiguous"]

     return Util.calc_ambiguous_words(self.kwargs['string'],
                                      self.kwargs['setAmbiguous'])
Пример #6
0
    def calc_terms(self, kwargs, f_src):
        """Compute SFM terms for ``f_src`` from the parallel SFM corpus.

        The path of ``kwargs["corpus"]`` is expected to lie under
        ``/home/aplm/nltk_data/corpora/c50/``; the remainder of that path
        selects the matching corpus under ``c50_term_SFM_23/``. While the
        terms are computed ``kwargs["corpus"]`` temporarily points at that
        SFM corpus; the original corpus is restored before returning, even
        if the computation fails.

        Args:
            kwargs: dict holding at least a ``"corpus"`` entry whose
                ``root.path`` is the corpus directory.
            f_src: file id passed to the corpus ``raw`` call.

        Returns:
            The result of ``Util.calc_SFM`` on the raw text of ``f_src``.

        Raises:
            ValueError: if the corpus path is not under the expected root.
        """
        # save the original corpus
        corpus_temp = kwargs["corpus"]

        # NOTE(review): the corpus root is hard-coded to one machine's layout.
        groups = re.match(r'/home/aplm/nltk_data/corpora/c50/(.+)', corpus_temp.root.path)
        if groups is None:
            raise ValueError("corpus path %r is not under "
                             "/home/aplm/nltk_data/corpora/c50/"
                             % (corpus_temp.root.path,))

        kwargs["corpus"] = LazyCorpusLoader("c50_term_SFM_23/" + groups.group(1), CategorizedPlaintextCorpusReader, r'.+/.+', cat_pattern=r'(.+)/.+')
        try:
            sfm_terms = Util.calc_SFM(kwargs["corpus"].raw(fileids=[f_src]))
        finally:
            # restore the original corpus, also when calc_SFM/raw raises
            kwargs["corpus"] = corpus_temp
        return sfm_terms
Пример #7
0
 def calc_terms(self):
     """Delegate to ``Util.calc_sent_nostopwords_lenght``.

     Passes ``self.kwargs["string"]`` together with the fixed pattern
     ``RegExps.W_H_C`` and returns the helper's result unchanged.
     """
     text = self.kwargs["string"]
     return Util.calc_sent_nostopwords_lenght(text, RegExps.W_H_C)
Пример #8
0
 def calc_terms(self):
     """Delegate to ``Util.calc_token_lenght``.

     Forwards the ``"string"``, ``"regexp"`` and ``"template"`` entries of
     ``self.kwargs`` and returns the helper's result unchanged.
     """
     text = self.kwargs["string"]
     pattern = self.kwargs["regexp"]
     # Original author's note suggests a template such as "word{len:%s}".
     template = self.kwargs["template"]
     return Util.calc_token_lenght(text, pattern, template)
Пример #9
0
    def calc_terms(self):
        """Delegate to ``Util.calc_ngrams_g``.

        Uses the pattern from ``Util.get_the_regexp(self.kwargs)`` plus the
        ``"string"`` and ``"nlen"`` entries of ``self.kwargs``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        text = self.kwargs["string"]
        nlen = self.kwargs["nlen"]
        return Util.calc_ngrams_g(text, pattern, nlen)
Пример #10
0
 def calc_terms(self):
     """Delegate to ``Util.calc_local_ngrams``.

     Forwards the ``"string"``, ``"nlen"`` and ``"local_k"`` entries of
     ``self.kwargs`` and returns the helper's result unchanged.
     """
     text = self.kwargs["string"]
     nlen = self.kwargs["nlen"]
     local_k = self.kwargs["local_k"]
     return Util.calc_local_ngrams(text, nlen, local_k)
Пример #11
0
 def calc_collocation(self):
     """Delegate to ``Util.calc_trigram_collocation``.

     Passes ``self.kwargs["string"]``, the fixed pattern
     ``RegExps.STOPW_PUNTC`` and the ``"boolStem"`` and
     ``"setCollocations"`` entries of ``self.kwargs``.
     """
     text = self.kwargs["string"]
     stem = self.kwargs["boolStem"]
     collocations = self.kwargs["setCollocations"]
     return Util.calc_trigram_collocation(text, RegExps.STOPW_PUNTC,
                                          stem, collocations)
Пример #12
0
 def calc_terms(self):
     """Split the stored string and build n-grams from the pieces.

     The output of ``Util.calc_split`` for ``self.kwargs["string"]`` is
     handed to ``self.calc_ngrams_g`` along with ``self.kwargs["nlen"]``.
     """
     pieces = Util.calc_split(self.kwargs["string"])
     nlen = self.kwargs["nlen"]
     return self.calc_ngrams_g(pieces, nlen)
Пример #13
0
 def calc_terms(self):
     """Return ``Util.calc_split`` applied to ``self.kwargs["string"]``."""
     text = self.kwargs["string"]
     return Util.calc_split(text)
Пример #14
0
    def calc_terms(self):
        """Delegate to ``Util.calc_word_pair_opcion``.

        Uses the pattern from ``Util.get_the_regexp(self.kwargs)`` plus the
        ``"string"`` and ``"opcion"`` entries of ``self.kwargs``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        text = self.kwargs["string"]
        opcion = self.kwargs["opcion"]
        return Util.calc_word_pair_opcion(text, pattern, opcion)
Пример #15
0
    def calc_terms(self):
        """Delegate to ``Util.calc_markers``.

        Uses the pattern from ``Util.get_the_regexp(self.kwargs)`` plus the
        ``"string"`` and ``"type_marker"`` entries of ``self.kwargs``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        text = self.kwargs["string"]
        marker_kind = self.kwargs["type_marker"]
        return Util.calc_markers(text, pattern, marker_kind)
Пример #16
0
 def calc_terms(self):
     """Delegate to ``Util.calc_trigrams``.

     Passes ``self.kwargs["string"]``, the fixed pattern
     ``RegExps.STYLE_POS`` and ``self.kwargs["boolStem"]``.
     """
     text = self.kwargs["string"]
     stem = self.kwargs["boolStem"]
     return Util.calc_trigrams(text, RegExps.STYLE_POS, stem)
Пример #17
0
 def calc_collocation(self):
     """Return trigram collocations via ``Util.calc_trigram_collocation``.

     Arguments, in order: ``self.kwargs["string"]``,
     ``RegExps.STOPW_PUNTC``, ``self.kwargs["boolStem"]`` and
     ``self.kwargs["setCollocations"]``.
     """
     call_args = (
         self.kwargs["string"],
         RegExps.STOPW_PUNTC,
         self.kwargs["boolStem"],
         self.kwargs["setCollocations"],
     )
     return Util.calc_trigram_collocation(*call_args)
Пример #18
0
 def calc_terms(self):
     """Return the result of ``Util.calc_local_ngrams``.

     Arguments, in order: the ``"string"``, ``"nlen"`` and ``"local_k"``
     entries of ``self.kwargs``.
     """
     call_args = (
         self.kwargs["string"],
         self.kwargs["nlen"],
         self.kwargs["local_k"],
     )
     return Util.calc_local_ngrams(*call_args)
Пример #19
0
 def calc_collocation_set(self):
     """Delegate to ``Util.calc_trigram_collocation_set``.

     Passes ``self.kwargs["string"]``, the fixed pattern ``RegExps.STOPW``
     and ``self.kwargs["boolStem"]``.
     """
     text = self.kwargs["string"]
     stem = self.kwargs["boolStem"]
     return Util.calc_trigram_collocation_set(text, RegExps.STOPW, stem)
Пример #20
0
    def calc_terms(self):
        """Delegate to ``Util.calc_trigrams``.

        Passes ``self.kwargs["string"]`` and the pattern obtained from
        ``Util.get_the_regexp(self.kwargs)``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        text = self.kwargs["string"]
        return Util.calc_trigrams(text, pattern)
Пример #21
0
 def calc_terms(self):
     """Return ``Util.calc_lazy_POS`` applied to ``self.kwargs["string"]``."""
     text = self.kwargs["string"]
     return Util.calc_lazy_POS(text)
Пример #22
0
    def calc_terms(self):
        """Return n-grams computed by ``Util.calc_ngrams_g``.

        Arguments, in order: ``self.kwargs["string"]``, the pattern from
        ``Util.get_the_regexp(self.kwargs)`` and ``self.kwargs["nlen"]``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        call_args = (self.kwargs["string"], pattern, self.kwargs["nlen"])
        return Util.calc_ngrams_g(*call_args)
Пример #23
0
 def calc_terms(self):
     """Split ``self.kwargs["string"]`` with ``Util.calc_split``."""
     terms = Util.calc_split(self.kwargs["string"])
     return terms
Пример #24
0
 def calc_collocation(self):
     """Delegate to ``Util.calc_trigram_collocation``.

     Forwards the ``"string"``, ``"regexp"``, ``"boolStem"`` and
     ``"setCollocations"`` entries of ``self.kwargs``, in that order.
     """
     text = self.kwargs["string"]
     pattern = self.kwargs["regexp"]
     stem = self.kwargs["boolStem"]
     collocations = self.kwargs["setCollocations"]
     return Util.calc_trigram_collocation(text, pattern, stem, collocations)
Пример #25
0
    def calc_terms(self):
        """Return trigrams computed by ``Util.calc_trigrams``.

        The helper receives ``self.kwargs["string"]`` and the pattern from
        ``Util.get_the_regexp(self.kwargs)``.
        """
        pattern = Util.get_the_regexp(self.kwargs)
        trigrams = Util.calc_trigrams(self.kwargs["string"], pattern)
        return trigrams
Пример #26
0
 def calc_terms(self):
     """Return the result of ``Util.calc_token_lenght``.

     Arguments, in order: the ``"string"``, ``"regexp"`` and ``"template"``
     entries of ``self.kwargs``.
     """
     call_args = (
         self.kwargs["string"],
         self.kwargs["regexp"],
         # Original author's note suggests a template like "word{len:%s}".
         self.kwargs["template"],
     )
     return Util.calc_token_lenght(*call_args)
Пример #27
0
 def calc_collocation(self):
     """Return trigram collocations via ``Util.calc_trigram_collocation``.

     Arguments, in order: the ``"string"``, ``"regexp"``, ``"boolStem"`` and
     ``"setCollocations"`` entries of ``self.kwargs``.
     """
     call_args = (
         self.kwargs["string"],
         self.kwargs["regexp"],
         self.kwargs["boolStem"],
         self.kwargs["setCollocations"],
     )
     return Util.calc_trigram_collocation(*call_args)
Пример #28
0
 def calc_terms(self):
     """Delegate to ``Util.calc_sent_lenght``.

     Forwards the ``"string"``, ``"regexp"`` and ``"template"`` entries of
     ``self.kwargs`` and returns the helper's result unchanged.
     """
     text = self.kwargs["string"]
     pattern = self.kwargs["regexp"]
     # Original author's note suggests a template like "sentToken{len:%s}".
     template = self.kwargs["template"]
     return Util.calc_sent_lenght(text, pattern, template)
Пример #29
0
 def calc_terms(self):
     """Return the result of ``Util.calc_sent_lenght``.

     Arguments, in order: the ``"string"``, ``"regexp"`` and ``"template"``
     entries of ``self.kwargs``.
     """
     call_args = (
         self.kwargs["string"],
         self.kwargs["regexp"],
         # Original author's note suggests a template like "sentToken{len:%s}".
         self.kwargs["template"],
     )
     return Util.calc_sent_lenght(*call_args)
Пример #30
0
 def calc_terms(self):
     """Return the result of ``Util.calc_sent_nostopwords_lenght``.

     The helper receives ``self.kwargs["string"]`` and the fixed pattern
     ``RegExps.W_H_C``.
     """
     result = Util.calc_sent_nostopwords_lenght(self.kwargs["string"],
                                                RegExps.W_H_C)
     return result
Пример #31
0
 def calc_terms(self):
     """Delegate to ``Util.calc_bigrams``.

     Passes ``self.kwargs["string"]``, the fixed pattern ``RegExps.STOPW``
     and ``self.kwargs["boolStem"]``.
     """
     text = self.kwargs["string"]
     stem = self.kwargs["boolStem"]
     return Util.calc_bigrams(text, RegExps.STOPW, stem)
Пример #32
0
 def calc_terms(self):
     """Return bigrams computed by ``Util.calc_bigrams``.

     Arguments, in order: ``self.kwargs["string"]``, ``RegExps.STOPW`` and
     ``self.kwargs["boolStem"]``.
     """
     call_args = (self.kwargs["string"], RegExps.STOPW,
                  self.kwargs["boolStem"])
     return Util.calc_bigrams(*call_args)
Пример #33
0
 def calc_collocation_set(self):
     """Return the result of ``Util.calc_trigram_collocation_set``.

     Arguments, in order: ``self.kwargs["string"]``, ``RegExps.STOPW`` and
     ``self.kwargs["boolStem"]``.
     """
     call_args = (self.kwargs["string"], RegExps.STOPW,
                  self.kwargs["boolStem"])
     return Util.calc_trigram_collocation_set(*call_args)
Пример #34
0
 def calc_terms(self):
     """Return trigrams computed by ``Util.calc_trigrams``.

     Arguments, in order: ``self.kwargs["string"]``, ``RegExps.STYLE_POS``
     and ``self.kwargs["boolStem"]``.
     """
     call_args = (self.kwargs["string"], RegExps.STYLE_POS,
                  self.kwargs["boolStem"])
     return Util.calc_trigrams(*call_args)
Пример #35
0
 def calc_terms(self):
     """Pass ``self.kwargs["string"]`` to ``Util.calc_lazy_POS``.

     Returns the helper's result unchanged.
     """
     return Util.calc_lazy_POS(self.kwargs["string"])
Пример #36
0
 def calc_terms(self):
     """Delegate to ``Util.calc_POS_FREELING``.

     Forwards the ``"string"``, ``"pos"`` and ``"regexp"`` entries of
     ``self.kwargs`` and returns the helper's result unchanged.
     """
     text = self.kwargs["string"]
     pos = self.kwargs["pos"]
     pattern = self.kwargs["regexp"]
     return Util.calc_POS_FREELING(text, pos, pattern)