def rate_terms(text):
    """Extract the terms of `text` and return them, scored, in a TermDatabase.

    Every extracted term is fed to the compound tracker and the collocator
    (and the gazette tracker when `use_gazette` is set); terms rejected by
    `acceptable_term` are reported to them as skips. Accepted terms that are
    not stop words are cleaned and recorded in the term database. When
    `DESCENDING` is set an occurrence scores
    `log(len(termlist)) - log(position)`, where `position` starts at 1 and
    only advances for recorded terms; otherwise every occurrence scores 1.

    After the scan the optional adjustment passes run, each one gated by its
    module-level flag. An empty term list returns the empty database
    straight away.
    """
    database = TermDatabase()
    compound_tracker = CompoundTracker()
    if use_gazette:
        gazette_tracker = gazette.GazetteTracker()
    position = 1

    extracted = extract_terms(text)
    if not extracted:
        return database

    language = langmodules.get_language_module(extracted)
    collocations = Collocator(language)
    ceiling = math.log(len(extracted))

    for raw_term in extracted:
        if not acceptable_term(raw_term):
            # Rejected terms still have to be reported to each tracker.
            if use_gazette:
                gazette_tracker.skip()
            compound_tracker.skip()
            collocations.skip()
            continue

        stem = language.get_stem(raw_term)
        compound_tracker.track(raw_term, stem)
        collocations.found(raw_term)

        # Stop words are tracked above because compounds may contain them
        # (e.g. "sogn og fjordane"), but they are never scored themselves.
        if lang_is_stop_word(language, raw_term):
            continue

        cleaned = language.clean_term(raw_term)
        entry = database.get_term(cleaned, stem)

        if DESCENDING:
            score = ceiling - math.log(position)
        else:
            score = 1
        entry.found(score)

        if use_gazette:
            gazette_tracker.track(entry, cleaned)
        position += 1

    if use_topic_map:
        topicmap_adjust(database, language, compound_tracker)
    # FIXME: use TM to form compound terms
    # collocations.print_stats()
    compound_tracker.form_compounds(database)

    if use_word_classes:
        wordnet_adjust(database, language)
    if use_concept_net:
        conceptnet_adjust(database)
    if use_gazette:
        gazette_adjust(database)
    if use_frequencies:
        frequency_adjust(database, language)

    return database


def lang_is_stop_word(language, term):
    """Return whether `language` classifies `term` as a stop word."""
    return language.is_stop_word(term)
def text_to_vector(text, blacklist=None, tracker=None, stemming=0):
    """Build a term Vector from `text`.

    The text is split with `chew.extract_terms`, and a language module is
    chosen from the resulting term list. Each term is lower-cased and kept
    only if `chew.acceptable_term` accepts it, the language module does not
    class it as a stop word, and it is not in `blacklist`.

    Parameters:
        text      -- the text to extract terms from.
        blacklist -- container of lower-cased terms to ignore; membership is
                     tested with `in`. None (the default) means no terms
                     are blacklisted.
        tracker   -- optional object; when truthy, its `add_occurrence` is
                     called with every term added to the vector.
        stemming  -- when true, terms are replaced by
                     `lang.get_stem(term)` before being added.

    Returns the populated Vector.
    """
    if blacklist is None:
        # Avoid a shared mutable default argument.
        blacklist = {}

    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)
    vector = Vector()

    for term in termlist:
        # str.lower() replaces the deprecated string.lower() function.
        term = term.lower()

        # Filters are applied in the same order as before so the cheaper
        # checks still short-circuit the later ones.
        if not chew.acceptable_term(term):
            continue
        if lang.is_stop_word(term):
            continue
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if term in blacklist:
            continue

        if stemming:
            stem = lang.get_stem(term)
        else:
            stem = term

        if tracker:
            tracker.add_occurrence(stem)
        vector.add_term(stem)

    return vector
def text_to_vector(text, blacklist=None, tracker=None, stemming=0):
    """Build a term Vector from `text`.

    The text is split with `chew.extract_terms`, and a language module is
    chosen from the resulting term list. Each term is lower-cased and kept
    only if `chew.acceptable_term` accepts it, the language module does not
    class it as a stop word, and it is not in `blacklist`.

    Parameters:
        text      -- the text to extract terms from.
        blacklist -- container of lower-cased terms to ignore; membership is
                     tested with `in`. None (the default) means no terms
                     are blacklisted.
        tracker   -- optional object; when truthy, its `add_occurrence` is
                     called with every term added to the vector.
        stemming  -- when true, terms are replaced by
                     `lang.get_stem(term)` before being added.

    Returns the populated Vector.
    """
    if blacklist is None:
        # Avoid a shared mutable default argument.
        blacklist = {}

    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)
    vector = Vector()

    for term in termlist:
        # str.lower() replaces the deprecated string.lower() function.
        term = term.lower()

        # Filters are applied in the same order as before so the cheaper
        # checks still short-circuit the later ones.
        if not chew.acceptable_term(term):
            continue
        if lang.is_stop_word(term):
            continue
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if term in blacklist:
            continue

        if stemming:
            stem = lang.get_stem(term)
        else:
            stem = term

        if tracker:
            tracker.add_occurrence(stem)
        vector.add_term(stem)

    return vector