Exemplo n.º 1
0
def rate_terms(text):
    """Rate the terms extracted from ``text``.

    Every acceptable term that is not a stop word is looked up in a fresh
    ``TermDatabase`` and recorded with ``found()``.  When the module-level
    ``DESCENDING`` flag is set, the n-th recorded term is weighted with
    ``log(len(termlist)) - log(n)``; otherwise every occurrence is
    weighted 1.  After the scan, compounds are formed and the adjustment
    passes selected by the module-level ``use_*`` flags are applied.

    Returns the ``TermDatabase``; it is returned untouched when
    ``extract_terms`` yields nothing.
    """
    database = TermDatabase()
    compound_tracker = CompoundTracker()
    if use_gazette:
        gazette_tracker = gazette.GazetteTracker()
    position = 1  # 1-based count of the terms actually recorded so far

    extracted = extract_terms(text)
    if not extracted:
        return database
    language = langmodules.get_language_module(extracted)
    collocator = Collocator(language)

    top_score = math.log(len(extracted))
    for raw_term in extracted:
        if not acceptable_term(raw_term):
            # Unacceptable terms are reported to each tracker through
            # skip() and otherwise ignored.
            if use_gazette:
                gazette_tracker.skip()
            compound_tracker.skip()
            collocator.skip()
            continue

        stem = language.get_stem(raw_term)
        compound_tracker.track(raw_term, stem)
        collocator.found(raw_term)
        if language.is_stop_word(raw_term):
            # Stop words are still handed to the compound tracker above
            # (e.g. "sogn og fjordane") but are not rated themselves.
            continue

        cleaned = language.clean_term(raw_term)
        entry = database.get_term(cleaned, stem)
        weight = top_score - math.log(position) if DESCENDING else 1
        entry.found(weight)

        if use_gazette:
            gazette_tracker.track(entry, cleaned)

        position += 1

    if use_topic_map:
        topicmap_adjust(database, language, compound_tracker)
    # FIXME: use TM to form compound terms

    compound_tracker.form_compounds(database)

    # Optional adjustment passes, applied in this fixed order.
    if use_word_classes:
        wordnet_adjust(database, language)
    if use_concept_net:
        conceptnet_adjust(database)
    if use_gazette:
        gazette_adjust(database)
    if use_frequencies:
        frequency_adjust(database, language)

    return database
Exemplo n.º 2
0
def text_to_vector(text, blacklist={}, tracker=None, stemming=0):
    """Build a ``Vector`` of the terms found in ``text``.

    Each term from ``chew.extract_terms(text)`` is lowercased and added to
    the vector unless it is rejected by ``chew.acceptable_term``, is a stop
    word for the detected language module, or is present in ``blacklist``.

    Args:
        text: Input handed to ``chew.extract_terms``.
        blacklist: Container of terms to exclude.  Lookups use the
            lowercased term.  Any object supporting ``in`` works (dict,
            set, ...).  The default dict is never mutated here.
        tracker: Optional object; when truthy, its ``add_occurrence`` is
            called with every entry added to the vector.
        stemming: When truthy, the language module's stem of the term is
            stored instead of the term itself.

    Returns:
        The populated ``Vector``.
    """
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)

    vector = Vector()
    for term in termlist:
        # str.lower() and ``in`` replace string.lower() and dict.has_key(),
        # which no longer exist in Python 3.
        term = term.lower()
        if (not chew.acceptable_term(term)
                or lang.is_stop_word(term)
                or term in blacklist):
            continue

        stem = lang.get_stem(term) if stemming else term
        if tracker:
            tracker.add_occurrence(stem)
        vector.add_term(stem)

    return vector
Exemplo n.º 3
0
def text_to_vector(text, blacklist={}, tracker=None, stemming=0):
    """Convert ``text`` into a ``Vector`` of its usable terms.

    Terms come from ``chew.extract_terms(text)`` and are lowercased before
    filtering.  A term is dropped when ``chew.acceptable_term`` rejects it,
    when the detected language module reports it as a stop word, or when
    it appears in ``blacklist``.

    Args:
        text: Input handed to ``chew.extract_terms``.
        blacklist: Container of terms to exclude, matched against the
            lowercased term.  Anything supporting ``in`` is accepted
            (dict, set, ...).  The default dict is only read, never
            modified.
        tracker: Optional object; when truthy, ``tracker.add_occurrence``
            is called for each entry added to the vector.
        stemming: When truthy, ``lang.get_stem(term)`` is stored in place
            of the term.

    Returns:
        The populated ``Vector``.
    """
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)

    vector = Vector()
    for term in termlist:
        # Use the str method and the ``in`` operator: string.lower() and
        # dict.has_key() were removed in Python 3.
        term = term.lower()
        if (not chew.acceptable_term(term)
                or lang.is_stop_word(term)
                or term in blacklist):
            continue

        stem = lang.get_stem(term) if stemming else term
        if tracker:
            tracker.add_occurrence(stem)
        vector.add_term(stem)

    return vector