def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data,
                                     vocabulary_src, with_stopwords_removal,
                                     use_chi_features, use_raw_tokens,
                                     num_components):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD
    from scipy import sparse
    import numpy

    max_ngram_size = 2

    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''
    if use_raw_tokens:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()
    else:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'clef_2010_{0}{1}_lemmas{2}_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary,
                                                  max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary,
                                                 max_ngram_size)

    # apply LSA
    #print numpy.max(corpus_train_tfidf_vectors)
    #print numpy.min(corpus_train_tfidf_vectors)
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    #corpus_train_tfidf_vectors = numpy.dot(corpus_train_tfidf_vectors,pca.components_.transpose())
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)

    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results[
        'precision'], 'recall ', results['recall'], 'f1 ', results['f1']
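
# vectorize_corpus is provided elsewhere in this project and is not shown here.
# A minimal sketch of what the CLEF variant appears to do, assuming it wraps
# sklearn's TfidfVectorizer around the given tokenizer and fixed vocabulary and
# returns only the tf-idf matrix (the ng20 variant further below seems to return
# the vectorizer as well). Name and body are assumptions, not the project's code.
def vectorize_corpus_sketch(corpus, tokenizer, vocabulary, max_ngram_size):
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                 vocabulary=vocabulary,
                                 ngram_range=(1, max_ngram_size))
    # sparse document-term matrix of tf-idf weights, restricted to `vocabulary`
    return vectorizer.fit_transform(corpus)
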
def test_lemmatized_wiktionary_google_bigrams(corpus_train_data,
                                              corpus_test_data, vocabulary_src,
                                              with_stopwords_removal,
                                              use_chi_features,
                                              use_raw_tokens):
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from clef_globals import min_df, min_tf, test_set_size, max_labels
    from clef_vocabulary_loader import load_common_vocabulary

    max_ngram_size = 2

    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''
    if use_raw_tokens:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()
    else:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name1 = 'clef_2010_{0}{1}_lemmas{2}_unigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)
    vocabulary_tbl_name2 = 'clef_2010_{0}{1}_lemmas_bigrams{3}_df{4}_tf{5}'.format(
        vocabulary_src, raw_tokens_pattern, chi_features_pattern,
        stopwords_pattern, min_df, min_tf)

    vocabulary_tbl_intersect = 'wiktionary_google_bigrams_vw'
    vocabulary = load_common_vocabulary(vocabulary_tbl_name1,
                                        vocabulary_tbl_name2,
                                        vocabulary_tbl_intersect, 'lemma')

    # generate tfidf vectors
    corpus_train_tfidf_vectors = vectorize_corpus(corpus_train_data['corpus'],
                                                  tokenizer, vocabulary,
                                                  max_ngram_size)
    corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                 tokenizer, vocabulary,
                                                 max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       test_set_size, max_labels)

    print vocabulary_tbl_name1, '^', vocabulary_tbl_name2, '^', vocabulary_tbl_intersect, ' --> ', 'precision ', results[
        'precision'], 'recall ', results['recall'], 'f1 ', results['f1']
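
# A hypothetical driver for the two CLEF experiments above, assuming that the
# corpus dicts expose 'corpus' and 'labels' keys as used inside the functions.
# The loader name load_clef_corpus, the vocabulary_src value 'mesh', and the
# num_components value are illustrative assumptions only.
def run_clef_bigram_experiments():
    corpus_train_data = load_clef_corpus('train')  # hypothetical loader
    corpus_test_data = load_clef_corpus('test')    # hypothetical loader
    test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data,
                                     vocabulary_src='mesh',
                                     with_stopwords_removal=True,
                                     use_chi_features=False,
                                     use_raw_tokens=False,
                                     num_components=200)
    test_lemmatized_wiktionary_google_bigrams(corpus_train_data,
                                              corpus_test_data,
                                              vocabulary_src='mesh',
                                              with_stopwords_removal=True,
                                              use_chi_features=False,
                                              use_raw_tokens=False)
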
Example #3
def build_raw_lemmatized_bigrams_stopwords_vocabulary(corpus, stop_words):
    from ng20_globals import min_tf, min_df
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer

    tokenizer = RawLemmaTokenizer()
    max_ngram_size = 2
    vocabulary = build_vocabulary(corpus, tokenizer, stop_words,
                                  max_ngram_size, min_df, min_tf)
    # save to DB
    tbl_name = 'ng20_raw_lemmas_bigrams_stopwords_df{0}_tf{1}'.format(
        min_df, min_tf)
    save_vocabulary(vocabulary, tbl_name)
    print 'done ' + tbl_name
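
# build_vocabulary and save_vocabulary are defined elsewhere in the project.
# A minimal sketch of the vocabulary-building step, assuming it counts n-grams
# with sklearn's CountVectorizer and keeps terms that satisfy both the min_df
# and min_tf thresholds; the body is an assumption, not the project's code.
def build_vocabulary_sketch(corpus, tokenizer, stop_words, max_ngram_size,
                            min_df, min_tf):
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(tokenizer=tokenizer,
                                 stop_words=stop_words,
                                 ngram_range=(1, max_ngram_size),
                                 min_df=min_df)
    counts = vectorizer.fit_transform(corpus)
    # total corpus frequency of each term (column sums of the count matrix)
    term_totals = counts.sum(axis=0).A1
    return [term for term, col in vectorizer.vocabulary_.items()
            if term_totals[col] >= min_tf]
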
Example #4
def test_lemmatized_bigrams_unigrams(bigrams_src, corpus_train_data,
                                     corpus_test_data, label_names,
                                     with_stopwords_removal, use_chi_features,
                                     use_raw_tokens):
    import numpy as np
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_common_vocabulary_extend_unigrams

    max_ngram_size = 2

    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''
    if use_raw_tokens:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()
    else:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'ng20{0}_lemmas{1}_bigrams{2}_df{3}_tf{4}'.format(
        raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df,
        min_tf)

    if len(bigrams_src) == 1:
        vocabulary_tbl_intersect = '{0}_bigrams'.format(bigrams_src[0])
    else:
        vocabulary_tbl_intersect = '{0}_bigrams_vw'.format(
            '_'.join(bigrams_src))

    vocabulary = load_common_vocabulary_extend_unigrams(
        vocabulary_tbl_name, vocabulary_tbl_intersect, 'lemma')
    print 'done loading vocabulary'

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary,
                                                    max_ngram_size)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       max_labels)

    print_top_feature_names(
        results['features_weights'],
        np.asarray(vectorizer.get_feature_names()),
        vocabulary_tbl_name + '_' + vocabulary_tbl_intersect, label_names)

    print vocabulary_tbl_name, '^', vocabulary_tbl_intersect, '(extended unigrams) --> ', 'accuracy ', results[
        'accuracy']  #print vocabulary_tbl_name,'^',vocabulary_tbl_intersect,'(extended unigrams) --> ','precision ',results['precision'],'recall ',results['recall'],'f1 ',results['f1']
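
# print_top_feature_names is imported from the project's commons and not shown
# here. A minimal sketch of the report it appears to produce, assuming
# features_weights is an (n_labels, n_features) array of per-class classifier
# weights; argument semantics are inferred from the call sites above.
def print_top_feature_names_sketch(features_weights, feature_names,
                                   vocabulary_tbl_name, label_names, top_n=10):
    import numpy as np

    for label_idx, label_name in enumerate(label_names):
        # indices of the top_n highest-weighted features for this label
        top = np.argsort(features_weights[label_idx])[::-1][:top_n]
        print '{0} [{1}]: {2}'.format(vocabulary_tbl_name, label_name,
                                      ', '.join(feature_names[top]))
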
Example #5
def test_lemmatized_bigrams_with_LSA(corpus_train_data, corpus_test_data,
                                     label_names, with_stopwords_removal,
                                     use_chi_features, use_raw_tokens,
                                     num_components):
    import numpy as np
    from commons.lemmatizing_tokenizer import LemmaTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_vocabulary
    from sklearn.decomposition import TruncatedSVD

    max_ngram_size = 2

    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''
    if use_raw_tokens:
        raw_tokens_pattern = '_raw'
        tokenizer = RawLemmaTokenizer()
    else:
        raw_tokens_pattern = ''
        tokenizer = LemmaTokenizer()

    # load vocabulary
    vocabulary_tbl_name = 'ng20{0}_lemmas{1}_bigrams{2}_df{3}_tf{4}'.format(
        raw_tokens_pattern, chi_features_pattern, stopwords_pattern, min_df,
        min_tf)
    vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, max_ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary,
                                                    max_ngram_size)

    # apply LSA
    #print numpy.max(corpus_train_tfidf_vectors)
    #print numpy.min(corpus_train_tfidf_vectors)
    lsa = TruncatedSVD(n_components=num_components)
    lsa.fit(corpus_train_tfidf_vectors)
    #corpus_train_tfidf_vectors = numpy.dot(corpus_train_tfidf_vectors,pca.components_.transpose())
    corpus_train_tfidf_vectors = lsa.transform(corpus_train_tfidf_vectors)
    corpus_test_tfidf_vectors = lsa.transform(corpus_test_tfidf_vectors)

    # classify & evaluate
    results = classify(corpus_train_tfidf_vectors, corpus_train_data['labels'],
                       corpus_test_tfidf_vectors, corpus_test_data['labels'],
                       max_labels)

    print_top_feature_names(results['features_weights'],
                            np.asarray(vectorizer.get_feature_names()),
                            vocabulary_tbl_name, 'lsa_' + label_names)

    print 'LSA ^', vocabulary_tbl_name, ' --> ', 'precision ', results[
        'precision'], 'recall ', results['recall'], 'f1 ', results['f1']
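
# The LSA step above (fit TruncatedSVD on the training tf-idf matrix, then
# transform train and test) can also be expressed as an sklearn pipeline. The
# Normalizer is a common optional refinement for LSA outputs and is NOT part of
# the original code; it is shown only as a hedged variation.
def make_lsa_pipeline(num_components):
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    return make_pipeline(TruncatedSVD(n_components=num_components),
                         Normalizer(copy=False))

# usage, mirroring the fit/transform calls in the functions above:
# lsa = make_lsa_pipeline(num_components)
# corpus_train_lsa = lsa.fit_transform(corpus_train_tfidf_vectors)
# corpus_test_lsa = lsa.transform(corpus_test_tfidf_vectors)
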
Example #6
def run_experiment(corpus_train_data,
                   corpus_test_data,
                   label_names,
                   with_stopwords_removal=False,
                   use_chi_features=False,
                   preprocess_tokens='raw',
                   ngram_size=1,
                   knowledge_set=None):
    import numpy as np
    from commons.lemmatizing_tokenizer import RawTokenizer
    from commons.lemmatizing_tokenizer import RawLemmaTokenizer
    from commons.stemming_tokenizer import RawStemmingTokenizer
    from ng20_globals import max_labels, min_df, min_tf
    from ng20_vocabulary_loader import load_vocabulary
    from ng20_vocabulary_loader import load_common_vocabulary
    from commons.globals import knowledge_dic

    stopwords_pattern = '_stopwords' if with_stopwords_removal else ''
    chi_features_pattern = '_chi' if use_chi_features else ''

    if preprocess_tokens == 'lemmatize':
        raw_tokens_pattern = '_raw_lemmas'
        tokenizer = RawLemmaTokenizer()
    elif preprocess_tokens == 'stem':
        raw_tokens_pattern = '_raw_stems'
        tokenizer = RawStemmingTokenizer()
    else:
        raw_tokens_pattern = '_raw'
        tokenizer = RawTokenizer()

    if ngram_size == 1:
        ngram_pattern = '_unigrams'
    elif ngram_size == 2:
        ngram_pattern = '_bigrams'

    # load vocabulary
    if knowledge_set is not None:
        vocabulary_tbl_name1 = 'ng20{0}{1}_unigrams{2}_df{3}_tf{4}'.format(
            raw_tokens_pattern, chi_features_pattern, stopwords_pattern,
            min_df, min_tf)
        vocabulary_tbl_name2 = 'ng20{0}{1}_bigrams{2}_df{3}_tf{4}'.format(
            raw_tokens_pattern, chi_features_pattern, stopwords_pattern,
            min_df, min_tf)

        vocabulary_tbl_intersect = knowledge_dic[knowledge_set.lower()]
        vocabulary = load_common_vocabulary(vocabulary_tbl_name1,
                                            vocabulary_tbl_name2,
                                            vocabulary_tbl_intersect, 'stem')
        vocabulary_tbl_name = '{0}^{1}^{2}'.format(vocabulary_tbl_name1,
                                                   vocabulary_tbl_name2,
                                                   vocabulary_tbl_intersect)

    else:
        vocabulary_tbl_name = 'ng20{0}{1}{2}{3}_df{4}_tf{5}'.format(
            raw_tokens_pattern, chi_features_pattern, ngram_pattern,
            stopwords_pattern, min_df, min_tf)
        vocabulary = load_vocabulary(vocabulary_tbl_name)

    # generate tfidf vectors
    vectorizer, corpus_train_tfidf_vectors = vectorize_corpus(
        corpus_train_data['corpus'], tokenizer, vocabulary, ngram_size)
    _, corpus_test_tfidf_vectors = vectorize_corpus(corpus_test_data['corpus'],
                                                    tokenizer, vocabulary,
                                                    ngram_size)

    # classify & evaluate
    results = classify_cv(corpus_train_tfidf_vectors,
                          corpus_train_data['labels'],
                          corpus_test_tfidf_vectors,
                          corpus_test_data['labels'], max_labels)

    print_top_feature_names(results['features_weights'],
                            np.asarray(vectorizer.get_feature_names()),
                            vocabulary_tbl_name, label_names)

    print vocabulary_tbl_name, ' --> ', 'accuracy', results[
        'accuracy']  #print vocabulary_tbl_name,' --> ','precision ',results['precision'],'recall ',results['recall'],'f1 ',results['f1']
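
# A hypothetical invocation of run_experiment, assuming the 20-newsgroups
# corpus dicts and label_names come from loaders elsewhere in the project.
# The loader name load_ng20_corpus and the knowledge_set value 'wiktionary'
# (expected to be a key of commons.globals.knowledge_dic) are illustrative
# assumptions.
def run_ng20_experiments():
    corpus_train_data, corpus_test_data, label_names = load_ng20_corpus()
    # plain stemmed bigrams, no external knowledge
    run_experiment(corpus_train_data, corpus_test_data, label_names,
                   with_stopwords_removal=True,
                   preprocess_tokens='stem',
                   ngram_size=2)
    # unigrams + bigrams restricted to a knowledge-base bigram table
    run_experiment(corpus_train_data, corpus_test_data, label_names,
                   with_stopwords_removal=True,
                   preprocess_tokens='stem',
                   ngram_size=2,
                   knowledge_set='wiktionary')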