Example #1
import numpy as np

# HashingEmbedder, word2vecEmbedder, KNNClassifier and the data helpers used
# below are imported from the surrounding project.


def match_codes():
    """Match the full dataset in parallel with a customizable embedding method."""

    # 0.398657046863 hard, 0.275397642306 soft

    working_directory = './data/'

    data_codes, data_descriptions = get_data_to_match('slim')

    official_codes, official_descriptions = get_official_data()

    level = 1
    # Other embedders tried: word2vecEmbedder(), and HashingEmbedder with
    # analyzer='char' and ngram_range=(3, 5) or (2, 3).
    model = HashingEmbedder(
        level=level, analyzer='char_wb', ngram_range=(4, 5), norm='l2')
    model.embed_data(data_descriptions)

    print('loaded and embedded data')

    # test_nNN(model, data_descriptions, data_codes)

    use_section = False
    if use_section:
        official_code_labels = get_section_codes(model.official_codes)
        true_data_codes = get_section_codes(data_codes)
    else:
        official_code_labels = coarsen_codes(model.official_codes)
        true_data_codes = coarsen_codes(data_codes)

    nNN = 4
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    # Alternative: classifier.predict_with_edit_dist(
    #     model.data_embeddings, data_descriptions, model.official_descriptions)
    pred_codes = classifier.predict(model.data_embeddings, pbar=True)

    errors = pred_codes - true_data_codes
    frac_correct = 1.0 * np.sum(errors == 0) / errors.shape[0]
    print('Correctly predicted', frac_correct, 'of top-level codes')
    # plot_confusion_matrix(true_data_codes, pred_codes)

    # Repeat with word2vec embeddings for comparison.
    model = word2vecEmbedder()
    model.embed_data(data_descriptions)
    classifier = KNNClassifier(n_neighbors=nNN)
    classifier.fit(model.official_embeddings, official_code_labels)
    # Alternative: classifier.predict_with_edit_dist(
    #     model.data_embeddings, data_descriptions, model.official_descriptions)
    pred_codes = classifier.predict(model.data_embeddings)
    errors = pred_codes - true_data_codes
    frac_correct = 1.0 * np.sum(errors == 0) / errors.shape[0]
    print('Correctly predicted', frac_correct,
          'of top-level codes with word2vec embeddings')
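
The function above leans on a project-local KNNClassifier. For reference only, here is a minimal sketch of such a wrapper built on scikit-learn's KNeighborsClassifier; the class name and the omission of the pbar and predict_with_edit_dist extras are assumptions, not the project's actual API.

from sklearn.neighbors import KNeighborsClassifier


class SimpleKNNClassifier:
    """Hypothetical stand-in for the project's KNNClassifier (sketch only)."""

    def __init__(self, n_neighbors=4):
        # Euclidean distance over the embedding vectors, as in the example.
        self._knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    def fit(self, official_embeddings, official_code_labels):
        # official_embeddings: (n_official, d) array; labels: coarse codes.
        self._knn.fit(official_embeddings, official_code_labels)
        return self

    def predict(self, data_embeddings):
        # Majority label among the n_neighbors nearest official embeddings.
        return self._knn.predict(data_embeddings)
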
Example #2
import numpy as np
from time import time


def test_nNN(model, data_descriptions, data_codes, nNNmin=2, nNNmax=10):
    """Sweep the number of nearest neighbors and report top-level accuracy."""
    for nNN in range(nNNmin, nNNmax + 1):
        classifier = KNNClassifier(n_neighbors=nNN)

        t1 = time()
        classifier.fit(model.official_embeddings,
                       get_section_codes(model.official_codes))
        # classifier.fit(model.official_embeddings, coarsen_codes(model.official_codes))
        pred_section_codes = classifier.predict_with_edit_dist(
            model.data_embeddings, data_descriptions,
            model.official_descriptions)
        # true_coarse_codes = coarsen_codes(data_codes) # .reshape((-1,1))
        # errors = pred_codes - true_coarse_codes
        true_section_codes = get_section_codes(data_codes)  # .reshape((-1,1))
        errors = pred_section_codes - true_section_codes

        frac_correct = 1.0 * np.sum(errors == 0) / errors.shape[0]
        print('------------------------------')
        print('nNN:', nNN)
        print('Correctly predicted', frac_correct,
              'of top-level codes with edit-distance kNN')
        print('Took', time() - t1, 'seconds')

        t1 = time()
        # pred_codes = classifier.predict(model.data_embeddings)
        # errors = pred_codes - true_coarse_codes
        pred_section_codes = classifier.predict(model.data_embeddings)
        errors = pred_section_codes - true_section_codes
        frac_correct = 1.0 * np.sum(errors == 0) / errors.shape[0]
        print('Correctly predicted', frac_correct,
              'of top-level codes with Euclidean kNN')
        print('Took', time() - t1, 'seconds')
        print('------------------------------')
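
A usage sketch mirroring Example #1 (get_data_to_match, HashingEmbedder and the argument values are taken from that example and the surrounding project, so treat them as assumptions):

data_codes, data_descriptions = get_data_to_match('slim')
model = HashingEmbedder(level=1, analyzer='char_wb', ngram_range=(4, 5), norm='l2')
model.embed_data(data_descriptions)

# Sweep k = 2..10 and print accuracy and timing for both kNN variants.
test_nNN(model, data_descriptions, data_codes, nNNmin=2, nNNmax=10)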