def clusterDocument(title, abstract, tokenizer, token2indexMap, tfidf_model,
                    lsi_model, gmm_model):
    """Predict a cluster for one document given its title and abstract.

    The title and abstract are joined with a single space, tokenized with
    ``tokenizer``, and the tokens are counted. The counts are turned into a
    matrix via ``build_csr_matrix`` using ``token2indexMap``, passed through
    ``tfidf_model.transform`` and then ``lsi_model.transform``, and finally
    handed to ``gmm_model.predict``.

    Returns:
        The first element of the prediction returned by ``gmm_model``.
    """
    text = title + " " + abstract
    counts = groupAndCount(tokenize(text, tokenizer))

    # A single-element list is passed, so the matrix describes only this
    # document (presumably one row -- confirm against build_csr_matrix).
    doc_matrix = build_csr_matrix(listOfMaps=[counts],
                                  token2IndexMap=token2indexMap)

    weighted = tfidf_model.transform(doc_matrix)
    reduced = lsi_model.transform(weighted)
    return gmm_model.predict(reduced)[0]
# Example no. 2
# 0
def clusterDocument(title, abstract, tokenizer, token2indexMap, tfidf_model,
                    lsi_model, gmm_model):
    """Assign a single document to a cluster.

    Pipeline, in order: concatenate ``title`` and ``abstract`` (separated by
    one space), tokenize with ``tokenizer``, count the tokens, build a matrix
    from those counts with ``token2indexMap``, apply ``tfidf_model`` and then
    ``lsi_model``, and ask ``gmm_model`` for a prediction.

    Returns:
        Element 0 of ``gmm_model.predict(...)`` for the transformed document.
    """
    token_list = tokenize(title + " " + abstract, tokenizer)
    token_frequencies = groupAndCount(token_list)
    # Only this document's counts are supplied to the matrix builder.
    count_matrix = build_csr_matrix(
        listOfMaps=[token_frequencies],
        token2IndexMap=token2indexMap,
    )
    tfidf_matrix = tfidf_model.transform(count_matrix)
    lsi_matrix = lsi_model.transform(tfidf_matrix)
    predicted = gmm_model.predict(lsi_matrix)
    return predicted[0]
# Load the document matrix (a TF-IDF matrix grouped by document, judging by
# the file name) and pick out the training rows.
mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")
train_mat = mat[train_doc_ind, :]
# Explicit per-index lookup instead of itemgetter(*indices): itemgetter
# returns a bare element (not a tuple) for a single index and raises
# TypeError for an empty index list.
train_labels = tuple(ordered_document_labels[i] for i in train_doc_ind)

# Fit the dimensionality reduction on the training rows only.
svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = tuple(ordered_document_labels[i] for i in test_doc_ind)

# Train a linear SVM on the reduced training rows.
clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)

# eval results: project the test rows with the same SVD, then predict.
predictions = clf.predict(svd.transform(test_mat)).tolist()


def f(pred_label_pair):
    """Classify a (prediction, label) pair as a confusion-matrix category.

    Returns "tp" for (1, 1), "fp" for (1, 0), "fn" for (0, 1) and "tn" for
    every other combination.
    """
    predicted = pred_label_pair[0]
    if predicted == 1:
        actual = pred_label_pair[1]
        if actual == 1:
            return "tp"
        if actual == 0:
            return "fp"
        return "tn"
    if predicted == 0 and pred_label_pair[1] == 1:
        return "fn"
    return "tn"


# Label every (prediction, true label) pair and print the per-category counts.
# Built as a list so the result is not a one-shot iterator on Python 3, and
# print is called with parentheses so the line parses on Python 2 and 3 alike.
cats = [f(pair) for pair in zip(predictions, test_labels)]
print(groupAndCount(cats))
# Load the document matrix (a TF-IDF matrix grouped by document, judging by
# the file name) and split it into training and test rows.
mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")
train_mat = mat[train_doc_ind, :]
# itemgetter(*indices) yields a bare element for exactly one index and fails
# with TypeError for none, so the labels are gathered index by index to get
# a tuple in every case.
train_labels = tuple(ordered_document_labels[i] for i in train_doc_ind)

# The SVD is fitted on training rows only and reused for the test rows.
svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = tuple(ordered_document_labels[i] for i in test_doc_ind)

# Linear SVM trained on the SVD-projected training rows.
clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)

# eval results
predictions = clf.predict(svd.transform(test_mat)).tolist()


def f(pred_label_pair):
    """Map a (prediction, label) pair to "tp", "fp", "fn" or "tn".

    (1, 1) -> "tp", (1, 0) -> "fp", (0, 1) -> "fn"; anything else -> "tn".
    """
    if pred_label_pair[0] == 1:
        truth = pred_label_pair[1]
        return "tp" if truth == 1 else ("fp" if truth == 0 else "tn")
    if pred_label_pair[0] == 0 and pred_label_pair[1] == 1:
        return "fn"
    return "tn"

# Categorise each (prediction, true label) pair and print the counts per
# category. A list comprehension replaces map + a redundant lambda and keeps
# the result a list on Python 3; print(...) with one argument behaves the
# same on Python 2 and Python 3.
cats = [f(pair) for pair in zip(predictions, test_labels)]
print(groupAndCount(cats))