def run(training, validation, k, config):
    """Big-document BM25 baseline: evaluate motif retrieval on one fold.

    Every training text sharing a motif is concatenated into one "big
    document" per motif; validation stories are then ranked against those
    big documents with BM25 and scored with ranking metrics.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    motif_texts = defaultdict(list)
    motif_docs = defaultdict(list)
    # Concatenate all training tokens per motif (skipping the DUMMY label).
    for doc_id, (source, motifs, text) in enumerate(training):
        for motif in motifs:
            if motif == 'DUMMY':
                continue
            motif_docs[motif].append(doc_id)
            motif_texts[motif].extend(text)
    # Index each big document under its motif label.
    indexer = Indexer()
    for motif, big_text in motif_texts.items():
        indexer.add(big_text, motif)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    for source, motifs, text in validation:
        nDocs += 1
        # BM25 scores for every motif, ranked best-first.
        ranked = sorted(
            indexer.predict_proba(text,
                                  config.getfloat('bm25', 'k1'),
                                  config.getfloat('bm25', 'b')),
            key=lambda pair: pair[1], reverse=True)
        preds = [motif for motif, _ in ranked]
        refs = set(motifs)
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def run(training, validation, k, config=None):
    """One-vs-rest SGD (log-loss) baseline over tf-idf token features.

    Fits an elastic-net SGD classifier per label on the training split and
    ranks the label set for each validation document by predicted
    probability.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    class_index = Index()
    traindocs, train_X, train_y = zip(*load_data(training, class_index))
    testdocs, test_X, test_y = zip(*load_data(validation, class_index))
    clf = SGDClassifier(alpha=.000001, loss='log', n_iter=50,
                        penalty='elasticnet')
    classifier = Pipeline([
        # Documents are pre-tokenized lists, so the analyzer is identity.
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0,
                                       analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm='l2')),
        ('clf', OneVsRestClassifier(clf, n_jobs=-1))])
    classifier.fit(train_X, train_y)
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # Rank label indices by predicted probability, best first.
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i], reverse=True)
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def run(training, validation, k, config):
    """Run the big-document BM25 indexer on one train/validation fold.

    Training texts are merged per motif into a single pseudo-document,
    indexed, and used to rank motifs for each validation text.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    texts_by_motifs = defaultdict(list)
    motifs_in_docs = defaultdict(list)
    # Build one concatenated "big document" per motif label.
    for idx, (src, motif_set, tokens) in enumerate(training):
        for m in motif_set:
            if m != 'DUMMY':
                motifs_in_docs[m].append(idx)
                texts_by_motifs[m].extend(tokens)
    indexer = Indexer()
    for label, bigdoc in texts_by_motifs.items():
        indexer.add(bigdoc, label)
    nDocs = 0
    isError = 0
    OneError = 0
    margins = []
    AP = []
    for src, motif_set, tokens in validation:
        nDocs += 1
        scored = indexer.predict_proba(tokens,
                                       config.getfloat('bm25', 'k1'),
                                       config.getfloat('bm25', 'b'))
        # Highest-scoring motifs first.
        ranking = sorted(scored, key=lambda entry: entry[1], reverse=True)
        preds = [label for label, _ in ranking]
        refs = set(motif_set)
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def run(training, validation, k, config):
    """Config-driven classification experiment on one fold.

    Selects a classifier via config ('NB', 'KNN', 'SVC', 'dtree', or the
    SGD fallback), optionally training on per-label "big documents", and
    evaluates ranked label predictions on the validation split.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    norm = config.get('tfidf', 'norm')
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')
    bigdoc = config.getboolean('NB', 'bigdoc')
    clf = config.get('system', 'system')
    if clf == 'NB':
        clf = MultinomialNB(alpha=config.getfloat('NB', 'alpha'))
        if not bigdoc:
            clf = OneVsRestClassifier(clf, n_jobs=-1)
    elif clf == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=10, weights='distance')
        if not bigdoc:
            clf = OneVsRestClassifier(clf)
    elif clf == 'SVC':
        # NOTE(review): LinearSVC has no predict_proba; the prediction loop
        # below calls predict_proba, so this branch likely fails at predict
        # time — confirm whether 'SVC' is actually exercised.
        clf = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
        if not bigdoc:
            clf = OneVsRestClassifier(clf)
    elif clf == 'dtree':
        clf = DecisionTreeClassifier()
    else:
        clf = OneVsRestClassifier(
            SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                          loss=config.get('sgd', 'loss'),
                          n_iter=config.getint('sgd', 'iterations'),
                          penalty=config.get('sgd', 'penalty')),
            n_jobs=-1)
    classifier = Pipeline([
        # Bug fix: max_df=1 (int) means "drop terms occurring in more than
        # one document", which discards nearly the whole vocabulary. Every
        # sibling experiment uses the ratio 1.0 (keep all terms).
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0,
                                       analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm=norm, smooth_idf=smooth_idf)),
        ('clf', clf)])
    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        # Map gold labels into the big-document class index.
        test_y = [set(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))
    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # Rank label indices by predicted probability, best first.
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i], reverse=True)
        refs = test_y[j]
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def run(training, validation, k, config):
    """Naive-Bayes / SGD classification experiment on one fold.

    'NB' with bigdoc trains MultinomialNB on per-label big documents;
    'NB' without bigdoc trains one-vs-rest BernoulliNB; anything else
    falls back to an SGD classifier. Ranked label predictions on the
    validation split are scored with ranking metrics.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    norm = config.get('tfidf', 'norm')
    # Bug fix: ConfigParser accessors are lowercase (getboolean, getfloat,
    # getint); the original getBoolean/getFloat/getInt raise AttributeError.
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')
    bigdoc = False
    clf = config.get('system', 'system')
    if clf == 'NB':
        alpha = config.getfloat('NB', 'alpha')
        if config.getboolean('NB', 'bigdoc'):
            bigdoc = True
            clf = MultinomialNB(alpha=alpha)
        else:
            clf = OneVsRestClassifier(BernoulliNB(alpha=alpha))
    else:
        clf = SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                            loss=config.get('sgd', 'loss'),
                            n_iter=config.getint('sgd', 'iterations'),
                            penalty=config.get('sgd', 'penalty'))
    classifier = Pipeline([
        # Documents are pre-tokenized lists, so the analyzer is identity.
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0,
                                       analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm=norm, smooth_idf=smooth_idf)),
        ('clf', clf)])
    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        # Map gold labels into the big-document class index.
        test_y = [tuple(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))
    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # Rank label indices by predicted probability, best first.
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i], reverse=True)
        # Bug fix: 'labelings' was undefined; the gold labels are in test_y
        # (matching the sibling experiment that uses refs = test_y[j]).
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP
def run(training, validation, k, config):
    """Labeled-LDA experiment for fold *k* via the Stanford TMT jar.

    Writes the training/validation splits to temporary CSV files, shells
    out to tmt-0.4.0.jar to train an L-LDA model and infer topic
    distributions for the validation set, then scores the ranked topics
    against the gold motifs.

    Returns a tuple (isError, oneError, nDocs, margins, AP).
    """
    ground_truth = {}
    ROOTDIR = config.get('filepaths', 'corpus')
    # L-LDA hyperparameters are passed through to the jar as strings.
    alpha, beta = config.get('llda', 'alpha'), config.get('llda', 'beta')
    iterations = config.get('llda', 'iterations')
    # Dump the training split as CSV: source id, space-joined motifs
    # (plus a DUMMY label), space-joined tokens.
    with open(ROOTDIR + 'training-%s.tmp' % k, 'w') as training_out:
        writer = csv.writer(training_out, quoting=csv.QUOTE_MINIMAL)
        for (source, motifs, text) in training:
            motifs = r' '.join(motifs) + ' DUMMY'
            writer.writerow([source, motifs, ' '.join(text)])
    # Dump the validation split, remembering the gold motifs per source id.
    with open(ROOTDIR + 'testing-%s.tmp' % k, 'w') as testing_out:
        writer = csv.writer(testing_out, quoting=csv.QUOTE_MINIMAL)
        for (source, motifs, text) in validation:
            ground_truth[source] = motifs
            writer.writerow([source, r' '.join(motifs), ' '.join(text)])
    # Train L-LDA, discarding the jar's console output.
    # NOTE(review): '-Xmx2000mb' looks malformed — the JVM expects
    # '-Xmx2000m'; confirm the command actually runs.
    with open(os.devnull, 'w') as null:
        subprocess.call('java -Xmx2000mb -jar tmt-0.4.0.jar '
                        'llda-train.scala %s %s %s %s'
                        % (ROOTDIR + 'training-%s.tmp' % k,
                           alpha, beta, iterations),
                        stdout=null, stderr=null, shell=True)
    # The training script writes the model directory path to this file.
    modelpath = open(ROOTDIR + 'training-%s.tmp.config' % k).read().strip()
    # Perform inference on the held-out split using the trained model.
    # NOTE(review): 'null' is opened here but stdout/stderr are wired to
    # sys.stdout/sys.stderr, so the jar's output is NOT suppressed —
    # possibly a leftover from debugging; confirm intent.
    with open(os.devnull, 'w') as null:
        subprocess.call('java -Xmx2000mb -jar tmt-0.4.0.jar '
                        'llda-test.scala %s %s'
                        % (modelpath, (ROOTDIR + 'testing-%s.tmp' % k)),
                        stdout=sys.stdout, stderr=sys.stderr, shell=True)
    # Evaluation starts here.
    isError, oneError, nDocs = 0, 0, 0
    AP, margins = [], []
    # Topic index written by TMT at the final iteration's snapshot dir.
    label_file = '/%05d/label-index.txt' % config.getint('llda',
                                                         'iterations')
    topicIndex = [topic.strip() for topic in open(modelpath + label_file)]
    # NOTE(review): 'distributuions' appears to mirror a filename typo in
    # the TMT tool's own output — do not "fix" without checking on disk.
    reader = csv.reader(open(
        modelpath + '/testing-%s.tmp-document-topic-distributuions.csv'
        % k))
    for row in reader:
        nDocs += 1
        # row = [doc id, score for topic 0, score for topic 1, ...]
        idnumber, topics = row[0], [float(score) for score in row[1:]]
        # Pair scores with topic names and rank best-first.
        topics = sorted([(topicIndex[i], score)
                         for i, score in enumerate(topics)],
                        key=lambda i: i[1], reverse=True)
        preds = [topic for topic, _ in topics if topic != 'DUMMY']
        refs = ground_truth[idnumber]
        ap = average_precision(preds, refs)
        isError += is_error(ap)
        oneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
        AP.append(ap)
    return isError, oneError, nDocs, margins, AP
def run(training, validation, k, config):
    """Naive-Bayes / SGD classification experiment on one fold.

    Duplicate of the sibling NB/SGD experiment: 'NB' with bigdoc trains
    MultinomialNB on per-label big documents; 'NB' without bigdoc trains
    one-vs-rest BernoulliNB; otherwise an SGD classifier is used.

    Returns a tuple (isError, OneError, nDocs, margins, AP).
    """
    norm = config.get('tfidf', 'norm')
    # Bug fix: ConfigParser accessors are lowercase (getboolean, getfloat,
    # getint); the original getBoolean/getFloat/getInt raise AttributeError.
    smooth_idf = config.getboolean('tfidf', 'smooth_idf')
    bigdoc = False
    clf = config.get('system', 'system')
    if clf == 'NB':
        alpha = config.getfloat('NB', 'alpha')
        if config.getboolean('NB', 'bigdoc'):
            bigdoc = True
            clf = MultinomialNB(alpha=alpha)
        else:
            clf = OneVsRestClassifier(BernoulliNB(alpha=alpha))
    else:
        clf = SGDClassifier(alpha=config.getfloat('sgd', 'alpha'),
                            loss=config.get('sgd', 'loss'),
                            n_iter=config.getint('sgd', 'iterations'),
                            penalty=config.get('sgd', 'penalty'))
    classifier = Pipeline([
        # Documents are pre-tokenized lists, so the analyzer is identity.
        ('vectorizer', CountVectorizer(min_df=1, max_df=1.0,
                                       analyzer=lambda t: t)),
        ('tfidf', TfidfTransformer(norm=norm, smooth_idf=smooth_idf)),
        ('clf', clf)])
    if bigdoc:
        (train_y, train_X), class_index = construct_bigdocuments(training)
        _, test_y, test_X = zip(*validation)
        # Map gold labels into the big-document class index.
        test_y = [tuple(class_index[l] for l in ls) for ls in test_y]
    else:
        class_index = Index()
        _, train_X, train_y = zip(*load_data(training, class_index))
        _, test_X, test_y = zip(*load_data(validation, class_index))
    classifier.fit(train_X, train_y)
    isError, OneError, nDocs = 0, 0, 0
    margins, AP = [], []
    predictions = classifier.predict_proba(test_X)
    for j, prediction in enumerate(predictions):
        nDocs += 1
        # Rank label indices by predicted probability, best first.
        preds = sorted(range(len(prediction)),
                       key=lambda i: prediction[i], reverse=True)
        # Bug fix: 'labelings' was undefined; the gold labels are in test_y
        # (matching the sibling experiment that uses refs = test_y[j]).
        refs = set(test_y[j])
        ap = average_precision(preds, refs)
        AP.append(ap)
        isError += is_error(ap)
        OneError += one_error(preds, refs)
        margins.append(margin(preds, refs))
    return isError, OneError, nDocs, margins, AP