Example #1
def calculate_statistics(labeled_profiles, semantic_model, calc_distances):
    print "Calculating statistics..."

    distance_sum, counter = 0.0, 0
    last_record_offset = -1

    #### online standard deviation
    mean, M2 = 0.0, 0.0
    ####

    unlabeled_document_iterator = DocumentIterator(
        document_batch_size=DOCUMENT_BATCH_SIZE,
        db_window_size=50000,
        doc_filter=UNLABELED_DOCUMENTS_CONDITION)

    all_distances = []

    for unlabeled_document_batch in unlabeled_document_iterator.batchIter():
        distances = calc_distances(labeled_profiles, semantic_model,
                                   unlabeled_document_batch)

        #### online standard deviation

        all_distances.extend(distances.flatten().tolist())

        for dist in distances.flatten():
            counter += 1
            delta = dist - mean
            mean += delta / counter
            delta2 = dist - mean
            M2 += delta * delta2
        ####

        distance_sum += numpy.sum(distances)

        if unlabeled_document_iterator.current_record_offset < last_record_offset:
            break

        last_record_offset = unlabeled_document_iterator.current_record_offset

        mean = distance_sum / counter
        standard_deviation = numpy.sqrt(M2 / (counter - 1))

        print mean, standard_deviation

    return mean, standard_deviation
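The loop above maintains a running mean and M2 in the style of Welford's online algorithm, so the standard deviation of the distances can be obtained in a single pass over the batches. Below is a minimal, self-contained sketch of the same update on synthetic data, checked against numpy; DocumentIterator and calc_distances are not needed for it, and all data is illustrative.

import numpy

def online_mean_std(batches):
    # Welford-style single-pass mean / M2 accumulation, as in the loop above
    mean, M2, counter = 0.0, 0.0, 0
    for batch in batches:
        for dist in batch:
            counter += 1
            delta = dist - mean
            mean += delta / counter
            delta2 = dist - mean
            M2 += delta * delta2
    # sample standard deviation, comparable to numpy.std(..., ddof=1)
    return mean, numpy.sqrt(M2 / (counter - 1))

if __name__ == "__main__":
    rng = numpy.random.RandomState(0)
    batches = [rng.rand(1000) for _ in range(5)]   # synthetic "distance" batches
    mean, std = online_mean_std(batches)
    flat = numpy.concatenate(batches)
    print("online:  %f %f" % (mean, std))
    print("exact:   %f %f" % (flat.mean(), flat.std(ddof=1)))  # should agree closely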
Example #2
class Documents(object):
	def __init__(self, where):
		self.document_iterator = DocumentIterator()
		self.where =  where

	def __iter__(self):
		for document_batch in self.document_iterator.getAllInBatches(cond=self.where):
			for document in document_batch:
				if len(document.tokenized_text) > 0:
					yield document
Example #3
def get_labeled_set():
    labeled_documents = DocumentIterator(where=LABELED_DOCUMENTS_CONDITION).getAll()
	
    labeled_profiles = []
    labels = []
    
    start = time.time()
    #map(lambda d: f1(d, labeled_profiles, labels), labeled_documents)
    for doc in labeled_documents:
        f1(doc, labeled_profiles, labels)
    print time.time() - start

    return labeled_profiles, labels
Example #4
def choose_english_docs_only():
    batch_size = 50000
    document_iterator = DocumentIterator(document_batch_size=None, db_window_size=batch_size, doc_filter="1 = 1")
    not_en_cnt, docs_cnt = 0, 0 
    for doc in document_iterator.getAllInBatches():
        docs_cnt += batch_size
        print docs_cnt
        try:
            if doc.rawtext is not None and len(doc.rawtext) > 0:
                detected_langs = detect_langs(doc.rawtext)
                print doc.id, detected_langs
                """"
                if not ('en' in [l.lang for l in detected_langs] and len(detected_langs) == 1):
                    sql = "update pap_papers_1 set published = 0 where id = %s" % (doc.id, )
                    print sql
                    db.query(sql)
                    db.commit()

                    not_en_cnt += 1
                    print not_en_cnt
                """
        except Exception:
            # skip documents where language detection fails
            pass
Example #5
def get_labeled_set():
    labeled_documents = DocumentIterator(
        doc_filter=LABELED_DOCUMENTS_CONDITION).getAll()

    labeled_profiles = []
    labels = []

    for doc in labeled_documents:
        if len(doc.profile) < 50:
            print doc.id, len(doc.profile)
        labeled_profiles.append(doc.profile)
        labels.append(doc.learned_category)

    return labeled_profiles, labels
Example #6
def getDocumentIterator1(doc_filter):
    doc_iterator = DocumentIterator(db_window_size=10000,
                                    doc_filter=doc_filter)
    return (doc for doc in doc_iterator.getAllInBatches() if doc.rawtext)
Example #7
def propagate_labels_gensim(labeled_profiles, labels, acceptable_distance,
                            num_features, semantic_model, calc_distances,
                            min_df, max_df, scores):
    newly_labeled_documents = []

    unlabeled_document_iterator = DocumentIterator(
        document_batch_size=DOCUMENT_BATCH_SIZE,
        db_window_size=DB_WINDOW_SIZE,
        doc_filter=UNLABELED_DOCUMENTS_CONDITION)
    validation_documents = DocumentIterator(
        doc_filter=VALIDATION_DOCUMENTS_CONDITION).getAll()
    validation_texts, validation_labels = [], []
    for doc in validation_documents:
        validation_texts.append(doc.tokenized_text)
        validation_labels.append(doc.learned_category[0])

#nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='brute', metric='cosine').fit(labeled_profiles)
    clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS,
                               algorithm='brute',
                               metric='cosine').fit(labeled_profiles, labels)

    current_iter = 0
    stop_propagation = semantic_model.num_docs >= FINAL_DOCUMENT_COUNT
    for unlabeled_document_batch in unlabeled_document_iterator.batchIter():
        print "Retrieved document batch"
        if stop_propagation:
            break

        predictions = defaultdict(list)
        for i, unlabeled_document in enumerate(unlabeled_document_batch):
            profile = semantic_model.inferProfile(
                unlabeled_document.tokenized_text)
            if len(profile) == 0:
                #print "no elements", unlabeled_document.id
                continue
            #distances, indices = nbrs.kneighbors([profile])
            predict_result = clf.predict_proba([profile])
            max_idx = numpy.argmax(predict_result)
            proba = predict_result[0][max_idx]
            closest_category = clf.classes_[max_idx]
            #closest_categories, average_distance = find_closest_categories([labels[i] for i in indices[0]], distances[0], N_OUTPUT_LABELS)
            if proba >= 0.7:
                predictions[closest_category].append(unlabeled_document)

        average_num_docs = numpy.mean(
            [len(docs) for docs in predictions.values()])
        print [(cat, len(docs)) for cat, docs in predictions.iteritems()]
        for category, docs in predictions.iteritems():
            # cap the sample size at len(docs); random.sample raises ValueError
            # when asked for more items than the population contains
            sample_size = min(len(docs), int(math.ceil(average_num_docs)))
            newly_labeled_documents.append(
                (category, random.sample(docs, sample_size)))

        print len(newly_labeled_documents)
        if len(newly_labeled_documents) >= round(
                0.40 * semantic_model.num_docs):  #NEW_LABELS_BATCH:
            print "Updating model..."

            for category, docs in newly_labeled_documents:
                for doc in docs:
                    assign_category(doc, category)

            labeled_profiles, labels, semantic_model = getLabeledSetGensim(
                num_features, min_df, max_df)

            #mean, standard_deviation = calculate_statistics(labeled_profiles, semantic_model, calc_distances)
            #acceptable_distance = mean - standard_deviation
            #print mean, standard_deviation, acceptable_distance
            validation_profiles = numpy.asarray(
                [semantic_model.inferProfile(x) for x in validation_texts])
            clf_names, score, train_time, test_time = testClassifiers(
                X_train=labeled_profiles,
                y_train=labels,
                X_test=validation_profiles,
                y_test=validation_labels,
                multilabel=False)

            print score

            for i, clf_name in enumerate(clf_names):
                scores[clf_name][current_iter] = score[i]

            #nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='brute', metric='cosine').fit(labeled_profiles)
            clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS,
                                       algorithm='brute',
                                       metric='cosine').fit(
                                           labeled_profiles, labels)

            newly_labeled_documents = []
            current_iter += 1

            if semantic_model.num_docs >= FINAL_DOCUMENT_COUNT:
                stop_propagation = True
                break
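Example #7 accepts a pseudo-label only when the top class probability from the cosine-metric k-NN classifier reaches 0.7. Below is a minimal sketch of that acceptance rule on synthetic profiles; the feature size, threshold and category names are illustrative only, not taken from the original code.

import numpy
from sklearn.neighbors import KNeighborsClassifier

rng = numpy.random.RandomState(0)
labeled_profiles = rng.rand(40, 16)       # 40 labeled documents, 16-dim profiles (synthetic)
labels = ["bio"] * 20 + ["physics"] * 20
unlabeled_profiles = rng.rand(10, 16)

clf = KNeighborsClassifier(n_neighbors=5, algorithm='brute',
                           metric='cosine').fit(labeled_profiles, labels)

for profile in unlabeled_profiles:
    predict_result = clf.predict_proba([profile])
    max_idx = numpy.argmax(predict_result)
    proba = predict_result[0][max_idx]
    closest_category = clf.classes_[max_idx]
    if proba >= 0.7:                      # accept only confident predictions
        print("%s %.2f" % (closest_category, proba))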
Example #8
def propagate_labels(labeled_profiles, labels, acceptable_distance):
    print "Label propagation..."

    semantic_model = SemanticModel.load(file_name=MODEL_SNAPSHOT_FILENAME,
                                        doc_filter=LABELED_DOCUMENTS_CONDITION)

    #db = MySQLdb.connect(host='localhost', user='******', passwd='1qaz@WSX', db='paperity')

    #semantic_model.tester = lambda epoch: test_accuracy(semantic_model, db, epoch, 'accuracy_result.csv')

    newly_labeled_documents = []

    unlabeled_document_iterator = DocumentIterator(
        document_batch_size=DOCUMENT_BATCH_SIZE,
        db_window_size=DB_WINDOW_SIZE,
        doc_filter=UNLABELED_DOCUMENTS_CONDITION,
        convertText=semantic_model.convertText)
    #nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='brute', metric='cosine').fit(labeled_profiles)
    clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS,
                               algorithm='brute',
                               metric='cosine').fit(labeled_profiles, labels)

    stop_propagation = semantic_model.num_docs >= FINAL_DOCUMENT_COUNT
    for unlabeled_document_batch in unlabeled_document_iterator:
        if stop_propagation:
            break

        for i, unlabeled_document in enumerate(unlabeled_document_batch):
            semantic_model.inferProfiles([unlabeled_document],
                                         num_iters=PROFILE_INFERENCE_NUM_ITERS,
                                         update_word_profiles=False,
                                         initialize_document_profiles=True)
            # query the fitted KNeighborsClassifier above (the separate
            # NearestNeighbors index is commented out)
            distances, indices = clf.kneighbors([unlabeled_document.profile])

            closest_categories, average_distance = find_closest_categories(
                [labels[i] for i in indices[0]], distances[0], N_OUTPUT_LABELS)

            if average_distance <= acceptable_distance:
                assign_category(unlabeled_document, closest_categories,
                                newly_labeled_documents)

                if len(newly_labeled_documents) == round(
                        0.40 * semantic_model.num_docs):  #NEW_LABELS_BATCH:
                    print "Updating model..."
                    semantic_model.document_iterator.saveDocumentProfilesToDb(
                        newly_labeled_documents)
                    semantic_model.update(
                        newly_labeled_documents,
                        num_iters_full_retrain=NUM_ITERS_MODEL_UPDATE,
                        num_iters_partial_retrain=PROFILE_INFERENCE_NUM_ITERS)

                    labeled_profiles, labels = get_labeled_set()

                    #mean, standard_deviation = calculate_statistics(labeled_profiles)
                    #acceptable_distance = mean - standard_deviation
                    #print mean, standard_deviation, acceptable_distance

                    #nbrs = NearestNeighbors(n_neighbors=N_NEIGHBORS, algorithm='brute', metric='cosine').fit(labeled_profiles)
                    clf = KNeighborsClassifier(n_neighbors=N_NEIGHBORS,
                                               algorithm='brute',
                                               metric='cosine').fit(
                                                   labeled_profiles, labels)
                    newly_labeled_documents = []

            if semantic_model.num_docs >= FINAL_DOCUMENT_COUNT:
                stop_propagation = True
                break
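Example #8 uses the distance-based variant instead: query the k nearest labeled profiles and accept the majority category only if the average neighbor distance stays below acceptable_distance. A minimal sketch with synthetic data follows; a Counter stands in for the find_closest_categories helper, which is not shown in these examples.

import numpy
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier

rng = numpy.random.RandomState(1)
labeled_profiles = rng.rand(40, 16)       # synthetic labeled profiles
labels = ["bio"] * 20 + ["physics"] * 20
acceptable_distance = 0.25                # illustrative threshold

clf = KNeighborsClassifier(n_neighbors=5, algorithm='brute',
                           metric='cosine').fit(labeled_profiles, labels)

profile = rng.rand(16)                    # one synthetic unlabeled profile
distances, indices = clf.kneighbors([profile])
closest_category = Counter(labels[i] for i in indices[0]).most_common(1)[0][0]
average_distance = numpy.mean(distances[0])

if average_distance <= acceptable_distance:
    print("%s %.3f" % (closest_category, average_distance))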
Example #9
from semantic_model import DocumentIterator
from collections import Counter

if __name__ == "__main__":
	word_count = Counter()

	for document_batch in DocumentIterator().getAllInBatches(cond="published = 1 AND journal_id = 8356"):
		for document in document_batch:
			print document.journal.title
			for word in document.tokenized_text:
				word_count[word] += 1

	for word, count in word_count.most_common(100):
		print word, count
Example #10
	def __init__(self, where):
		self.document_iterator = DocumentIterator()
		self.where =  where