Example #1
from sklearn.svm import LinearSVC
import drug_ner_utils as dnu
import numpy as np

MAX_ITERS = 10
EST_POSITIVE = 0.7
MAX_FEATURES = 3000


def conservative_min(xs):
    # Return the smallest value after discarding IQR outliers, so a single
    # anomalously low confidence score cannot drag the threshold down.
    q25, q75 = np.percentile(xs, [25, 75])
    iqr = q75 - q25
    lb = q25 - (1.5 * iqr)
    ub = q75 + (1.5 * iqr)
    xs_con = xs[(xs >= lb) & (xs <= ub)]
    return np.min(xs_con)
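
# A quick check of the IQR rule (the numbers below are made up for
# illustration): the single extreme score falls outside q25 - 1.5*IQR
# and is ignored, so the returned minimum comes from the bulk of the data.
scores = np.array([-9.0, 0.1, 0.2, 0.3, 0.4, 0.5])
print(conservative_min(scores))  # prints 0.1, not the outlier -9.0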


for borg in ["generic", "brand"]:
    X, y, vec = dnu.vectorize("unlabeled.txt", "%s_positive.txt" % (borg), MAX_FEATURES)

    y_pos = np.where(y == 1)[0]  # indices of the seed positives
    num_positives = [y_pos.shape[0]]

    clf = LinearSVC()
    clf.fit(X, y)

    num_iters = 0
    while num_iters < MAX_ITERS:
        print("Iteration #%d, #-positive examples: %d" % (num_iters, num_positives[-1]))
        confidence = clf.decision_function(X)
        min_pos_confidence = conservative_min(confidence[y_pos])
        y_pos = np.where(confidence >= min_pos_confidence)[0]
        if y_pos.shape[0] <= num_positives[-1]:
            break  # the positive set stopped growing; converged
        num_positives.append(y_pos.shape[0])
        # relabel the expanded positive set and retrain on it
        y = np.zeros(y.shape[0], dtype=int)
        y[y_pos] = 1
        clf.fit(X, y)
        num_iters += 1
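
    # Example #2 below loads pickled label files such as y_generic_4.pkl, so
    # each bootstrapping run presumably persists its final labels. A minimal
    # sketch of that step, to run at the end of the loop body above (joblib
    # and os imports assumed; the file-name pattern is an assumption inferred
    # from Example #2):
    y_final = np.zeros(y.shape[0], dtype=int)  # 1 for every bootstrapped positive
    y_final[y_pos] = 1
    joblib.dump(y_final, os.path.join(dnu.DATA_DIR, "y_%s_%d.pkl" % (borg, num_iters)))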
Example #2
import joblib  # standalone package; sklearn.externals.joblib was removed in modern scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import drug_ner_utils as dnu
import numpy as np
import os

def vectorize_ngrams(ngrams, vocab):
    # Build a binary (one-hot) row vector over the vocabulary.
    vec = np.zeros((1, len(vocab)))
    for ngram in ngrams:
        if ngram in vocab:  # dict.has_key() is Python 2 only
            vec[0, vocab[ngram]] = 1
    return vec
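
# For a quick sense of what this produces, a toy run with a made-up
# three-entry vocabulary (not the real vectorizer's vocabulary); unknown
# ngrams are simply skipped:
toy_vocab = {"aspi": 0, "spir": 1, "irin": 2}
print(vectorize_ngrams(["aspi", "irin", "zzzz"], toy_vocab))  # [[1. 0. 1.]]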
    

X, y, generic_vec = dnu.vectorize("unlabeled.txt", "generic_positive.txt", 100)
y = joblib.load(os.path.join(dnu.DATA_DIR, "y_generic_4.pkl"))  # labels saved from the bootstrapping run
generic_clf = LinearSVC()
generic_clf.fit(X, y)
print("Score for generic classifier: %.3f" % (generic_clf.score(X, y)))

X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100)

y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl"))
brand_clf = LinearSVC()
brand_clf.fit(X, y)
print("Score for brand classifier: %.3f" % (brand_clf.score(X, y)))

fraw = open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), "r")  # text mode; "rb" would yield bytes in Python 3
i = 0
for line in fraw:
    pass  # loop body truncated in the original snippet
fraw.close()