EST_POSITIVE = 0.7
MAX_FEATURES = 3000


def conservative_min(xs):
    """Return the minimum of ``xs`` after discarding IQR outliers.

    Values outside [q25 - 1.5*IQR, q75 + 1.5*IQR] are dropped first, so a
    single extremely low confidence score cannot drag the acceptance
    threshold down.

    Args:
        xs: 1-D numpy array of scores.

    Returns:
        Scalar minimum of the inlier values.
    """
    # remove outliers
    q25, q75 = np.percentile(xs, [25, 75])
    iqr = q75 - q25
    lb = q25 - (1.5 * iqr)
    ub = q75 + (1.5 * iqr)
    xs_con = xs[(xs >= lb) & (xs <= ub)]
    return np.min(xs_con)


# Self-training loop: iteratively grow the positive set with examples the
# SVM scores at least as high as the (outlier-robust) lowest known positive.
for borg in ["generic", "brand"]:
    X, y, vec = dnu.vectorize("unlabeled.txt", "%s_positive.txt" % (borg),
                              MAX_FEATURES)
    # BUG FIX: `y[y == 1]` returns the matching *values* (an array of ones),
    # not the positions of the positive examples, so `confidence[y_pos]`
    # below would just index confidence[1] repeatedly.  Use the indices,
    # matching how y_pos is rebuilt inside the loop.
    y_pos = np.where(y == 1)[0]
    num_positives = [y_pos.shape[0]]
    clf = LinearSVC()
    clf.fit(X, y)
    num_iters = 0
    # NOTE(review): MAX_ITERS is not defined in this chunk -- presumably set
    # earlier in the file (a sibling copy of this script uses MAX_ITERS = 10).
    while num_iters < MAX_ITERS:
        print("Iteration #%d, #-positive examples: %d" %
              (num_iters, num_positives[-1]))
        confidence = clf.decision_function(X)
        min_pos_confidence = conservative_min(confidence[y_pos])
        y_pos = np.where(confidence >= min_pos_confidence)[0]
        # if y_pos.shape[0] <= num_positives[-1]:
        #     break
        # NOTE(review): chunk is truncated here -- the num_iters increment
        # (and presumably a refit of clf) must follow beyond this view.
from sklearn.externals import joblib from sklearn.feature_extraction.text import CountVectorizer from sklearn.svm import LinearSVC import drug_ner_utils as dnu import numpy as np import os def vectorize_ngrams(ngrams, vocab): vec = np.zeros((1, len(vocab))) for ngram in ngrams: if vocab.has_key(ngram): vec[0, vocab[ngram]] = 1 return vec X, y, generic_vec = dnu.vectorize("unlabeled.txt", "generic_positive.txt", 100) y = joblib.load(os.path.join(dnu.DATA_DIR, "y_generic_4.pkl")) generic_clf = LinearSVC() generic_clf.fit(X, y) print("Score for generic classifier: %.3f" % (generic_clf.score(X, y))) X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100) y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl")) brand_clf = LinearSVC() brand_clf.fit(X, y) print("Score for brand classifier: %.3f" % (brand_clf.score(X, y))) fraw = open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') i = 0 for line in fraw:
from sklearn.feature_extraction.text import CountVectorizer from sklearn.svm import LinearSVC import drug_ner_utils as dnu import numpy as np import os def vectorize_ngrams(ngrams, vocab): vec = np.zeros((1, len(vocab))) for ngram in ngrams: if vocab.has_key(ngram): vec[0, vocab[ngram]] = 1 return vec X, y, generic_vec = dnu.vectorize("unlabeled.txt", "generic_positive.txt", 100) y = joblib.load(os.path.join(dnu.DATA_DIR, "y_generic_4.pkl")) generic_clf = LinearSVC() generic_clf.fit(X, y) print("Score for generic classifier: %.3f" % (generic_clf.score(X, y))) X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100) y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl")) brand_clf = LinearSVC() brand_clf.fit(X, y) print("Score for brand classifier: %.3f" % (brand_clf.score(X, y))) fraw = open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') i = 0 for line in fraw:
MAX_ITERS = 10
EST_POSITIVE = 0.7
MAX_FEATURES = 3000


def conservative_min(xs):
    """Return the minimum of ``xs`` after discarding IQR outliers.

    Values outside [q25 - 1.5*IQR, q75 + 1.5*IQR] are dropped first, so a
    single extremely low confidence score cannot drag the acceptance
    threshold down.

    Args:
        xs: 1-D numpy array of scores.

    Returns:
        Scalar minimum of the inlier values.
    """
    # remove outliers
    q25, q75 = np.percentile(xs, [25, 75])
    iqr = q75 - q25
    lb = q25 - (1.5 * iqr)
    ub = q75 + (1.5 * iqr)
    xs_con = xs[(xs >= lb) & (xs <= ub)]
    return np.min(xs_con)


# Self-training loop: iteratively grow the positive set with examples the
# SVM scores at least as high as the (outlier-robust) lowest known positive.
for borg in ["generic", "brand"]:
    X, y, vec = dnu.vectorize("unlabeled.txt", "%s_positive.txt" % (borg),
                              MAX_FEATURES)
    # BUG FIX: `y[y == 1]` returns the matching *values* (an array of ones),
    # not the positions of the positive examples, so `confidence[y_pos]`
    # below would just index confidence[1] repeatedly.  Use the indices,
    # matching how y_pos is rebuilt inside the loop.
    y_pos = np.where(y == 1)[0]
    num_positives = [y_pos.shape[0]]
    clf = LinearSVC()
    clf.fit(X, y)
    num_iters = 0
    while (num_iters < MAX_ITERS):
        print("Iteration #%d, #-positive examples: %d" %
              (num_iters, num_positives[-1]))
        confidence = clf.decision_function(X)
        min_pos_confidence = conservative_min(confidence[y_pos])
        y_pos = np.where(confidence >= min_pos_confidence)[0]
        # if y_pos.shape[0] <= num_positives[-1]:
        # NOTE(review): chunk is truncated here -- the num_iters increment
        # (and presumably a refit of clf) must follow beyond this view.