def build_ngram_text(infile, outfile):
    """Rewrite a file of words as a file of space-separated n-grams.

    Every whitespace-separated word on every line of ``infile`` is passed to
    ``dnu.str_to_ngrams`` with size ``dnu.GRAM_SIZE``. Each word that yields
    at least one n-gram produces one output line holding its n-grams joined
    by single spaces; words that yield no n-grams are skipped.

    Args:
        infile: Input file name, resolved relative to ``dnu.DATA_DIR``.
        outfile: Output file name, resolved relative to ``dnu.DATA_DIR``.
            The file is truncated if it already exists.
    """
    in_path = os.path.join(dnu.DATA_DIR, infile)
    out_path = os.path.join(dnu.DATA_DIR, outfile)
    # Context managers close both handles even if n-gram extraction or the
    # write raises part-way through.
    # NOTE(review): binary mode combined with str formatting only works on
    # Python 2 -- switch to text mode if this is ever ported.
    with open(in_path, 'rb') as fin:
        with open(out_path, 'wb') as fout:
            for line in fin:
                for word in line.strip().split():
                    ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
                    if len(ngrams) > 0:
                        fout.write("%s\n" % " ".join(ngrams))
from sklearn.externals import joblib
import drug_ner_utils as dnu
import os


def _jaccard(ngrams, vocab):
    """Return the Jaccard similarity of two sets as a float.

    The result is ``len(ngrams & vocab) / len(ngrams | vocab)`` computed with
    true division, or 0.0 when both sets are empty.
    """
    union_size = len(ngrams.union(vocab))
    if union_size == 0:
        # Two empty sets: avoid ZeroDivisionError and report "no overlap".
        return 0.0
    # float() forces true division; plain int / int floors on Python 2,
    # which made every score 0.0 unless the two sets were identical.
    return float(len(ngrams.intersection(vocab))) / union_size


# Reference n-gram sets for generic and brand drug names, built from the
# pickled frequency distributions after truncation by dnu.truncate_fd.
generic_fd = set(dnu.truncate_fd(
    joblib.load(os.path.join(dnu.DATA_DIR, "generic_fd.pkl")), 100))
brand_fd = set(dnu.truncate_fd(
    joblib.load(os.path.join(dnu.DATA_DIR, "brand_fd.pkl")), 50))

# Tag every word of every line in raw_data.txt. The context manager closes
# the input file even if processing raises.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = set(dnu.str_to_ngrams(word, dnu.GRAM_SIZE))
            jc_generic = _jaccard(ngrams, generic_fd)
            jc_brand = _jaccard(ngrams, brand_fd)
            # Per-word trace of both similarity scores.
            print("%s %s %s" % (word, jc_generic, jc_brand))
            is_generic = jc_generic > 0.01
            is_brand = jc_brand > 0.01
            # The generic tag takes precedence when both thresholds are met.
            if is_generic:
                annotated.append("<GENERIC>%s</GENERIC>" % (word))
            elif is_brand:
                annotated.append("<BRAND>%s</BRAND>" % (word))
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))
# generic_clf, generic_vec, X and y are defined earlier in the script.
print("Score for generic classifier: %.3f" % (generic_clf.score(X, y)))

# Train the brand-name classifier. The labels returned by dnu.vectorize are
# immediately replaced by the ones stored in y_brand_3.pkl.
X, y, brand_vec = dnu.vectorize("unlabeled.txt", "brand_positive.txt", 100)
y = joblib.load(os.path.join(dnu.DATA_DIR, "y_brand_3.pkl"))
brand_clf = LinearSVC()
brand_clf.fit(X, y)
# Scored on the same X, y that it was fitted on.
print("Score for brand classifier: %.3f" % (brand_clf.score(X, y)))

# Annotate the leading lines of raw_data.txt with both classifiers. The
# context manager closes the file even when the loop exits early via break.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    i = 0
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = dnu.str_to_ngrams(word, dnu.GRAM_SIZE)
            # Build the space-joined n-gram document once; both vectorizers
            # receive the same single-document input.
            ngram_doc = " ".join(ngrams)
            Xgen = generic_vec.transform([ngram_doc])
            Xbrand = brand_vec.transform([ngram_doc])
            is_generic = generic_clf.predict(Xgen)
            is_brand = brand_clf.predict(Xbrand)
            # The generic tag takes precedence when both classifiers fire.
            if is_generic == 1:
                annotated.append("<GENERIC>" + word + "</GENERIC>")
            elif is_brand == 1:
                annotated.append("<BRAND>" + word + "</BRAND>")
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))
        i += 1
        # Stops after the 11th line has been printed (i reaches 11).
        if i > 10:
            break
import os


def _jaccard(ngrams, vocab):
    """Return the Jaccard similarity of two sets as a float.

    The result is ``len(ngrams & vocab) / len(ngrams | vocab)`` computed with
    true division, or 0.0 when both sets are empty.
    """
    union_size = len(ngrams.union(vocab))
    if union_size == 0:
        # Two empty sets: avoid ZeroDivisionError and report "no overlap".
        return 0.0
    # float() forces true division; plain int / int floors on Python 2,
    # which made every score 0.0 unless the two sets were identical.
    return float(len(ngrams.intersection(vocab))) / union_size


# Reference n-gram sets for generic and brand drug names, built from the
# pickled frequency distributions after truncation by dnu.truncate_fd.
generic_fd = set(
    dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, "generic_fd.pkl")),
                    100))
brand_fd = set(
    dnu.truncate_fd(joblib.load(os.path.join(dnu.DATA_DIR, "brand_fd.pkl")),
                    50))

# Tag every word of every line in raw_data.txt. The context manager closes
# the input file even if processing raises.
with open(os.path.join(dnu.DATA_DIR, "raw_data.txt"), 'rb') as fraw:
    for line in fraw:
        line = line.strip().lower()
        annotated = []
        for word in line.split():
            ngrams = set(dnu.str_to_ngrams(word, dnu.GRAM_SIZE))
            jc_generic = _jaccard(ngrams, generic_fd)
            jc_brand = _jaccard(ngrams, brand_fd)
            # Per-word trace of both similarity scores.
            print("%s %s %s" % (word, jc_generic, jc_brand))
            is_generic = jc_generic > 0.01
            is_brand = jc_brand > 0.01
            # The generic tag takes precedence when both thresholds are met.
            if is_generic:
                annotated.append("<GENERIC>%s</GENERIC>" % (word))
            elif is_brand:
                annotated.append("<BRAND>%s</BRAND>" % (word))
            else:
                annotated.append(word)
        print("Input: %s" % (line))
        print("Output: %s" % (" ".join(annotated)))