# Assumed context for this excerpt: freq_fpaths, isa_fpaths,
# train_relations_fpath, METHODS and TEST_GRIDSEARCH are module-level
# constants, and ensure_dir, TaxonomyResources, TaxonomyFeatures and
# SuperTaxi are project helpers imported elsewhere in this module.
from os.path import join
from traceback import format_exc


def run(output_dir, feature_num, mode):
    feature_num = int(feature_num)
    taxo_res = TaxonomyResources(freq_fpaths, isa_fpaths)
    taxo_features = TaxonomyFeatures(taxo_res, relations_fpath=train_relations_fpath)
    ensure_dir(output_dir)

    features = ["hyper_in_hypo_i", "hypo2hyper_substract", "freq_substract",
                "in_weight_substract", "length_substract",
                "hypo2hyper_s_substract", "hypo2hyper_max2_substract"]
    features = features[:feature_num]

    if mode == "gridsearch":
        # grid search is only supported for SVC
        hc = SuperTaxi(join(output_dir, "SVC-grid-search"), method="SVC",
                       features=features, overwrite=True)
        clf = hc.grid_search_svc(taxo_features.relations, test=TEST_GRIDSEARCH)
        return

    for method in METHODS:
        try:
            classifier_dir = join(output_dir, method)
            print("\n", method.upper(), "\n", "=" * 50)
            hc = SuperTaxi(classifier_dir, method=method, features=features,
                           overwrite=True)

            if mode == "train":
                clf = hc.train(taxo_features.relations)
                hc._print_clf_info()
            elif mode == "cv":
                hc.crossval(taxo_features.relations)
            else:
                print("Error: unrecognised mode %s" % mode)
        except Exception:
            # keep going with the remaining methods if one classifier fails
            print(format_exc())
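
# Usage sketch for run() (the output directory name and feature count are
# illustrative; the available methods come from the module-level METHODS
# constant):
#
#   run("models", feature_num=7, mode="train")       # fit one classifier per method
#   run("models", feature_num=7, mode="cv")          # cross-validated evaluation
#   run("models", feature_num=7, mode="gridsearch")  # SVC hyper-parameter search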
# Assumed context for this excerpt: `names` (the lex-sample column names) and
# the REVERSE flag are module-level settings, and ensure_dir is a project
# helper imported elsewhere.
import ntpath
from os.path import join, splitext

from pandas import read_csv


def breakdown(lexsample_fpath, output_dir):
    ensure_dir(output_dir)
    df = read_csv(lexsample_fpath, encoding="utf-8", delimiter="\t",
                  error_bad_lines=False, header=None, names=names,
                  low_memory=False)

    # nine confidence thresholds: the 0.9 ... 0.1 quantiles of the conf column
    q_levels = [0.1 * x for x in range(9, 0, -1)]
    q_values = [df["conf"].quantile(level) for level in q_levels]
    q = list(zip(q_levels, q_values))  # list() so that .pop() works in Python 3
    q_cur = q.pop()

    df = df.sort_values("conf", ascending=True)  # DataFrame.sort() is deprecated
    if REVERSE:
        df.conf = -df.conf
    name = splitext(ntpath.basename(lexsample_fpath))[0]

    n = 0
    for i, row in df.iterrows():
        if row["conf"] >= q_cur[1]:
            output_fpath = join(
                output_dir,
                name + "-conf" + str(int(100 * (1.0 - q_cur[0]))) + ".csv")
            print("Saving: %.3f %.3f %d %s" % (q_cur[0], q_cur[1], n, output_fpath))
            df.to_csv(output_fpath, sep="\t", encoding="utf-8",
                      float_format="%.3f", index=False, header=False)
            if len(q) > 0:
                q_cur = q.pop()
            else:
                break

        if row["conf"] < q_cur[1]:
            # rows below the current confidence threshold lose their prediction
            df.loc[i, "predict_sense_ids"] = -1

        if n % 5000 == 0:
            print(n)
        n += 1
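
# Sketch of what breakdown() produces, assuming a tab-separated lex-sample
# file with a "conf" column: for each of the nine confidence deciles it writes
# a copy of the data named <name>-confNN.csv, where confNN keeps roughly the
# top NN% of rows by confidence and marks the rest with
# predict_sense_ids = -1, e.g.
#
#   breakdown("data/lexsample.csv", "out")  # hypothetical paths
#   # -> out/lexsample-conf90.csv, ..., out/lexsample-conf10.csv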
# Assumed context for this excerpt (a method of the SuperTaxi class): join and
# exists come from os.path, joblib and json are imported at module level, and
# FEATURES and ensure_dir are project-level definitions.
def __init__(self, model_dir, method="LogisticRegressionL2", features=FEATURES,
             k=100, overwrite=False):
    self.CLASSIFIER_FILE = "classifier"
    self.KBEST_VOC_FILE = "kbest-voc.csv"
    self.KBEST_FILE = "kbest.pkl"
    self.META_FILE = "meta.json"

    clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
    kbest_fpath = join(model_dir, self.KBEST_FILE)
    self._model_dir = model_dir
    self._meta_fpath = join(model_dir, self.META_FILE)
    self._meta = {"method": method, "k": k, "features": features}

    if exists(model_dir) and exists(clf_fpath) and not overwrite:
        # load the existing model and its metadata
        self._clf = joblib.load(clf_fpath)
        with open(self._meta_fpath, "r") as meta_file:
            self._meta = json.load(meta_file)
        print("Metadata were loaded from:", self._meta_fpath)
    else:
        # the model doesn't exist or must be overwritten: create a new one
        ensure_dir(model_dir)
        self.save_meta()
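
# Construction sketch (the model directory name is illustrative): with
# overwrite=False, an already trained classifier found under model_dir is
# reloaded together with its meta.json; otherwise a fresh model directory is
# initialised and the metadata saved.
#
#   hc = SuperTaxi("models/LogisticRegressionL2")               # load or create
#   hc = SuperTaxi("models/SVC", method="SVC", overwrite=True)  # force re-init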
def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="",
             normalized=True, divide_by_freq=False, force_api=False):
    self._babelnet_keys = babelnet_keys
    self._babelnet_dir = babelnet_fpath
    ensure_dir(self._babelnet_dir)
    self._normalized = normalized
    self._force_api = force_api
    self._freq = FreqDictionary(freq_fpath)
    # format: word -> sense_id -> {"bow", "wnOffset"}
    self._babelnet = self._load(babelnet_fpath, divide_by_freq=divide_by_freq)
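
# Construction sketch, assuming this initialiser belongs to the BabelNet
# wrapper class (keys and paths below are placeholders): the cache directory
# is created if missing, and _load() builds the word -> sense_id ->
# {"bow", "wnOffset"} index from babelnet_fpath; force_api=True presumably
# makes the wrapper query the BabelNet API instead of the local cache.
#
#   bn = BabelNet(babelnet_keys=["<your-api-key>"],
#                 babelnet_fpath="cache/babelnet", freq_fpath="word-freqs.csv")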