예제 #1
0
파일: train.py 프로젝트: anukat2015/taxi
def run(output_dir, feature_num, mode):

    feature_num = int(feature_num)
    taxo_res = TaxonomyResources(freq_fpaths, isa_fpaths)
    taxo_features = TaxonomyFeatures(taxo_res, relations_fpath=train_relations_fpath)

    ensure_dir(output_dir)
    features = ["hyper_in_hypo_i","hypo2hyper_substract", "freq_substract", "in_weight_substract", "length_substract",
            "hypo2hyper_s_substract","hypo2hyper_max2_substract"]
    features = features[:feature_num]
    
    if mode == "gridsearch":
        #  grid search is only supported for SVC
        method = "SVC"
        hc = SuperTaxi(join(output_dir, "SVC-grid-search"), method="SVC", features=features, overwrite=True)
        clf = hc.grid_search_svc(taxo_features.relations, test=TEST_GRIDSEARCH)
        return 
    
    for method in METHODS:
        try:
            classifier_dir = join(output_dir, method) 
            print "\n", method.upper(), "\n", "="*50
            hc = SuperTaxi(classifier_dir, method=method, features=features, overwrite=True)
            if mode == "train":
                clf = hc.train(taxo_features.relations)
                hc._print_clf_info()
            elif mode == "cv":
                hc.crossval(taxo_features.relations) 
            else:
                print "Error: unrecognised mode %s" % mode
        except:
            print format_exc()
예제 #2
0
def breakdown(lexsample_fpath, output_dir): 
    ensure_dir(output_dir)
    df = read_csv(lexsample_fpath, encoding='utf-8', delimiter="\t", error_bad_lines=False,
                  header=None, names=names, low_memory=False)

    n = 0
    q_levels = [0.1*x for x in range(9, 0, -1)]
    q_values = [df["conf"].quantile(l) for l in q_levels]
    q = zip(q_levels, q_values)

    q_cur = q.pop()

    df = df.sort(["conf"], ascending=1)
    if REVERSE: df.conf = -df.conf
    name = splitext(ntpath.basename(lexsample_fpath))[0]

    for i, row in df.iterrows():
        if row["conf"] >= q_cur[1]:
            output_fpath = join(output_dir, name + "-" +"conf" + unicode(int(100*(1.0-q_cur[0]))) + ".csv")
            print "Saving: %.3f %.3f %d %s" % (q_cur[0], q_cur[1], n, output_fpath)
            df.to_csv(output_fpath, sep="\t", encoding="utf-8", float_format='%.3f', index=False, header=False)
            
            if len(q) > 0: q_cur = q.pop()
            else: break
      
        if row["conf"] < q_cur[1]:
           df.loc[i,"predict_sense_ids"] = -1
        
        if n % 5000 == 0: print n
        n += 1
예제 #3
0
파일: train.py 프로젝트: mjj203/taxi-1
def run(output_dir, feature_num, mode):
    """Train, cross-validate or grid-search the taxonomy relation classifiers.

    Args:
        output_dir: directory that receives one sub-directory per classifier
            (or ``SVC-grid-search`` in grid-search mode).
        feature_num: how many leading entries of the built-in feature list to
            use; anything accepted by ``int()``.
        mode: ``"train"``, ``"cv"`` or ``"gridsearch"``.  Any other value is
            reported once and nothing is loaded or written.
    """
    feature_num = int(feature_num)

    # Reject a bad mode up front.  Checking it only inside the loop meant a
    # SuperTaxi was first built with overwrite=True for every method (which
    # presumably touches the model directories) before the error was printed.
    if mode not in ("train", "cv", "gridsearch"):
        print("Error: unrecognised mode %s" % mode)
        return

    taxo_res = TaxonomyResources(freq_fpaths, isa_fpaths)
    taxo_features = TaxonomyFeatures(taxo_res, relations_fpath=train_relations_fpath)

    ensure_dir(output_dir)
    features = ["hyper_in_hypo_i","hypo2hyper_substract", "freq_substract", "in_weight_substract", "length_substract",
            "hypo2hyper_s_substract","hypo2hyper_max2_substract"]
    features = features[:feature_num]

    if mode == "gridsearch":
        #  grid search is only supported for SVC
        hc = SuperTaxi(join(output_dir, "SVC-grid-search"), method="SVC", features=features, overwrite=True)
        hc.grid_search_svc(taxo_features.relations, test=TEST_GRIDSEARCH)
        return

    for method in METHODS:
        # Best effort per classifier: a failure is printed and the remaining
        # methods still run.
        try:
            classifier_dir = join(output_dir, method)
            print("\n", method.upper(), "\n", "="*50)
            hc = SuperTaxi(classifier_dir, method=method, features=features, overwrite=True)
            if mode == "train":
                hc.train(taxo_features.relations)
                hc._print_clf_info()
            else:
                # Only "cv" can reach this branch after the validation above.
                hc.crossval(taxo_features.relations)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt / SystemExit must stop
            # the whole run instead of merely skipping to the next method.
            print(format_exc())
예제 #4
0
파일: supervised.py 프로젝트: mjj203/taxi-1
    def __init__(self,
                 model_dir,
                 method="LogisticRegressionL2",
                 features=FEATURES,
                 k=100,
                 overwrite=False):
        """Load the classifier stored in *model_dir* or start a fresh model.

        Args:
            model_dir: directory holding the classifier and its metadata.
            method: classifier name recorded in the metadata.
            features: feature names recorded in the metadata.
            k: value recorded under the ``"k"`` metadata key.
            overwrite: when true, an existing classifier in *model_dir* is
                ignored and fresh metadata is saved instead.

        When a classifier file already exists and *overwrite* is false, the
        stored metadata replaces the values passed as arguments.
        """
        self.CLASSIFIER_FILE = "classifier"
        self.KBEST_VOC_FILE = "kbest-voc.csv"
        self.KBEST_FILE = "kbest.pkl"
        self.META_FILE = "meta.json"
        clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
        self._model_dir = model_dir
        self._meta_fpath = join(model_dir, self.META_FILE)

        self._meta = {"method": method, "k": k, "features": features}

        if exists(model_dir) and exists(clf_fpath) and not overwrite:
            # load the model
            # joblib.load unpickles the file: only use trusted model dirs.
            self._clf = joblib.load(clf_fpath)
            # Context manager so the metadata file handle is closed promptly
            # instead of being left to the garbage collector.
            with open(self._meta_fpath, "r") as meta_file:
                self._meta = json.load(meta_file)
            print("Metadata were loaded from:", self._meta_fpath)
        else:
            # model doesn't exist, or must be overwritten: create a new one
            ensure_dir(model_dir)
            self.save_meta()
예제 #5
0
    def __init__(self, model_dir, method="LogisticRegressionL2", features=FEATURES, k=100, overwrite=False):

        self.CLASSIFIER_FILE = "classifier"
        self.KBEST_VOC_FILE = "kbest-voc.csv"
        self.KBEST_FILE = "kbest.pkl"
        self.META_FILE = "meta.json"
        clf_fpath = join(model_dir, self.CLASSIFIER_FILE)
        kbest_fpath = join(model_dir, self.KBEST_FILE)
        self._model_dir = model_dir
        self._meta_fpath = join(model_dir, self.META_FILE)
        
        self._meta = {}
        self._meta["method"] = method
        self._meta["k"] = k
        self._meta["features"] = features

        if exists(model_dir) and exists(clf_fpath) and not overwrite:
            # load the model
            self._clf = joblib.load(clf_fpath)
            self._meta = json.load(open(self._meta_fpath, "r"))
            print "Metadata were loaded from:", self._meta_fpath
        else:
            # model doesn't exist, or must be overwritten create a new one
            ensure_dir(model_dir)
            self.save_meta()
예제 #6
0
 def __init__(
     self,
     babelnet_keys,
     babelnet_fpath="",
     freq_fpath="",
     normalized=True,
     divide_by_freq=False,
     force_api=False,
 ):
     """Store the settings, then load the frequency and BabelNet data.

     Args:
         babelnet_keys: stored unchanged on the instance.
         babelnet_fpath: passed to ``ensure_dir`` and then to ``_load``.
         freq_fpath: path handed to ``FreqDictionary``.
         normalized: stored unchanged on the instance.
         divide_by_freq: forwarded to ``_load``.
         force_api: stored unchanged on the instance.
     """
     # Plain settings first.
     self._babelnet_keys = babelnet_keys
     self._normalized = normalized
     self._force_api = force_api

     # The BabelNet location must exist before anything is loaded from it.
     self._babelnet_dir = babelnet_fpath
     ensure_dir(babelnet_fpath)

     # Resources; the frequency dictionary is built before _load runs.
     self._freq = FreqDictionary(freq_fpath)
     # format: (word->sense_id->{"bow" , "wnOffset"}
     self._babelnet = self._load(babelnet_fpath, divide_by_freq=divide_by_freq)
예제 #7
0
 def __init__(self, babelnet_keys, babelnet_fpath="", freq_fpath="",
              normalized=True, divide_by_freq=False, force_api=False):
     """Remember the configuration and load the BabelNet resources.

     ``babelnet_fpath`` is created through ``ensure_dir`` if needed and is
     also the path given to ``_load``; ``freq_fpath`` is given to
     ``FreqDictionary``; ``divide_by_freq`` is forwarded to ``_load``.  The
     remaining arguments are kept on the instance as they are.
     """
     # Flags and keys are only stored here.
     self._force_api = force_api
     self._normalized = normalized
     self._babelnet_keys = babelnet_keys

     # Make sure the BabelNet location exists before loading from it.
     babelnet_dir = babelnet_fpath
     self._babelnet_dir = babelnet_dir
     ensure_dir(babelnet_dir)

     self._freq = FreqDictionary(freq_fpath)

     # format: (word->sense_id->{"bow" , "wnOffset"}
     loaded = self._load(babelnet_fpath, divide_by_freq=divide_by_freq)
     self._babelnet = loaded
예제 #8
0
def breakdown(lexsample_fpath, output_dir):
    ensure_dir(output_dir)
    df = read_csv(lexsample_fpath,
                  encoding='utf-8',
                  delimiter="\t",
                  error_bad_lines=False,
                  header=None,
                  names=names,
                  low_memory=False)

    n = 0
    q_levels = [0.1 * x for x in range(9, 0, -1)]
    q_values = [df["conf"].quantile(l) for l in q_levels]
    q = zip(q_levels, q_values)

    q_cur = q.pop()

    df = df.sort(["conf"], ascending=1)
    if REVERSE: df.conf = -df.conf
    name = splitext(ntpath.basename(lexsample_fpath))[0]

    for i, row in df.iterrows():
        if row["conf"] >= q_cur[1]:
            output_fpath = join(
                output_dir, name + "-" + "conf" +
                unicode(int(100 * (1.0 - q_cur[0]))) + ".csv")
            print "Saving: %.3f %.3f %d %s" % (q_cur[0], q_cur[1], n,
                                               output_fpath)
            df.to_csv(output_fpath,
                      sep="\t",
                      encoding="utf-8",
                      float_format='%.3f',
                      index=False,
                      header=False)

            if len(q) > 0: q_cur = q.pop()
            else: break

        if row["conf"] < q_cur[1]:
            df.loc[i, "predict_sense_ids"] = -1

        if n % 5000 == 0: print n
        n += 1