def __init__(self, data_path):
    self.main_df = self.create_main_df(data_path)
    # auxiliary dataframes, populated later during parsing
    self.row_df = None
    self.formula_df = None
    self.regex_obj = Regex()
    self.config = helpers.load_yaml("src/config.yml")
    # one entry per classification task found in the data
    self.classification_tasks_dict = self.get_classification_tasks()
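# Usage sketch for the parser above. Assumptions (not from the source): the class is
# named DatasetParser, which matches how the orchestrator further below constructs it;
# the import path and "data/" are illustrative.
from src.dataset_parser import DatasetParser  # hypothetical import path

parser = DatasetParser("data/")
print(parser.main_df.shape)                    # parsed main dataframe
print(list(parser.classification_tasks_dict))  # one entry per classification task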
def __init__(self, num_clusters=10):
    self.num_clusters = num_clusters
    self.config = helpers.load_yaml("src/config.yml")
    # common featurizer objects for all tasks
    self.featurizer_tf = None
    self.featurizer_emb = None
    self.clusterer = KmeansCluster(num_clusters=num_clusters)
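# KmeansCluster is repo-internal; presumably it wraps something like scikit-learn's
# KMeans. A minimal stand-in under that assumption, with random data for illustration:
import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(100, 8)                        # e.g. embedding features
labels = KMeans(n_clusters=10, n_init=10).fit_predict(X)
print(labels[:5])                                 # cluster id per row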
def __init__(self, max_features=None, mode="word-embeddings"):
    # NOTE: max_features is accepted but not used in this constructor
    self.nlp = helpers.get_nlp()
    self.mode = mode
    if self.mode == "tfidf":
        # word-level (unigram + bigram) and character-level (trigram) TF-IDF features
        self.tfidf_words = TfidfVectorizer(
            sublinear_tf=True, min_df=1, smooth_idf=True, norm="l2",
            encoding="utf-8", analyzer="word", ngram_range=(1, 2),
        )
        self.tfidf_chars = TfidfVectorizer(
            sublinear_tf=True, min_df=1, smooth_idf=True, norm="l2",
            encoding="utf-8", analyzer="char", ngram_range=(3, 3),
        )
        self.featurizer = FeatureUnion(
            [("words", self.tfidf_words), ("chars", self.tfidf_chars)]
        )
    elif self.mode == "word-embeddings":
        self.featurizer = SentenceEmbedding()
    else:
        self.featurizer = None
    self.features = None
    self.logger = logging.getLogger(__name__)
    self.config = helpers.load_yaml("src/config.yml")
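# Standalone sketch of the "tfidf" branch above, runnable with scikit-learn alone:
# FeatureUnion concatenates word unigram/bigram features with character trigrams.
# The sample strings are illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

featurizer = FeatureUnion([
    ("words", TfidfVectorizer(sublinear_tf=True, smooth_idf=True, norm="l2",
                              analyzer="word", ngram_range=(1, 2))),
    ("chars", TfidfVectorizer(sublinear_tf=True, smooth_idf=True, norm="l2",
                              analyzer="char", ngram_range=(3, 3))),
])
X = featurizer.fit_transform(["total revenue", "net income", "total assets"])
print(X.shape)  # (3, n_word_ngrams + n_char_trigrams)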
def __init__(self, task, cv=3):
    self.cv = cv
    self.model = None
    self.calibrated_model = None
    # name of the property
    self.task = task
    self.config = helpers.load_yaml("src/config.yml")
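# The cv / calibrated_model fields suggest probability calibration; a minimal sketch
# with scikit-learn's CalibratedClassifierCV (assumption: the repo's base model and
# data are not shown, so both are illustrative here).
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

X = np.random.rand(60, 5)
y = np.array([0, 1] * 30)
calibrated_model = CalibratedClassifierCV(LogisticRegression(), cv=3)  # 3 folds, as cv=3
calibrated_model.fit(X, y)
print(calibrated_model.predict_proba(X[:2]))  # calibrated class probabilities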
def __init__(self, init_name, name, hash_name, label_task, priority,
             ver_cost, der_cost, val_acc=0.0):
    """
    Arguments:
        init_name {str} -- [name that appears in the original csv files]
        name {str} -- [new name that is used after parsing the original csv files]
        hash_name {str} -- [name of the column that contains the hash value of the classification task]
        label_task {str} -- [classification type: either "single-label" or "multi-label"]
        priority {int} -- [a lower number means a more important task]
        ver_cost {int} -- [verification cost]
        der_cost {int} -- [derivation cost]
        val_acc {float} -- [accuracy on the validation set]
    """
    self.init_name = init_name
    self.name = name
    self.hash_name = hash_name
    self.label_task = label_task
    self.priority = priority
    self.ver_cost = ver_cost
    self.der_cost = der_cost
    self.topn = None
    self.val_acc = val_acc
    # True if the task's values are hashed; when the column is its own hash
    # column, no hashing is needed
    self.has_hash = name != hash_name
    self.config = helpers.load_yaml("src/config.yml")
    self.all_values = set()
    self.classifier_name = self.name + "_classifier"
    self.featurizer_tf_name = self.name + "_featurizer_tf"
    self.featurizer_emb_name = self.name + "_featurizer_emb"
    # store components needed for classification
    self.is_trained = False
    self.featurizer_tf = None
    self.featurizer_emb = None
    self.classifier = None
    # dicts to translate a label to a unique hash value and back
    self.label_to_hash_dict = dict()
    self.hash_to_label_dict = dict()
    self.hash_to_label_dict_name = self.name + "_hash_to_label_dict.json"
    self.label_to_hash_dict_name = self.name + "_label_to_hash_dict.json"
    # hash_counter is incremented whenever a new label is added to the task's hash dict
    self.hash_counter = 0
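# Hypothetical helper (not in the source) showing how label_to_hash_dict,
# hash_to_label_dict and hash_counter fit together: each unseen label gets the
# next integer hash, and the two dicts stay inverses of each other.
def add_label(task, label):
    if label not in task.label_to_hash_dict:
        task.label_to_hash_dict[label] = task.hash_counter
        task.hash_to_label_dict[task.hash_counter] = label
        task.hash_counter += 1
    return task.label_to_hash_dict[label]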
def __init__(self, data_path, min_samples=5, topn=5, simulation=True, export=True):
    self.min_samples = min_samples
    self.topn = topn
    self.data_path = data_path
    self.simulation = simulation
    self.export = export
    self.config = helpers.load_yaml("src/config.yml")
    self.parser = DatasetParser(self.data_path)
    # dict of classification tasks to be included in the classification
    self.classification_tasks_dict = self.parser.classification_tasks_dict
    # common featurizer objects for all tasks
    self.featurizer_tf = None
    self.featurizer_emb = None
    self.complete_df = None
    self.train_df = None
    self.val_df = None
    self.test_df = None
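# Usage sketch; the orchestrator's class name is not shown in this excerpt, so
# "Classifier" below is a placeholder:
#
#   clf = Classifier(data_path="data/", min_samples=5, topn=5,
#                    simulation=True, export=True)
#   # train_df / val_df / test_df start as None and are filled by a later split step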
def train(yaml):
    # `yaml` is the base name of a config file under scripts/train/
    yaml_path = os.path.join(ROOT_DIR, "scripts", "train", yaml + ".yaml")
    FLAGS = edict(load_yaml(yaml_path))
    train_from_yaml(FLAGS)
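# What edict(load_yaml(...)) yields: an EasyDict that allows attribute-style access
# to the YAML fields, so train_from_yaml can read FLAGS.lr instead of FLAGS["lr"].
# The field names below are illustrative, not the repo's actual yaml schema.
from easydict import EasyDict as edict

FLAGS = edict({"lr": 1e-3, "batch_size": 32})
print(FLAGS.lr, FLAGS.batch_size)  # 0.001 32

# e.g. train("baseline") would load scripts/train/baseline.yaml under ROOT_DIR
# ("baseline" is an illustrative file name).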