Пример #1
0
 def __init__(self, data_path):
     """Set up the object from the dataset found at ``data_path``.

     The main dataframe is built first via ``create_main_df``; the
     classification tasks are resolved last via
     ``get_classification_tasks``, after the configuration has been loaded.

     Arguments:
         data_path -- forwarded unchanged to ``create_main_df``
     """
     config_file = "src/config.yml"

     self.main_df = self.create_main_df(data_path)
     # placeholders: neither dataframe is populated by the constructor
     self.row_df = self.formula_df = None
     self.regex_obj = Regex()
     self.config = helpers.load_yaml(config_file)
     self.classification_tasks_dict = self.get_classification_tasks()
Пример #2
0
 def __init__(self, num_clusters=10):
     """Prepare a clustering helper backed by ``KmeansCluster``.

     Arguments:
         num_clusters {int} -- stored on the instance and passed to
                               ``KmeansCluster`` (default: 10)
     """
     config_file = "src/config.yml"

     self.num_clusters = num_clusters
     self.config = helpers.load_yaml(config_file)
     # featurizer objects shared by all tasks; not created here
     self.featurizer_tf = self.featurizer_emb = None
     self.clusterer = KmeansCluster(num_clusters=num_clusters)
Пример #3
0
 def __init__(self, max_features=None, mode="word-embeddings"):
     """Create the featurizer that corresponds to ``mode``.

     Arguments:
         max_features -- accepted for the caller's convenience but not read
                         anywhere in this constructor
                         NOTE(review): confirm whether it was meant to be
                         forwarded to the TF-IDF vectorizers
         mode {str} -- "tfidf" builds a ``FeatureUnion`` of a word-level
                       (1-2 gram) and a char-level (3-gram) TF-IDF
                       vectorizer; "word-embeddings" uses
                       ``SentenceEmbedding``; any other value leaves
                       ``featurizer`` set to None
     """
     self.nlp = helpers.get_nlp()
     self.mode = mode

     if self.mode == "tfidf":
         # settings common to the word-level and char-level vectorizers
         shared_kwargs = {
             "sublinear_tf": True,
             "min_df": 1,
             "smooth_idf": True,
             "norm": "l2",
             "encoding": "utf-8",
         }
         self.tfidf_words = TfidfVectorizer(analyzer="word",
                                            ngram_range=(1, 2),
                                            **shared_kwargs)
         self.tfidf_chars = TfidfVectorizer(analyzer="char",
                                            ngram_range=(3, 3),
                                            **shared_kwargs)
         union_steps = [("words", self.tfidf_words),
                        ("chars", self.tfidf_chars)]
         self.featurizer = FeatureUnion(union_steps)
     elif self.mode == "word-embeddings":
         self.featurizer = SentenceEmbedding()
     else:
         # unrecognised mode: no featurizer is configured
         self.featurizer = None

     self.features = None
     self.logger = logging.getLogger(__name__)
     self.config = helpers.load_yaml("src/config.yml")
 def __init__(self, task, cv=3):
     """Store the task and cross-validation setting; no model is built here.

     Arguments:
         task -- name of the property this object is associated with
         cv {int} -- stored as ``self.cv`` (default: 3)
     """
     # name of the property
     self.task = task
     self.cv = cv
     # both model slots start empty
     self.model = self.calibrated_model = None
     self.config = helpers.load_yaml("src/config.yml")
Пример #5
0
    def __init__(self,
                 init_name,
                 name,
                 hash_name,
                 label_task,
                 priority,
                 ver_cost,
                 der_cost,
                 val_acc=0.0):
        """Describe one classification task and its (initially empty) state.

        Arguments:
            init_name {str} -- name that appears in the original csv files
            name {str} -- new name that is used after parsing the original
                          csv files
            hash_name {str} -- name of the column that contains the hash
                               value of the classification task
            label_task {str} -- classification type, either "single-label"
                                or "multi-label"
            priority {int} -- a lower number is a more important task
            ver_cost {int} -- verification cost
            der_cost {int} -- derivation cost
            val_acc {float} -- accuracy in the validation set (default: 0.0)
        """
        # --- descriptive fields taken straight from the arguments ---
        self.init_name = init_name
        self.name = name
        self.hash_name = hash_name
        self.label_task = label_task
        self.priority = priority
        self.ver_cost = ver_cost
        self.der_cost = der_cost
        self.topn = None
        self.val_acc = val_acc
        # the task's values count as hashed only when the hash column is
        # distinct from the task's own column
        self.has_hash = name != hash_name

        self.config = helpers.load_yaml("src/config.yml")

        self.all_values = set()

        # --- names under which the task's components are identified ---
        base_name = self.name
        self.classifier_name = base_name + "_classifier"
        self.featurizer_tf_name = base_name + "_featurizer_tf"
        self.featurizer_emb_name = base_name + "_featurizer_emb"

        # --- components needed for classification; none exist yet ---
        self.is_trained = False
        self.featurizer_tf = None
        self.featurizer_emb = None
        self.classifier = None

        # --- two-way translation between a label and its unique value ---
        self.label_to_hash_dict = {}
        self.hash_to_label_dict = {}
        self.hash_to_label_dict_name = base_name + "_hash_to_label_dict.json"
        self.label_to_hash_dict_name = base_name + "_label_to_hash_dict.json"
        # incremented every time a new label is added to the task's hash dict
        self.hash_counter = 0
    def __init__(self,
                 data_path,
                 min_samples=5,
                 topn=5,
                 simulation=True,
                 export=True):
        """Store the run settings and parse the dataset at ``data_path``.

        Arguments:
            data_path -- stored on the instance and handed to
                         ``DatasetParser``
            min_samples {int} -- stored as ``self.min_samples`` (default: 5)
            topn {int} -- stored as ``self.topn`` (default: 5)
            simulation {bool} -- stored as ``self.simulation``
                                 (default: True)
            export {bool} -- stored as ``self.export`` (default: True)
        """
        # plain settings copied from the arguments
        self.data_path = data_path
        self.min_samples = min_samples
        self.topn = topn
        self.simulation = simulation
        self.export = export
        self.config = helpers.load_yaml("src/config.yml")

        dataset_parser = DatasetParser(self.data_path)
        self.parser = dataset_parser
        # classification tasks to be included in the classification,
        # exactly as exposed by the parser
        self.classification_tasks_dict = dataset_parser.classification_tasks_dict

        # featurizer objects shared by all tasks; not created here
        self.featurizer_tf = self.featurizer_emb = None

        # dataframes start out unset
        self.complete_df = None
        self.train_df = None
        self.val_df = None
        self.test_df = None
Пример #7
0
def train(yaml):
    """Run training from a named YAML configuration.

    Arguments:
        yaml {str} -- configuration name without the ".yaml" extension; it
                      is resolved to ``ROOT_DIR/scripts/train/<yaml>.yaml``
    """
    config_filename = yaml + ".yaml"
    config_path = os.path.join(ROOT_DIR, "scripts", "train", config_filename)
    # wrap the loaded mapping in ``edict`` before handing it to the trainer
    flags = edict(load_yaml(config_path))
    train_from_yaml(flags)