def semantic_labeling(train_dataset, test_dataset, train_dataset2=None, evaluate_train_set=False, reuse_rf_model=True):
    """Run semantic labeling: train on ``train_dataset`` and label ``test_dataset``.

    ``train_dataset`` is the dataset handed to
    ``SemanticLabeler.train_random_forest``. ``train_dataset2`` is the dataset
    whose indexed semantic types are used when predicting; it is optional and
    exists for the case where ``train_dataset`` and ``test_dataset`` have no
    overlapping semantic types. For example, if ``train_dataset`` is a soccer
    domain and ``test_dataset`` is a weather domain, the system cannot
    recognise the weather semantic types, so another dataset containing
    weather semantic types has to be supplied as ``train_dataset2``.

    The predictions are written to ``output/<dataset>_result.json``.

    :param train_dataset: str, dataset used to train the model
    :param test_dataset: str, dataset to generate semantic types for
    :param train_dataset2: Optional[str], dataset providing the labeled
        semantic types; defaults to ``train_dataset``
    :param evaluate_train_set: bool, also label ``train_dataset2`` against itself
    :param reuse_rf_model: bool, when False the persisted model file is deleted
        so that the model is retrained
    :return: the mapping returned by
        ``SemanticLabeler.test_semantic_types_from_2_sets`` for the test set,
        or for ``train_dataset2`` when ``evaluate_train_set`` is true (the
        second evaluation overwrites the first result)
    """
    logger = get_logger(
        "semantic-labeling-api",
        format_str='>>>>>> %(asctime)s - %(levelname)s:%(name)s:%(module)s:%(lineno)d: %(message)s')

    if train_dataset2 is None:
        train_dataset2 = train_dataset

    # De-duplicate while keeping a stable order, so datasets are read and
    # indexed in the same order on every run (a set has no defined order).
    datasets = []
    for dataset in (train_dataset, test_dataset, train_dataset2):
        if dataset not in datasets:
            datasets.append(dataset)

    semantic_labeler = SemanticLabeler()

    # read data into memory
    logger.info("Read data into memory")
    semantic_labeler.read_data_sources(datasets)

    # index datasets that haven't been indexed before
    not_indexed_datasets = [dataset for dataset in datasets if not is_indexed(dataset)]
    if not_indexed_datasets:
        logger.info("Index not-indexed datasets: %s", ",".join(not_indexed_datasets))
        semantic_labeler.train_semantic_types(not_indexed_datasets)

    # Remove the persisted model when it must not be reused.
    # SemanticLabeler.train_random_forest stores the model at
    # "model/lr_all.pkl"; "model/lr.pkl" is the path the previous version of
    # this function removed and is still cleaned up for backward compatibility.
    if not reuse_rf_model:
        for stale_model_path in ("model/lr_all.pkl", "model/lr.pkl"):
            if os.path.exists(stale_model_path):
                os.remove(stale_model_path)

    # train the model
    logger.info("Train randomforest... with args ([1], [%s]", train_dataset)
    semantic_labeler.train_random_forest([1], [train_dataset])

    # generate semantic typing; the labeled examples come from train_dataset2
    logger.info("Generate semantic typing using: trainset: %s, for testset: %s", train_dataset2, test_dataset)
    result = semantic_labeler.test_semantic_types_from_2_sets(train_dataset2, test_dataset)

    if not os.path.exists("output"):
        os.mkdir("output")

    with open("output/%s_result.json" % test_dataset, "w") as f:
        ujson.dump(result, f)

    if evaluate_train_set:
        logger.info("Generate semantic typing for trainset")
        result = semantic_labeler.test_semantic_types_from_2_sets(train_dataset2, train_dataset2)
        with open("output/%s_result.json" % train_dataset2, "w") as f:
            ujson.dump(result, f)

    return result
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run semantic_labeling.
    import argparse

    def _parse_bool(text):
        """Parse a command-line flag value; only "true" (any case) is True."""
        return text.lower() == "true"

    parser = argparse.ArgumentParser('Semantic labeling API')
    parser.add_argument('--train_dataset', type=str, help='trainset', required=True)
    parser.add_argument('--test_dataset', type=str, help='testset', required=True)
    parser.add_argument('--train_dataset2', type=str, default=None, help='default to train_dataset')
    parser.add_argument('--evaluate_train_set', type=_parse_bool, default=False, help='default False')
    parser.add_argument('--reuse_rf_model', type=_parse_bool, default=True, help='default True')

    args = parser.parse_args()
    if args.train_dataset2 is None:
        args.train_dataset2 = args.train_dataset

    logger = get_logger(
        "api-starter",
        format_str='>>>>>> %(asctime)s - %(levelname)s:%(name)s:%(module)s:%(lineno)d: %(message)s')
    logger.info("Calling semantic labeling API with args: %s", args)

    semantic_labeling(args.train_dataset, args.test_dataset, args.train_dataset2,
                      args.evaluate_train_set, args.reuse_rf_model)
class SemanticLabeler:
    """Reads datasets, indexes their columns and predicts column semantic types.

    Attributes:
        dataset_map: maps a dataset (folder) name to an ordered mapping of
            file name -> ``Source`` filled in by ``read_data_sources``.
        file_class_map: maps a file name to a class label, filled in by
            ``read_class_type_from_csv``.
        random_forest: the ``MyRandomForest`` created by
            ``train_random_forest`` (``None`` until then).
    """

    logger = get_logger("SemanticLabeler", level=logging.DEBUG)

    def __init__(self):
        self.dataset_map = {}
        self.file_class_map = {}
        self.random_forest = None

    @staticmethod
    def _top_prediction_score(semantic_types, true_type, threshold):
        """Score the top-ranked prediction of a column against its true type.

        Only the first entry of ``semantic_types`` is considered. Each entry is
        indexed as ``entry[0]`` (a score) and ``entry[1]`` (a container of
        semantic types).

        :return: ``1.0`` when the true type is among the top entry's types and
            its score is non-zero and above ``threshold``; ``1`` when that is
            not the case and the top score is below ``threshold``; ``0``
            otherwise, including when there are no predictions at all.
        """
        if not semantic_types:
            # No prediction to score; avoids an IndexError on semantic_types[0].
            return 0
        top_prediction = semantic_types[0]
        top_score = top_prediction[0]
        if true_type in top_prediction[1] and top_score > threshold and top_score != 0:
            return 1.0
        if top_score < threshold:
            return 1
        return 0

    def preprocess_memex_data_sources(self, folder_path):
        """Convert JSON-lines files in ``folder_path`` into per-source CSV files.

        Every line is parsed as a JSON object; objects are grouped by their
        ``"tld"`` value and each key starting with ``"inferlink"`` becomes a
        column named after the second ``_``-separated part of the key. Sources
        with at least one column are written under ``data/datasets/memex/``.

        :param folder_path: directory whose files are read
        """
        source_map = OrderedDict()
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            print(file_path)
            with open(file_path, "r") as f:
                for json_line in f:
                    json_obj = json.loads(json_line)
                    source_name = json_obj["tld"]
                    if source_name not in source_map:
                        source_map[source_name] = Source(source_name)
                    source = source_map[source_name]
                    for attr in json_obj:
                        if not attr.startswith("inferlink"):
                            continue
                        attr_name = attr.split("_")[1]
                        if attr_name not in source.column_map:
                            source.column_map[attr_name] = Column(attr_name, source.name)
                            source.column_map[attr_name].semantic_type = attr_name
                        column = source.column_map[attr_name]
                        for ele1 in json_obj[attr]:
                            # "result" is either a single dict or a sequence of dicts.
                            if isinstance(ele1["result"], dict):
                                column.add_value(ele1["result"]["value"])
                            else:
                                for ele2 in ele1["result"]:
                                    column.add_value(ele2["value"])

        for source in source_map.values():
            if source.column_map:
                source.write_csv_file("data/datasets/memex/%s" % source.name)

    def read_data_sources(self, folder_paths):
        """Load every dataset in ``folder_paths`` into ``self.dataset_map``.

        For a dataset ``name`` the tables are read from
        ``data/datasets/<name>/tables`` and, if present, the semantic types
        from ``data/datasets/<name>/models``. Prints the number of distinct
        semantic types and the number of columns seen.

        :param folder_paths: iterable of dataset folder names
        """
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)
            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "tables")
            model_folder_path = os.path.join(folder_path, "models")

            for filename in sorted(os.listdir(data_folder_path)):
                extension = os.path.splitext(filename)[1]
                if ".DS" in filename:
                    continue
                self.logger.debug(" -> read: %s", filename)
                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)
                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                if 'rowNumber' in source.column_map:
                    del source.column_map['rowNumber']

                # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
                # Iterate over a copy of the keys because entries are deleted.
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type and len(column.value_list) == 0:
                        del source.column_map[key]
                        source.empty_val_columns[key] = column
                        logging.warning(
                            "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                            column.name, source.name)

                for column in source.column_map.values():
                    semantic_type_set.add(column.semantic_type)
                attr_count += len(source.column_map.values())

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue
                    # A model file is matched to its table either by stripping
                    # two extensions from its name or by its full name.
                    stripped_name = os.path.splitext(os.path.splitext(filename)[0])[0]
                    try:
                        source = source_map[stripped_name]
                    except KeyError:
                        source = source_map[filename]

                    extension = os.path.splitext(filename)[1]
                    model_path = os.path.join(model_folder_path, filename)
                    if extension == ".json":
                        source.read_semantic_type_json(model_path)
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(model_path)

            self.dataset_map[folder_name] = source_map

        # print semantic_type_set
        print(len(semantic_type_set))
        print(attr_count)

    def train_random_forest(self, train_sizes, data_sets):
        """Create the ``MyRandomForest`` model and train (or load) it.

        The model is persisted at ``model/lr_all.pkl``.
        """
        self.random_forest = MyRandomForest(data_sets, self.dataset_map, "model/lr_all.pkl")
        self.random_forest.train(train_sizes)

    def train_semantic_types(self, dataset_list):
        """Index every source of the given, already loaded, datasets.

        :param dataset_list: names of datasets present in ``self.dataset_map``
        """
        print("train_semantic_types")
        for name in dataset_list:
            self.logger.debug("Indexing dataset %s", name)
            index_name = re.sub(not_allowed_chars, "!", name)
            indexer.init_analyzers({'name': index_name})

            for key, source in self.dataset_map[name].items():
                print("Index ", key)
                # A fresh config dict is passed on every call, as before.
                successful = source.save(index_config={'name': index_name})
                if not successful:
                    self.logger.info("Error while parsing file %s", key)
                    print("Error while parsing file.")
                self.logger.debug(" + finish index source: %s", key)

    def predict_semantic_type_for_column(self, column):
        """Predict the semantic type of a single column.

        NOTE(review): the index is addressed by the literal string
        ``"index_name"`` and an empty list of training sources — confirm this
        is intended.
        """
        train_examples_map = searcher.search_types_data("index_name", [])
        textual_train_map = searcher.search_similar_text_data("index_name", column.value_text, [])
        return column.predict_type(train_examples_map, textual_train_map, self.random_forest)

    def test_semantic_types(self, data_set, test_sizes):
        """Evaluate predictions within one dataset and write scores to ``file_write``.

        For every size in ``test_sizes`` each source is labeled using the
        ``size`` sources that follow it (wrapping around) as training sources.

        :param data_set: name of a dataset present in ``self.dataset_map``
        :param test_sizes: iterable of numbers of training sources to use
        """
        print("test_semantic_types")
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))

        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        # Doubling the list lets the slice below wrap around the end.
        # list(...) is required because dict views cannot be multiplied.
        double_name_list = list(source_map.values()) * 2
        thresholds = [0.0]

        file_write.write("Dataset: " + data_set + "\n")

        for size in test_sizes:
            start_time = time.time()

            for idx, source_name in enumerate(source_map.keys()):
                train_names = [
                    train_source.index_name
                    for train_source in double_name_list[idx + 1:idx + size + 1]
                ]
                train_examples_map = searcher.search_types_data(index_config, train_names)
                source = source_map[source_name]

                for column in source.column_map.values():
                    if not column.semantic_type:
                        continue
                    textual_train_map = searcher.search_similar_text_data(
                        index_config, column.value_text, train_names)
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map, self.random_forest)

                    for threshold in thresholds:
                        rank_score = self._top_prediction_score(
                            semantic_types, column.semantic_type, threshold)
                        file_write.write(str(rank_score) + "\n")
                        rank_score_map[size][threshold] += rank_score
                        count_map[size][threshold] += 1

            running_time = time.time() - start_time
            for threshold in thresholds:
                count = count_map[size][threshold]
                # Guard against a dataset in which no column was scored.
                mean_score = rank_score_map[size][threshold] * 1.0 / count if count else 0.0
                file_write.write("Size: " + str(size) + " F-measure: " + str(mean_score) +
                                 " Time: " + str(running_time) + " Count: " + str(count) + "\n")

    def read_class_type_from_csv(self, file_path):
        """Reload ``self.file_class_map`` from a CSV file.

        The first CSV column is the file name (``.tar.gz`` is replaced by
        ``.csv``) and the second column is the value stored for it.
        """
        self.file_class_map = {}
        with open(file_path, "r") as f:
            for row in csv.reader(f):
                self.file_class_map[row[0].replace(".tar.gz", ".csv")] = row[1]

    def test_semantic_types_from_2_sets(self, train_set, test_set):
        """Predict semantic types for ``test_set`` using ``train_set`` as examples.

        Columns without a semantic type or without values are skipped. The
        predictions and their scores at several thresholds are also written to
        ``file_write``. If predicting a column raises ``KeyError`` the rest of
        that source's columns are skipped and the results collected so far
        for the source are kept.

        :param train_set: name of the dataset providing labeled examples
        :param test_set: name of the dataset to label
        :return: dict mapping source name -> {column name -> predictions}
        """
        source_result_map = {}
        train_index_config = {'name': train_set}
        train_names = [source.index_name for source in self.dataset_map[train_set].values()]
        thresholds = [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]

        self.logger.info("Train source: %s", train_names)

        test_source_map = self.dataset_map[test_set]
        for source_name in test_source_map:
            train_examples_map = searcher.search_types_data(train_index_config, train_names)
            source = test_source_map[source_name]
            self.logger.info("Test source: %s", source_name)

            column_result_map = {}
            for column in source.column_map.values():
                if not column.semantic_type or not column.value_list:
                    continue

                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)

                try:
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map, self.random_forest)
                except KeyError:
                    print("KEY ERROR")
                    self.logger.warning(
                        "KeyError while predicting column %s of source %s; "
                        "skipping the remaining columns of this source",
                        column.name, source_name)
                    break

                column_result_map[column.name] = semantic_types
                self.logger.info(" -> column: %s", column.name)

                file_write.write(column.name + "\t" + column.semantic_type + "\t" +
                                 str(semantic_types) + "\n")

                for threshold in thresholds:
                    rank_score = self._top_prediction_score(
                        semantic_types, column.semantic_type, threshold)
                    file_write.write(str(rank_score) + "\n")

            source_result_map[source_name] = column_result_map

        return source_result_map

    def write_data_for_transform(self, name):
        """Call ``write_to_data_file`` on every column of dataset ``name``."""
        for source in self.dataset_map[name].values():
            for attribute in source.column_map.values():
                attribute.write_to_data_file()
class MyRandomForest:
    """Binary classifier that scores candidate semantic types for a column.

    NOTE: despite the class name, the model fitted in ``train`` is a
    ``LogisticRegression``.

    Attributes:
        data_sets: names of the datasets used to generate training data.
        dataset_map: maps a dataset name to its mapping of sources.
        model_path: file the trained model is loaded from / saved to.
        model: the fitted model (``None`` until ``train`` is called).
        feature_selector: unused in this class; kept for compatibility.
    """

    logger = get_logger("rf", level=logging.DEBUG)

    def __init__(self, data_sets=None, dataset_map=None, model_path=None):
        self.data_sets = data_sets
        self.dataset_map = dataset_map
        self.model_path = model_path
        self.model = None
        self.feature_selector = None

    def generate_train_data(self, train_sizes):
        """Build the labeled feature vectors used to fit the model.

        For every dataset, every size in ``train_sizes`` and every source, the
        ``size`` sources that follow it (wrapping around) are used as the
        labeled examples; each column with a semantic type contributes the
        feature vectors returned by ``column.generate_candidate_types``.

        :param train_sizes: iterable of numbers of training sources to use
        :return: list with the feature vectors of all datasets
        """
        self.logger.info("generate_train_data")
        # Accumulated across ALL datasets; it must not be reset per dataset,
        # otherwise only the last dataset would contribute training data.
        train_data = []
        for data_set in self.data_sets:
            print("data_set: ", data_set)
            index_config = {'name': data_set}
            source_map = self.dataset_map[data_set]
            # Doubling the list lets the slice below wrap around the end.
            # list(...) is required because dict views cannot be multiplied.
            double_name_list = list(source_map.values()) * 2

            for size in train_sizes:
                for idx, source_name in enumerate(source_map.keys()):
                    train_names = [
                        train_source.index_name
                        for train_source in double_name_list[idx + 1:idx + size + 1]
                    ]
                    train_examples_map = searcher.search_types_data(index_config, train_names)
                    source = source_map[source_name]
                    print("Source: ", source)

                    for column in source.column_map.values():
                        print("COLUMN: ", column)
                        if not column.semantic_type:
                            continue
                        textual_train_map = searcher.search_similar_text_data(
                            index_config, column.value_text, train_names)
                        feature_vectors = column.generate_candidate_types(
                            train_examples_map, textual_train_map, is_labeled=True)
                        train_data += feature_vectors
        return train_data

    def train(self, train_sizes):
        """Load the model from ``self.model_path`` or fit and persist a new one.

        When ``self.model_path`` is ``None`` the model is always fitted and is
        not written to disk.

        :param train_sizes: forwarded to ``generate_train_data``
        """
        if self.model_path and os.path.exists(self.model_path):
            print("Loading ...")
            # SECURITY: joblib.load unpickles the file; only load model files
            # that come from a trusted source.
            self.model = joblib.load(self.model_path)
            return

        train_df = pd.DataFrame(self.generate_train_data(train_sizes))
        train_df = train_df.replace([np.inf, -np.inf, np.nan], 0)

        self.model = LogisticRegression(class_weight="balanced")
        features = tree_feature_list if is_tree_based else feature_list
        self.model.fit(train_df[features], train_df['label'])

        if self.model_path:
            model_dir = os.path.dirname(self.model_path)
            if model_dir and not os.path.isdir(model_dir):
                os.makedirs(model_dir)
            joblib.dump(self.model, self.model_path)

    def predict(self, test_data, true_type):
        """Score candidate types and return them sorted by descending probability.

        Every call also appends the scored rows to ``debug.csv`` in the
        current directory (the file is created with a header when missing).

        :param test_data: feature vectors of the candidates; each needs a
            ``name`` entry and the model's feature columns
        :param true_type: expected semantic type; compared with the part of
            ``name`` before the first ``"!"`` to fill the ``truth`` column
        :return: list of ``{"prob": ..., "name": ...}`` dicts, best first
        :raises KeyError: when ``test_data`` is empty, because the feature
            columns are missing; this is kept as-is since callers in this file
            catch ``KeyError`` from the prediction call — presumably for this
            case
        """
        test_df = pd.DataFrame(test_data)
        test_df = test_df.replace([np.inf, -np.inf, np.nan], 0)

        if test_df.empty:
            self.logger.info("Error")

        features = tree_feature_list if is_tree_based else feature_list
        # ``.values`` replaces ``.as_matrix()``, which was removed in pandas 1.0.
        probabilities = self.model.predict_proba(test_df[features].values)
        test_df['prob'] = [row[1] for row in probabilities]

        test_df['truth'] = test_df['name'].map(lambda row: row.split("!")[0] == true_type)
        test_df = test_df.sort_values(["prob"], ascending=[False])

        if os.path.exists("debug.csv"):
            test_df.to_csv("debug.csv", mode='a', header=False)
        else:
            test_df.to_csv("debug.csv", mode='w', header=True)

        # list(...) keeps the result indexable on Python 3 (dict views are not).
        return list(test_df[["prob", 'name']].T.to_dict().values())