def online_learning(model: Model, dataset: str, train_sms: List[SemanticModel],
                    discover_sources: List[SemanticModel], output_dir: Path, training_args,
                    iter_range=(1, 3)):
    data: Dict[str, Dict[bytes, Example]] = {sm.id: {} for sm in discover_sources}
    discover_sids = {sm.id for sm in discover_sources}
    # sources that should be excluded from the discovery process because they yield no new candidates
    ignore_sids = set()
    logger = get_logger("app")
    (output_dir / "examples").mkdir(exist_ok=True, parents=True)

    # by default, every discover source starts with its ground-truth model as an example
    for sm in discover_sources:
        data[sm.id][graph_to_hashable_string(sm.graph)] = make_example(
            sm, sm.graph, Example.generate_example_id(sm.id, 0, 0), [s.id for s in train_sms])

    for n_iter in range(*iter_range):
        logger.info("==================================> Iter: %s", n_iter)
        new_data = generate_data(model, dataset, train_sms, discover_sources, n_iter)

        for sm in discover_sources:
            if sm.id in ignore_sids:
                continue
            new_candidate_sms = [key for key in new_data[sm.id] if key not in data[sm.id]]
            if len(new_candidate_sms) == 0:
                # no new candidate semantic models for this source
                logger.info("No new candidate for source: %s", sm.id)
                ignore_sids.add(sm.id)
            else:
                for key in new_candidate_sms:
                    data[sm.id][key] = new_data[sm.id][key]

        train_examples = [
            example for sm in train_sms if sm.id in discover_sids
            for example in data[sm.id].values()
        ]
        train_examples.sort(key=lambda e: e.example_id)
        serializeJSON(train_examples, output_dir / "examples" / f"train.{n_iter}.json")
        shutil.copyfile(output_dir / "examples" / f"train.{n_iter}.json",
                        output_dir / "examples" / "train.json")

        raw_model, tf_domain, pairwise_domain, __ = train_model(
            dataset, [sm.id for sm in train_sms], 120, train_examples, [], training_args,
            output_dir / "models")
        model = Model(dataset, raw_model, tf_domain, pairwise_domain)

    return model
class EarlyStopping(object):
    logger = get_logger("app.assembling.training_workflow.early_stopping")

    def __init__(self) -> None:
        self.prev_max_score = 1

    def early_stopping(self, n_iter, search_nodes: Iterable[PGMSearchNode]):
        # return True
        # return n_iter >= 4
        try:
            search_node = next(iter(search_nodes))
        except StopIteration:
            return True

        current_score = search_node.get_score()
        if self.prev_max_score - current_score > 0.3 and current_score < 0.3:
            return True

        self.prev_max_score = current_score
        return False
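# Usage sketch (not part of the original code): how the early-stopping check above could be
# consulted inside a beam-search loop. `search_step`, `initial_beam` and `max_iter` are
# hypothetical names; only EarlyStopping and the PGMSearchNode interface come from this repo.
def search_with_early_stopping(initial_beam, search_step, max_iter: int = 10):
    stopper = EarlyStopping()
    beam = initial_beam
    for n_iter in range(max_iter):
        beam = search_step(beam)  # expand and score candidates, keep the best ones
        if stopper.early_stopping(n_iter, beam):
            # stop when the best score drops sharply below the previous best (the rule above)
            break
    return beam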
class NodeProb(object):
    logger = get_logger("app.assembling.weak_models.node_prob")

    def __init__(self, example_annotator: 'ExampleAnnotator', load_classifier: bool = False):
        self.example_annotator = example_annotator
        self.multival_predicate = example_annotator.multival_predicate

        if load_classifier:
            retrain = example_annotator.training_examples is not None
            self.scaler, self.classifier = self.get_classifier(
                retrain=retrain, train_examples=example_annotator.training_examples)
        else:
            self.scaler, self.classifier = None, None

    def feature_extraction(self, graph: Graph, stype_score: Dict[int, Optional[float]]):
        node2features = {}
        for node in graph.iter_class_nodes():
            prob_data_nodes = _(node.iter_outgoing_links()) \
                .imap(lambda x: x.get_target_node()) \
                .ifilter(lambda x: x.is_data_node()) \
                .reduce(lambda a, b: a + (stype_score[b.id] or 0), 0)
            similar_nodes = graph.iter_nodes_by_label(node.label)
            minimum_merged_cost = min(
                get_merged_cost(node, similar_node, self.multival_predicate)
                for similar_node in similar_nodes)

            node2features[node.id] = [('prob_data_nodes', prob_data_nodes),
                                      ('minimum_merged_cost', minimum_merged_cost)]
        return node2features

    def compute_prob(self, node2features):
        X = numpy.asarray([[p[1] for p in features] for features in node2features.values()])
        # StandardScaler.transform returns a new array; assign it back so the features are scaled
        X = self.scaler.transform(X)
        y_pred = self.classifier.predict_proba(X)[:, 1]
        return {nid: y_pred[i] for i, nid in enumerate(node2features.keys())}

    def get_classifier(self, retrain: bool, train_examples: List[Example]):
        # TODO: implement this properly; currently we have to train and save the model manually
        cached_file = get_cache_dir(
            self.example_annotator.dataset,
            list(self.example_annotator.train_source_ids)
        ) / "weak_models" / "node_prob_classifier.pkl"

        if not cached_file.exists() or retrain:
            self.logger.debug("Retrain new model")
            raw_X_train = make_data(self, train_examples)
            classifier = LogisticRegression(fit_intercept=True)

            X_train = numpy.asarray([list(features.values())[1:] for features in raw_X_train])
            X_train, y_train = X_train[:, :-1], [int(x) for x in X_train[:, -1]]

            scaler = StandardScaler().fit(X_train)
            # as above, transform returns a new array instead of scaling in place
            X_train = scaler.transform(X_train)

            try:
                classifier.fit(X_train, y_train)
            except ValueError as e:
                assert str(e).startswith(
                    "This solver needs samples of at least 2 classes in the data")
                # this happens at the starting phase when we have no labeled data yet and only
                # ground-truth examples (a single class) are available to build the training set
                X_train = numpy.vstack([X_train, [0, 0]])
                y_train.append(0)
                classifier.fit(X_train, y_train)

            cached_file.parent.mkdir(exist_ok=True, parents=True)
            serialize((scaler, classifier), cached_file)
            return scaler, classifier

        return deserialize(cached_file)
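# Usage sketch (assumption, not in the original source): NodeProb scores each class node of a
# candidate graph from two features (sum of semantic-type scores of its data nodes and the minimum
# merge cost). `annotator`, `candidate_graph` and `stype_score` stand for objects created elsewhere
# in the pipeline.
node_prob = NodeProb(annotator, load_classifier=True)
# stype_score maps data-node id -> semantic-labeling confidence (or None)
node2features = node_prob.feature_extraction(candidate_graph, stype_score)
node2prob = node_prob.compute_prob(node2features)  # class-node id -> probability of being correct
for nid, prob in node2prob.items():
    print(nid, prob)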
class Settings(object):
    logger = get_logger("app.assembling.settings")
    instance = None

    # ####################################################################
    # Semantic labeling constants
    ReImplMinhISWC = "ReImplMinhISWC"
    MohsenJWS = "MohsenJWS"
    OracleSL = "OracleSL"

    # Searching constants
    ALGO_ES_DISABLE = "NoEarlyStopping"
    ALGO_ES_MIN_PROB = "MinProb"

    # Auto-labeling constants
    ALGO_AUTO_LBL_MAX_F1 = "AUTO_LBL_MAX_F1"
    ALGO_AUTO_LBL_PRESERVED_STRUCTURE = "AUTO_LBL_PRESERVED_STRUCTURE"
    # ####################################################################

    def __init__(self):
        # ####################################################################
        # General arguments
        self.random_seed: int = 120
        self.n_samples: int = 1000

        # ####################################################################
        # Semantic labeling arguments
        self.semantic_labeling_method: str = Settings.ReImplMinhISWC
        self.semantic_labeling_top_n_stypes: int = 4
        self.semantic_labeling_simulate_testing: bool = False

        # ####################################################################
        # Auto-labeling arguments
        self.auto_labeling_method: str = Settings.ALGO_AUTO_LBL_MAX_F1

        # ####################################################################
        # Weak-model arguments
        self.data_constraint_guess_datetime_threshold: float = 0.5
        self.data_constraint_valid_threshold: float = 0.95
        self.data_constraint_n_comparison_samples: int = 150

        # ####################################################################
        # Graphical model arguments
        self.mrf_max_n_props = 10
        self.mrf_max_n_duplications = 5
        self.mrf_max_n_duplication_types = 4

        # ####################################################################
        # Searching arguments
        self.training_beam_width: int = 10
        self.searching_beam_width: int = 10
        self.searching_max_data_node_hop: int = 2
        self.searching_max_class_node_hop: int = 2
        self.searching_n_explore_result = 5
        self.searching_triple_adviser_max_candidate: int = 15
        self.searching_early_stopping_method: str = Settings.ALGO_ES_DISABLE
        self.searching_early_stopping_minimum_expected_accuracy = 0
        self.searching_early_stopping_min_prob_args: Tuple[float] = (0.01, )

        # ####################################################################
        # Parallelism
        self.parallel_gmtk_n_threads: int = 8
        self.parallel_n_process: int = 4
        self.parallel_n_annotators: int = 8
        self.max_n_tasks: int = 80  # tune this parameter if it consumes lots of memory

    def log_current_settings(self):
        self.logger.info("Current settings: %s", self.to_string())

    def set_setting(self, key: str, value, log_change: bool = True):
        assert key in self.__dict__
        self.__dict__[key] = value
        if log_change:
            self.log_current_settings()

    @staticmethod
    def get_instance(print_settings: bool = True) -> 'Settings':
        if Settings.instance is None:
            Settings.instance = Settings()
            if print_settings:
                Settings.instance.log_current_settings()
        return Settings.instance

    @staticmethod
    def parse_shell_args(print_settings: bool = True):
        def str2bool(v):
            assert v.lower() in {"true", "false"}
            return v.lower() == "true"

        parser = argparse.ArgumentParser('Settings')
        parser.register("type", "boolean", str2bool)
        parser.add_argument('--random_seed', type=int, default=120, help='default 120')
        parser.add_argument('--n_samples', type=int, default=1000, help='default 1000')
        parser.add_argument('--semantic_labeling_method', type=str, default='ReImplMinhISWC',
                            help='can be OracleSL, ReImplMinhISWC or MohsenJWS '
                                 '(default ReImplMinhISWC)')
        parser.add_argument('--semantic_labeling_top_n_stypes', type=int, default=4,
                            help='default is top 4')
        parser.add_argument('--semantic_labeling_simulate_testing', type='boolean', default=False,
                            help='default is False')
        parser.add_argument('--auto_labeling_method', type=str, default='AUTO_LBL_MAX_F1',
                            help='can be AUTO_LBL_MAX_F1 or AUTO_LBL_PRESERVED_STRUCTURE '
                                 '(default AUTO_LBL_MAX_F1)')
        parser.add_argument('--data_constraint_guess_datetime_threshold', type=float, default=0.5,
                            help='default 0.5')
        parser.add_argument('--data_constraint_valid_threshold', type=float, default=0.95,
                            help='default is 0.95')
        parser.add_argument('--data_constraint_n_comparison_samples', type=int, default=150,
                            help='default is 150')
        parser.add_argument('--training_beam_width', type=int, default=10, help='default 10')
        parser.add_argument('--searching_beam_width', type=int, default=10, help='default 10')
        parser.add_argument('--searching_max_data_node_hop', type=int, default=2, help='default 2')
        parser.add_argument('--searching_max_class_node_hop', type=int, default=2, help='default 2')
        parser.add_argument('--searching_n_explore_result', type=int, default=5, help='default 5')
        parser.add_argument('--searching_triple_adviser_max_candidate', type=int, default=15,
                            help='default 15')
        parser.add_argument('--searching_early_stopping_method', type=str, default='NoEarlyStopping',
                            help='can be NoEarlyStopping or MinProb (default NoEarlyStopping)')
        parser.add_argument('--searching_early_stopping_minimum_expected_accuracy', type=float,
                            default=0, help='default 0')
        parser.add_argument('--searching_early_stopping_min_prob_args', type=str, default="[0.01]",
                            help='default is [0.01]')
        parser.add_argument('--parallel_gmtk_n_threads', type=int, default=8,
                            help='default is 8 threads')
        parser.add_argument('--parallel_n_process', type=int, default=4,
                            help='default is 4 processes')
        parser.add_argument('--parallel_n_annotators', type=int, default=8, help='default is 8')
        parser.add_argument('--max_n_tasks', type=int, default=80, help='default is 80')

        args = parser.parse_args()
        args.searching_early_stopping_min_prob_args = ujson.loads(
            args.searching_early_stopping_min_prob_args)

        assert args.semantic_labeling_method in {
            Settings.ReImplMinhISWC, Settings.MohsenJWS, Settings.OracleSL
        }
        assert args.auto_labeling_method in {
            Settings.ALGO_AUTO_LBL_MAX_F1, Settings.ALGO_AUTO_LBL_PRESERVED_STRUCTURE
        }
        assert args.searching_early_stopping_method in {
            Settings.ALGO_ES_DISABLE, Settings.ALGO_ES_MIN_PROB
        }

        Settings.get_instance(False)
        settings = Settings.instance
        settings.random_seed = args.random_seed
        settings.n_samples = args.n_samples
        settings.semantic_labeling_method = args.semantic_labeling_method
        settings.semantic_labeling_top_n_stypes = args.semantic_labeling_top_n_stypes
        settings.semantic_labeling_simulate_testing = args.semantic_labeling_simulate_testing
        settings.auto_labeling_method = args.auto_labeling_method
        settings.data_constraint_guess_datetime_threshold = args.data_constraint_guess_datetime_threshold
        settings.data_constraint_valid_threshold = args.data_constraint_valid_threshold
        settings.data_constraint_n_comparison_samples = args.data_constraint_n_comparison_samples
        settings.searching_beam_width = args.searching_beam_width
        settings.searching_max_data_node_hop = args.searching_max_data_node_hop
        settings.searching_max_class_node_hop = args.searching_max_class_node_hop
        settings.searching_n_explore_result = args.searching_n_explore_result
        settings.searching_triple_adviser_max_candidate = args.searching_triple_adviser_max_candidate
        settings.searching_early_stopping_method = args.searching_early_stopping_method
        settings.searching_early_stopping_minimum_expected_accuracy = \
            args.searching_early_stopping_minimum_expected_accuracy
        settings.searching_early_stopping_min_prob_args = args.searching_early_stopping_min_prob_args
        settings.parallel_gmtk_n_threads = args.parallel_gmtk_n_threads
        settings.parallel_n_process = args.parallel_n_process
        settings.parallel_n_annotators = args.parallel_n_annotators
        settings.max_n_tasks = args.max_n_tasks

        if print_settings:
            settings.log_current_settings()
        return settings

    def to_string(self):
        return f"""
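# Usage sketch (assumption): Settings is a process-wide singleton; experiment scripts can either
# override fields programmatically or let parse_shell_args() read them from the command line, e.g.
#   python run_experiment.py --searching_beam_width 20 --searching_early_stopping_method MinProb
# (run_experiment.py is a hypothetical entry point.)
settings = Settings.get_instance()
settings.set_setting("searching_beam_width", 20)                      # logs the new configuration
settings.set_setting("semantic_labeling_top_n_stypes", 2, log_change=False)

# or, inside a __main__ entry point:
# settings = Settings.parse_shell_args()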
delete_worksheet = driver.find_elements_by_css_selector(
    "#WorksheetOptionsDiv > ul.dropdown-menu > li")[-3]
assert delete_worksheet.text.strip() == "Delete Worksheet"
delete_worksheet.click()
short_delay()
alert = driver.switch_to.alert
alert.accept()
delay()
remove_all_noti()


# SETUP hyper-parameters
logger = get_logger("app.preprocessing.generate_r2rml")
dataset = "museum_edm"
ont = get_ontology(dataset)

#%% INIT SELENIUM
driver = webdriver.Firefox()
driver.get("http://localhost:8080")
time.sleep(5)

#%% LOAD FILES
model_dir = Path(config.datasets[dataset].models_y2rml.as_path())
r2rml_dir = Path(config.datasets[dataset].as_path()) / "karma-version" / "models-r2rml"
karma_source_dir = Path(config.datasets[dataset].as_path()) / "karma-version" / "sources"
class KR2RML(R2RML): """Load KR2RML and produce default command history""" logger = get_logger("app.transformation.kr2rml") def __init__(self, ont: Ontology, tbl: DataTable, kr2rml_file: Path) -> None: g = rdflib.Graph(store=IOMemory()) g.parse(location=str(kr2rml_file), format="n3") worksheet_history = list( g.triples( (None, URIRef( "http://isi.edu/integration/karma/dev#hasWorksheetHistory" ), None))) assert len(worksheet_history) == 1 worksheet_history = ujson.loads(worksheet_history[0][-1]) input_columns = list( g.triples(( None, URIRef("http://isi.edu/integration/karma/dev#hasInputColumns"), None))) assert len(input_columns) == 1 input_columns = ujson.loads(input_columns[0][-1]) # construct mapping between kr2rml attribute paths to tbl_attr_paths tbl_attr_paths = tbl.schema.get_attr_paths() n_attr_paths = len(tbl_attr_paths) tbl_attr_paths = { apath.replace("@", ""): apath for apath in tbl_attr_paths } assert len(tbl_attr_paths) == n_attr_paths start_idx = 0 for i, cname in enumerate(input_columns[0]): cpath = Schema.PATH_DELIMITER.join( cname['columnName'] for cname in input_columns[0][i:]) # cname = Schema.PATH_DELIMITERinput_columns[i:]) cname['columnName'] + Schema.PATH_DELIMITER found_attr = False for attr_path in tbl_attr_paths: if (attr_path + Schema.PATH_DELIMITER).startswith(cpath): found_attr = True break if found_attr: start_idx = i break literal_nodes = {} col2col = {} for col in input_columns: attr_path = Schema.PATH_DELIMITER.join( cname['columnName'] for cname in col[start_idx:]) if attr_path not in tbl_attr_paths: attr_path = Schema.PATH_DELIMITER.join( cname['columnName'] for cname in col[start_idx:-1]) if col[-1]['columnName'] == 'Values': assert attr_path in tbl_attr_paths elif col[-1]['columnName'] == 'content': attr_path += Schema.PATH_DELIMITER + "#text" assert attr_path in tbl_attr_paths else: raise ValueError( f"Invalid column type: {col[-1]['columnName']}") col2col[Schema.PATH_DELIMITER.join( cname['columnName'] for cname in col)] = tbl_attr_paths[attr_path] assert len(set( col2col.values())) == len(input_columns), "No duplication" # extracting commands commands = [] for command in worksheet_history: if command['commandName'] == "SubmitPythonTransformationCommand": cmd_start_col = command['inputParameters'][0] cmd_input_parent_col = Schema.PATH_DELIMITER.join( [col['columnName'] for col in cmd_start_col['value'][:-1]]) cmd_input_col = command['inputParameters'][-2] cmd_output_col = command['inputParameters'][-1] if command['inputParameters'][-3]['name'] == 'isJSONOutput': cmd_code = command['inputParameters'][-5] default_error_value = command['inputParameters'][-4] assert command['inputParameters'][-3]['value'] == "false" else: default_error_value = command['inputParameters'][-3] cmd_code = command['inputParameters'][-4] assert cmd_input_col['name'] == "inputColumns" and cmd_output_col[ "name"] == "outputColumns" and cmd_code[ 'name'] == 'transformationCode' and default_error_value[ 'name'] == 'errorDefaultValue' cmd_input_cols = [[ cname['columnName'] for cname in o['value'] ] for o in ujson.loads(cmd_input_col['value'])] karma_input_attr_paths = [ col2col[Schema.PATH_DELIMITER.join(cmd_input_col)] for cmd_input_col in cmd_input_cols ] # update col2col because of new columns new_attr_name = ujson.loads( cmd_output_col['value'])[0]['value'][-1]['columnName'] new_attr_path = new_attr_name if cmd_input_parent_col == "" else ( cmd_input_parent_col + Schema.PATH_DELIMITER + new_attr_name) cmd_output_col = Schema.PATH_DELIMITER.join( cname['columnName'] for cname 
in ujson.loads( cmd_output_col['value'])[0]['value']) col2col[cmd_output_col] = new_attr_path cmd_code = cmd_code['value'].replace("return ", "__return__ = ") input_attr_paths = [] for match in reversed( list(re.finditer("getValue\(([^)]+)\)", cmd_code))): start, end = match.span(1) field = cmd_code[start:end].replace("'", "").replace( '"""', "").replace('"', '') # it seems that Karma use last column name, we need to recover full name # using the provided input first for cmd_input_col, input_attr_path in zip( cmd_input_cols, karma_input_attr_paths): if field == cmd_input_col[-1]: field = input_attr_path break else: # otherwise construct from the start columns full_field = field if cmd_input_parent_col == "" else ( cmd_input_parent_col + Schema.PATH_DELIMITER + field) field = col2col[full_field] cmd_code = cmd_code[:start] + f'"{field}"' + cmd_code[end:] input_attr_paths.append(field) default_error_value = default_error_value['value'] commands.append( PyTransformNewColumnCmd(input_attr_paths, new_attr_name, cmd_code, default_error_value)) elif command["commandName"] == "SetSemanticTypeCommand" or command[ "commandName"] == "SetMetaPropertyCommand": cmd_input_col = command['inputParameters'][-2] if command["inputParameters"][-5][ 'name'] == 'SemanticTypesArray': cmd_stype = command['inputParameters'][-5] else: cmd_stype = command['inputParameters'][-6] if cmd_stype['name'] == 'SemanticTypesArray': assert cmd_input_col['name'] == "inputColumns" and len( cmd_stype['value'] ) == 1 and cmd_stype['value'][0]['isPrimary'] cmd_input_col = col2col[Schema.PATH_DELIMITER.join( cname['columnName'] for cname in ujson.loads( cmd_input_col['value'])[0]['value'])] cmd_stype = cmd_stype['value'][0] commands.append( SetSemanticTypeCmd( cmd_input_col, domain=ont.simplify_uri(cmd_stype['DomainUri']), type=ont.simplify_uri(cmd_stype['FullType']), node_id=ont.simplify_uri( cmd_stype['DomainId'].replace(" (add)", "")))) else: cmd_stype_domain = command['inputParameters'][-7] cmd_stype_id = command['inputParameters'][-6] assert cmd_input_col['name'] == "inputColumns" and cmd_stype_domain['name'] == 'metaPropertyUri' \ and cmd_stype_id['name'] == 'metaPropertyId' cmd_input_col = col2col[Schema.PATH_DELIMITER.join( cname['columnName'] for cname in ujson.loads( cmd_input_col['value'])[0]['value'])] commands.append( SetSemanticTypeCmd( cmd_input_col, domain=ont.simplify_uri(cmd_stype_domain['value']), type="karma:classLink", node_id=ont.simplify_uri(cmd_stype_id['value']))) elif command['commandName'] == 'UnassignSemanticTypeCommand': cmd_input_col = command['inputParameters'][-2] assert cmd_input_col['name'] == "inputColumns" cmd_input_col = col2col[Schema.PATH_DELIMITER.join( cname['columnName'] for cname in ujson.loads( cmd_input_col['value'])[0]['value'])] delete_cmds = [] for i, cmd in enumerate(commands): if isinstance(cmd, SetSemanticTypeCmd ) and cmd.input_attr_path == cmd_input_col: delete_cmds.append(i) for i in reversed(delete_cmds): commands.pop(i) elif command["commandName"] == "ChangeInternalNodeLinksCommand": cmd_edges = command['inputParameters'][-3] assert cmd_edges['name'] == 'newEdges' # cmd_initial_edges = command['inputParameters'][-4] # if cmd_initial_edges['name'] == 'initialEdges' and len(cmd_initial_edges['value']) > 0: # delete_cmds = [] # for cmd_edge in cmd_initial_edges['value']: # edge_lbl = ont.simplify_uri(cmd_edge['edgeId']) # source_id = ont.simplify_uri(cmd_edge['edgeSourceId']) # # if cmd_edge['edgeTargetId'] in literal_nodes: # for i, cmd in enumerate(commands): # if 
isinstance(cmd, SetSemanticTypeCmd) and cmd.type == edge_lbl and cmd.node_id == source_id: # delete_cmds.append(i) # else: # target_id = ont.simplify_uri(cmd_edge['edgeTargetId']) # for i, cmd in enumerate(commands): # if isinstance(cmd, SetInternalLinkCmd) and cmd.link_lbl == edge_lbl and cmd.target_id == target_id and cmd.source_id == source_id: # delete_cmds.append(i) # # for idx in sorted(delete_cmds, reverse=True): # commands.pop(idx) for cmd_edge in cmd_edges['value']: source_uri = cmd_edge.get('edgeSourceUri', None) target_uri = cmd_edge.get('edgeTargetUri', None) if source_uri is not None and source_uri != cmd_edge[ 'edgeSourceId']: source_uri = ont.simplify_uri(source_uri) else: source_uri = None if target_uri is not None and target_uri != cmd_edge[ 'edgeTargetId']: target_uri = ont.simplify_uri(target_uri) else: target_uri = None if cmd_edge['edgeTargetId'] in literal_nodes: # convert this command to SetSemanticType commands.append( SetSemanticTypeCmd( literal_nodes[cmd_edge['edgeTargetId']], domain=ont.simplify_uri(source_uri), type=ont.simplify_uri(cmd_edge['edgeId']), node_id=ont.simplify_uri( cmd_edge['edgeSourceId']))) else: commands.append( SetInternalLinkCmd( ont.simplify_uri(cmd_edge['edgeSourceId']), ont.simplify_uri(cmd_edge['edgeTargetId']), ont.simplify_uri(cmd_edge['edgeId']), source_uri, target_uri)) elif command['commandName'] == "AddLinkCommand": cmd_edges = command['inputParameters'][-3] assert cmd_edges['name'] == 'edge' cmd_edge = cmd_edges['value'] source_uri = cmd_edge.get('edgeSourceUri', None) target_uri = cmd_edge.get('edgeTargetUri', None) if source_uri is not None: source_uri = ont.simplify_uri(source_uri) else: source_uri = None if cmd_edge['edgeTargetId'] in literal_nodes: # convert this command to SetSemanticType commands.append( SetSemanticTypeCmd( literal_nodes[cmd_edge['edgeTargetId']], domain=ont.simplify_uri(source_uri), type=ont.simplify_uri(cmd_edge['edgeId']), node_id=ont.simplify_uri( cmd_edge['edgeSourceId']))) else: if target_uri is not None: target_uri = ont.simplify_uri(target_uri) else: target_uri = None commands.append( SetInternalLinkCmd( ont.simplify_uri(cmd_edge['edgeSourceId']), ont.simplify_uri(cmd_edge['edgeTargetId']), ont.simplify_uri(cmd_edge['edgeId']), source_uri, target_uri)) elif command['commandName'] == 'DeleteLinkCommand': cmd_edge = command['inputParameters'][-3] assert cmd_edge['name'] == 'edge' cmd_edge = cmd_edge['value'] for i, cmd in enumerate(commands): if isinstance(cmd, SetInternalLinkCmd): if cmd.source_id == cmd_edge[ 'edgeSourceId'] and cmd.target_id == cmd_edge[ 'edgeTargetId'] and cmd.link_lbl == ont.simplify_uri( cmd_edge['edgeId']): commands.pop(i) break elif command["commandName"] == "AddLiteralNodeCommand": cmd_literal_value = command["inputParameters"][0] assert cmd_literal_value['name'] == 'literalValue' cmd_literal_value = cmd_literal_value['value'] # they may re-use literal_values, let's user fix it manually if cmd_literal_value.startswith("http"): new_attr_path = f"literal:{ont.simplify_uri(cmd_literal_value)}" else: new_attr_path = f"literal:{cmd_literal_value}" if cmd_literal_value + "1" not in literal_nodes: new_attr_path += ":1" literal_nodes[cmd_literal_value + "1"] = new_attr_path elif cmd_literal_value + "2" not in literal_nodes: new_attr_path += ":2" literal_nodes[cmd_literal_value + "2"] = new_attr_path elif cmd_literal_value + "3" not in literal_nodes: new_attr_path += ":3" literal_nodes[cmd_literal_value + "3"] = new_attr_path else: assert False col2col[new_attr_path] = new_attr_path 
commands.append( AddLiteralColumnCmd(new_attr_path, cmd_literal_value)) elif command["commandName"] == "OperateSelectionCommand": # no way to see it in the KARMA UI continue elif command["commandName"] == "OrganizeColumnsCommand": continue elif command["commandName"] == "SetWorksheetPropertiesCommand": # this command doesn't affect the model continue # elif command["commandName"] == "UnfoldCommand": # cmd_input_col = command["inputParameters"][-2] # cmd_output_col = command["inputParameters"][-1] # assert cmd_input_col['name'] == "inputColumns" and cmd_output_col['name'] == 'outputColumns' # cmd_input_cols = [ # [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_input_col['value']) # ] # input_attr_paths = [col2col[Schema.PATH_DELIMITER.join(cmd_input_col)] for cmd_input_col in cmd_input_cols] # cmd_output_cols = [ # [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_output_col['value']) # ] # # output_attr_paths = [] # # update columns mapping # for cmd_output_col in cmd_output_cols: # attr_path = Schema.PATH_DELIMITER.join(cmd_output_col[start_idx:]) # col2col[Schema.PATH_DELIMITER.join(cmd_output_col)] = attr_path # output_attr_paths.append(attr_path) # # commands.append(UnrollCmd(input_attr_paths, output_attr_paths)) # elif command["commandName"] == "GlueCommand": # cmd_input_col = command["inputParameters"][-2] # cmd_output_col = command["inputParameters"][-1] else: assert False, "Source: %s. Doesn't handle command %s" % ( tbl.id, command["commandName"]) # fixing conflict modeling command conflicts = defaultdict(lambda: []) for i, cmd in enumerate(commands): if isinstance(cmd, SetSemanticTypeCmd): conflicts[cmd.input_attr_path].append((i, cmd)) if isinstance(cmd, SetInternalLinkCmd): conflicts[(cmd.source_id, cmd.target_id)].append((i, cmd)) delete_commands = [] for cmds in conflicts.values(): if len(cmds) > 1: display_warn = False for idx, cmd in cmds[1:]: if cmd != cmds[0][1]: if not display_warn: display_warn = True KR2RML.logger.warning( "Table: %s. Conflict between command: \n\t+ %s \n\t+ %s", tbl.id, cmds[0][1], cmd) else: print("\t+", cmd) # only keep final commands for idx, cmd in cmds[:-1]: delete_commands.append(idx) if isinstance(cmds[0][1], SetInternalLinkCmd): # need to update source_uri & target_uri first (for duplicate commands, source_uri, target_uri = None) key = (cmds[-1][1].source_id, cmds[-1][1].link_lbl, cmds[-1][1].target_id) for idx, cmd in cmds[:-1]: if (cmd.source_id, cmd.link_lbl, cmd.target_id) == key: cmds[-1][1].source_uri = cmd.source_uri cmds[-1][1].target_uri = cmd.target_uri break delete_commands.sort(reverse=True) for idx in delete_commands: commands.pop(idx) super().__init__(commands) def to_yaml(self, fpath: Path): with open(fpath, "w") as f: yaml.dump(self.to_dict(), f, default_flow_style=False, indent=4)
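# Usage sketch (assumption, not in the original file): parse a Karma KR2RML model for one data
# table and dump the recovered command history to YAML. `tbl` and the concrete file names are
# placeholders for objects and paths built elsewhere in the preprocessing scripts.
ont = get_ontology("museum_edm")
kr2rml = KR2RML(ont, tbl, r2rml_dir / f"{tbl.id}-model.ttl")
kr2rml.to_yaml(model_dir / f"{tbl.id}-model.yml")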
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Provide an easy and quick way to test whether running in parallel is worth it, i.e., whether
the overhead of serializing/deserializing the arguments outweighs the gain."""
from multiprocessing.pool import Pool
from typing import Dict, Tuple, List, Set, Union, Optional, Callable, Generic, TypeVar
from multiprocessing import Process, Queue, get_start_method, set_start_method
import time
import os

from nose.tools import eq_
from pyutils.progress_utils import Timer
from semantic_modeling.config import get_logger

logger = get_logger("default")


def get_args_size(*args) -> int:
    total_element = 0
    for arg in args:
        if isinstance(arg, (list, dict, tuple)):
            total_element += len(arg)
        else:
            total_element += 1
    return total_element


def minimal_computing_func(queue, *args):
    """A function that doesn't do any real work; used to measure the overhead of multiprocessing"""
    queue.put(get_args_size(*args))
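# Minimal sketch (not in the original file) of how minimal_computing_func can be used to measure
# the pure overhead of spawning a process and pickling the arguments: the child does no real work,
# so the elapsed time is roughly the cost that a parallel implementation must beat to pay off.
if __name__ == "__main__":
    payload = list(range(1_000_000))
    queue = Queue()
    start = time.time()
    p = Process(target=minimal_computing_func, args=(queue, payload))
    p.start()
    eq_(queue.get(), len(payload))  # the child only reports the argument size
    p.join()
    logger.info("multiprocessing overhead for %s elements: %.3f seconds",
                len(payload), time.time() - start)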
class SemanticTyper(object):
    logger = get_logger("app.semantic_labeling.typer")
    instance = None

    def __init__(self, dataset: str, train_sms: List[SemanticModel],
                 exec_dir: Optional[Path] = None) -> None:
        self.dataset = dataset
        self.train_source_ids = {sm.id for sm in train_sms}
        if exec_dir is None:
            exec_dir = get_cache_dir(dataset, train_sms) / "semantic-labeling"
        self.exec_dir = Path(exec_dir)
        self.exec_dir.mkdir(exist_ok=True, parents=True)
        self.model = None
        self.stype_db = SemanticTypeDB.get_stype_db(dataset, [sm.id for sm in train_sms],
                                                    self.exec_dir)

    def load_model(self):
        """Try to load a previously trained model if possible"""
        if self.model is not None:
            return

        model_file = self.exec_dir / 'model.pkl'
        if model_file.exists():
            self.logger.debug("Load previous trained model...")
            self.model = deserialize(model_file)
        else:
            self.logger.error("Cannot load model...")
            raise Exception("Model doesn't exist..")

    @staticmethod
    def get_instance(dataset: str, train_sms: List[SemanticModel],
                     exec_dir: Optional[Path] = None) -> 'SemanticTyper':
        if SemanticTyper.instance is None:
            SemanticTyper.instance = SemanticTyper(dataset, train_sms, exec_dir)

        assert SemanticTyper.instance.dataset == dataset and \
            SemanticTyper.instance.train_source_ids == {sm.id for sm in train_sms}
        return SemanticTyper.instance

    def semantic_labeling_v2(self, sms: List[SemanticModel], top_n: int) -> None:
        """Generate semantic labels and store them in the given semantic models"""
        sms: Dict[str, SemanticModel] = {s.id: s for s in sms}

        if self.model is None:
            model_file = self.exec_dir / 'model.pkl'
            if model_file.exists():
                self.logger.debug("Load previous trained model...")
                self.model = deserialize(model_file)
            else:
                self.logger.debug("Train new model...")
                x_train, y_train, x_test, y_test = generate_training_data(self.stype_db)
                # clf = LogisticRegression(class_weight="balanced")
                clf = RandomForestClassifier(n_estimators=200, max_depth=10,
                                             class_weight="balanced", random_state=120)
                clf = clf.fit(x_train, y_train)
                self.logger.debug("Save model...")
                serialize(clf, model_file)
                self.model = clf

        col_attrs = []
        for col in self.stype_db.train_columns:
            if col.table_name not in sms:
                continue
            col_attrs.append((col, sms[col.table_name].get_attr_by_label(col.name)))
        for col in self.stype_db.test_columns:
            if col.table_name not in sms:
                continue
            col_attrs.append((col, sms[col.table_name].get_attr_by_label(col.name)))

        for col, attr in col_attrs:
            pred_stypes = self.pred_type(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"), stype[1].decode("utf-8"), score)
                for stype, score in pred_stypes if score > 0
            ]

    def semantic_labeling(self, train_sources: List[SemanticModel],
                          test_sources: List[SemanticModel], top_n: int,
                          eval_train: bool = False) -> None:
        """Generate semantic labels and store them in the test sources"""
        train_sources: Dict[str, SemanticModel] = {s.id: s for s in train_sources}
        test_sources: Dict[str, SemanticModel] = {s.id: s for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        if self.model is None:
            model_file = self.exec_dir / 'model.pkl'
            if model_file.exists():
                self.logger.debug("Load previous trained model...")
                self.model = deserialize(model_file)
            else:
                self.logger.debug("Train new model...")
                x_train, y_train, x_test, y_test = generate_training_data(self.stype_db)
                # clf = LogisticRegression(class_weight="balanced")
                clf = RandomForestClassifier(n_estimators=200, max_depth=10,
                                             class_weight="balanced", random_state=120)
                clf = clf.fit(x_train, y_train)
                self.logger.debug("Save model...")
                serialize(clf, model_file)
                self.model = clf

        col_attrs = []
        if eval_train:
            for col in self.stype_db.train_columns:
                if col.table_name not in train_sources:
                    continue
                col_attrs.append((col, train_sources[col.table_name].get_attr_by_label(col.name)))
        for col in self.stype_db.test_columns:
            if col.table_name not in test_sources:
                continue
            col_attrs.append((col, test_sources[col.table_name].get_attr_by_label(col.name)))

        for col, attr in col_attrs:
            pred_stypes = self.pred_type(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"), stype[1].decode("utf-8"), score)
                for stype, score in pred_stypes if score > 0
            ]

    def pred_type(self, col: Column, top_n: int) -> List[Tuple[Tuple[bytes, bytes], float]]:
        X = []
        refcols = [refcol for refcol in self.stype_db.train_columns if refcol.id != col.id]
        j = self.stype_db.col2idx[col.id]
        for refcol in refcols:
            iref = self.stype_db.col2idx[refcol.id]
            X.append(self.stype_db.similarity_matrix[j, iref])

        result = self.model.predict_proba(X)[:, 1]
        result = _(zip(result, (self.stype_db.col2types[rc.id] for rc in refcols))) \
            .sort(key=lambda x: x[0], reverse=True)

        top_k_st = {}
        for score, stype in result:
            if stype not in top_k_st:
                top_k_st[stype] = score
                if len(top_k_st) == top_n:
                    break

        return sorted([(stype, score) for stype, score in top_k_st.items()],
                      reverse=True, key=lambda x: x[1])

    def semantic_labeling_parent(
            self, train_sources: List[SemanticModel], test_sources: List[SemanticModel],
            top_n: int, eval_train: bool = False
    ) -> Dict[str, Dict[int, List[Tuple[Tuple[bytes, bytes], float,
                                        List[Tuple[Tuple[bytes, bytes], float]]]]]]:
        """Generate semantic labels (together with the predicted parent semantic types) and
        store them in the test sources"""
        train_sources: Dict[str, SemanticModel] = {s.id: s for s in train_sources}
        test_sources: Dict[str, SemanticModel] = {s.id: s for s in test_sources}
        assert set(train_sources.keys()) == self.train_source_ids

        if self.model is None:
            model_file = self.exec_dir / 'model.pkl'
            if model_file.exists():
                self.logger.debug("Load previous trained model...")
                self.model = deserialize(model_file)
            else:
                self.logger.debug("Train new model...")
                x_train, y_train, x_test, y_test = generate_training_data(self.stype_db)
                # clf = LogisticRegression(class_weight="balanced")
                clf = RandomForestClassifier(n_estimators=200, max_depth=10,
                                             class_weight="balanced", random_state=120)
                clf = clf.fit(x_train, y_train)
                self.logger.debug("Save model...")
                serialize(clf, model_file)
                self.model = clf

        col_attrs = []
        pred_parent_stypes = {}
        if eval_train:
            for sid in train_sources:
                pred_parent_stypes[sid] = {}
            for col in self.stype_db.train_columns:
                if col.table_name not in train_sources:
                    continue
                col_attrs.append((col, train_sources[col.table_name].get_attr_by_label(col.name)))
        for sid in test_sources:
            pred_parent_stypes[sid] = {}
        for col in self.stype_db.test_columns:
            if col.table_name not in test_sources:
                continue
            col_attrs.append((col, test_sources[col.table_name].get_attr_by_label(col.name)))

        for col, attr in col_attrs:
            pred_full_stypes = self.pred_full_stype(col, top_n)
            attr.semantic_types = [
                SemanticType(stype[0].decode("utf-8"), stype[1].decode("utf-8"), score)
                for stype, score, parent_stypes in pred_full_stypes if score > 0
            ]
            for stype, score, parent_stypes in pred_full_stypes:
                if score > 0:
                    if attr.id not in pred_parent_stypes[col.table_name]:
                        pred_parent_stypes[col.table_name][attr.id] = []
                    pred_parent_stypes[col.table_name][attr.id].append(
                        (stype, score,
                         sorted(parent_stypes.items(), key=lambda x: x[1], reverse=True)))

        return pred_parent_stypes

    def pred_full_stype(
            self, col: Column, top_n: int
    ) -> List[Tuple[Tuple[bytes, bytes], float, Dict[Tuple[bytes, bytes], float]]]:
        X = []
        refcols = [refcol for refcol in self.stype_db.train_columns if refcol.id != col.id]
        j = self.stype_db.col2idx[col.id]
        for refcol in refcols:
            iref = self.stype_db.col2idx[refcol.id]
            X.append(self.stype_db.similarity_matrix[j, iref])

        result = self.model.predict_proba(X)[:, 1]
        result = _(zip(result, (self.stype_db.col2dnodes[rc.id] for rc in refcols))) \
            .sort(key=lambda x: x[0], reverse=True)

        # each entry of top_k_st maps a stype to its score and a map of parent stypes with scores
        top_k_st: Dict[Tuple[bytes, bytes], Tuple[float, Dict[Tuple[bytes, bytes], float]]] = {}
        for score, dnode in result:
            link = dnode.get_first_incoming_link()
            parent = link.get_source_node()
            parent_link = parent.get_first_incoming_link()
            if parent_link is None:
                parent_stype = None
            else:
                parent_stype = (parent_link.get_source_node().label, parent_link.label)

            stype = (parent.label, link.label)
            if stype not in top_k_st:
                if len(top_k_st) == top_n:
                    # ignore a stype which doesn't make it into the top k
                    continue
                top_k_st[stype] = (score, {parent_stype: score})
            else:
                # keep looping until we collect enough parent links (default is top 3)
                if parent_stype not in top_k_st[stype][1]:
                    # if we have already seen this parent_stype, we don't update its score
                    # because the stored score is already the greatest
                    top_k_st[stype][1][parent_stype] = score

        return sorted([(stype, score, parent_stypes)
                       for stype, (score, parent_stypes) in top_k_st.items()],
                      reverse=True, key=lambda x: x[1])
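# Usage sketch (assumption): train/apply the semantic typer over a train/test split of the
# dataset's semantic models. get_semantic_models and the split are placeholders for helpers and
# values used elsewhere in this repo.
dataset = "museum_edm"
sms = get_semantic_models(dataset)
train_sms, test_sms = sms[:14], sms[14:]  # arbitrary split for illustration

typer = SemanticTyper.get_instance(dataset, train_sms)
# trains (or loads) the RandomForest model and writes top-4 semantic types into each test attribute
typer.semantic_labeling(train_sms, test_sms, top_n=4)
for attr in test_sms[0].attrs:
    print(attr.label, attr.semantic_types)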
class DataConstraint(object): """This model tries to answer a question whether a mapping of a column follow some constraints inferred from data For example: we have 2 columns DoB & DoD 1. if they are linked to same class by different predicate (local constraint) 2. if they are linked to same class by same predicate, which class should we choose to link? (look at the parent) + how about: the case of same predicate, different class (not handled yet, let's semantic labeling does it) Given a list of known sources, we extract possible columns order =, >=, <= (consider only columns that are in the above scope). The we count cases the semantic types/relationship follow the order, and cases it doesn't as prob. Given a new example, we also look for columns that match, and produce the prediction """ logger = get_logger("app.weak_models.data_constraint") def __init__(self, train_sms: List[SemanticModel], data_tables: List[ColumnBasedTable], valid_threshold: float, guess_datetime_threshold: float, n_comparison_sample: int) -> None: self.guess_datetime_threshold = guess_datetime_threshold self.valid_threshold = valid_threshold self.n_comparison_sample = n_comparison_sample self.cached_compared_cols: Dict[str, Dict[Tuple[bytes, bytes], Optional[float]]] = {} self.prob_count_scope1: Dict[Tuple[bytes, bytes], Dict[Tuple[bytes, bytes], int]] = {} self.prob_count_scope2: Dict[Tuple[bytes, bytes], Dict[Tuple[bytes, bytes], int]] = {} # keep a list of columns that can have data constraint (i.e: its value is comparable with other columns) col2useful_type: Dict[Column, ColumnType] = {} data_tables: Dict[str, ColumnBasedTable] = { tbl.id: tbl for tbl in data_tables } for tbl in data_tables.values(): for col in tbl.columns: type = self._guess_detail_type(col) if type is not None and type.is_comparable(): col2useful_type[col] = type # now we build the constraint from training sources for sm in train_sms: stypes: Dict[Tuple[bytes, bytes], List[GraphLink]] = {} node_group: Dict[GraphNode, List[GraphLink]] = {} table = data_tables[sm.id] name2col: Dict[bytes, Column] = { col.name.encode("utf-8"): col for col in table.columns } col2idx: Dict[Column, int] = { col: i for i, col in enumerate(table.columns) } for attr in sm.attrs: dnode = sm.graph.get_node_by_id(attr.id) dlink = dnode.get_first_incoming_link() pnode = dlink.get_source_node() stype = (pnode.label, dlink.label) if stype not in stypes: stypes[stype] = [] stypes[stype].append(dlink) # group node by their parents if pnode not in node_group: node_group[pnode] = [] node_group[pnode].append(dlink) # first scope, infer constraint inside class nodes for pnode, dlinks in node_group.items(): # before filter out data nodes that are not comparable, we double check if the data node we # have to ignore has its semantic types comparable # for e in dlinks: # if name2col[e.get_target_node().label] not in col2useful_type and (pnode.label, e.label) in self.prob_count_scope1: # self.logger.warning("Column's semantic types was detected to be comparable. But, now it can't: %s: %s", sm.id, e.get_target_node().label) # self.prob_count_scope1[(pnode.label, e.label)] = None dlinks = [ e for e in dlinks if name2col[e.get_target_node().label] in col2useful_type ] dnodes = [e.get_target_node() for e in dlinks] if len(dnodes) < 2: continue if len({ col2useful_type[name2col[dnode.label]] for dnode in dnodes }) != 1: # doesn't support mixed-type print({ col2useful_type[name2col[dnode.label]] for dnode in dnodes }) continue if len(dnodes) > 2: self.logger.warning("Only handle max-2 now... 
%s: %s", sm.id, [e.label for e in dlinks]) continue cols = [name2col[dnode.label] for dnode in dnodes] compare_result = self._compare_col(table, col2idx, cols[0], cols[1]) dtypes = [(pnode.label, dlink.label) for dlink in dlinks] # if we cannot compare 2 columns, then we ignore them if compare_result is None: # however, if this type is already register in the counter, then instead of ignore them # we should delete set it to None to prevent re-add in the future if (dtypes[0] in self.prob_count_scope1 and self.prob_count_scope1[dtypes[0]] is not None ) or (dtypes[1] in self.prob_count_scope1 and self.prob_count_scope1[dtypes[1]] is not None): self.logger.warning( "Inferred constraint for 2 columns %s doesn't hold for source: %s. (Column: %s, %s)", dtypes, sm.id, cols[0].name, cols[1].name) self.prob_count_scope1[dtypes[0]] = None self.prob_count_scope1[dtypes[1]] = None continue for dtype in dtypes: if dtype not in self.prob_count_scope1: self.prob_count_scope1[dtype] = {} if self.prob_count_scope1[ dtypes[0]] is None or self.prob_count_scope1[ dtypes[1]] is None: # inferred constraint doesn't hold, so we should ignore this column continue if compare_result: # col0 > col1 if len(self.prob_count_scope1[dtypes[0]]) != 0: if dtypes[0] not in self.prob_count_scope1[dtypes[ 0]] or dtypes[1] not in self.prob_count_scope1[ dtypes[0]]: self.prob_count_scope1[dtypes[0]] = None self.prob_count_scope1[dtypes[1]] = None else: assert self.prob_count_scope1[dtypes[0]][ dtypes[0]] == 1 and self.prob_count_scope1[ dtypes[0]][dtypes[1]] == 0 if len(self.prob_count_scope1[dtypes[1]]) != 0: if dtypes[0] not in self.prob_count_scope1[dtypes[ 1]] or dtypes[1] not in self.prob_count_scope1[ dtypes[1]]: self.prob_count_scope1[dtypes[0]] = None self.prob_count_scope1[dtypes[1]] = None else: assert self.prob_count_scope1[dtypes[1]][ dtypes[0]] == 1 and self.prob_count_scope1[ dtypes[1]][dtypes[1]] == 0 self.prob_count_scope1[dtypes[0]] = { dtypes[0]: 1, dtypes[1]: 0 } self.prob_count_scope1[dtypes[1]] = { dtypes[0]: 1, dtypes[1]: 0 } else: if len(self.prob_count_scope1[dtypes[0]]) != 0: assert self.prob_count_scope1[dtypes[0]][ dtypes[0]] == 0 and self.prob_count_scope1[ dtypes[0]][dtypes[1]] == 1 if len(self.prob_count_scope1[dtypes[1]]) != 0: assert self.prob_count_scope1[dtypes[1]][ dtypes[0]] == 0 and self.prob_count_scope1[ dtypes[1]][dtypes[1]] == 1 self.prob_count_scope1[dtypes[0]] = { dtypes[0]: 0, dtypes[1]: 1 } self.prob_count_scope1[dtypes[1]] = { dtypes[0]: 0, dtypes[1]: 1 } # second scope for stype, dlinks in stypes.items(): if len(dlinks) == 1: continue # now filter data nodes that is not comparable dnodes = [e.get_target_node() for e in dlinks] if any(name2col[dnode.label] not in col2useful_type for dnode in dnodes): continue if len({ col2useful_type[name2col[dnode.label]] for dnode in dnodes }) != 1: # doesn't support mixed-type print({ col2useful_type[name2col[dnode.label]] for dnode in dnodes }) continue if len(dlinks) > 2: self.logger.warning("Only handle max-2 now... %s: %s", sm.id, stype) continue snodes = [e.get_source_node() for e in dlinks] slinks = [ n.get_first_incoming_link() for n in snodes if n.get_first_incoming_link() is not None ] if len(slinks) == 0: continue # now we need to build some constraints to help distinguish between those semantic types # we assume parents of those types are different ... 
parent_types = [(se.get_source_node().label, se.label) for se in slinks] if len(set(parent_types)) != len(snodes): self.logger.warning( "Doesn't handle a case when parents are same: %s: %s", sm.id, stype) continue cols = [name2col[dnode.label] for dnode in dnodes] compare_result = self._compare_col(table, col2idx, cols[0], cols[1]) # if we cannot compare 2 columns, then we ignore them if compare_result is None: # however, if this type is already register in the counter, then instead of ignore them # we should delete set it to None to prevent re-add in the future if stype in self.prob_count_scope2 and self.prob_count_scope2[ stype] is not None: self.logger.warning( "Inferred constraint for type %s doesn't hold for source: %s. (Column: %s, %s)", stype, sm.id, cols[0].name, cols[1].name) self.prob_count_scope2[stype] = None continue if stype not in self.prob_count_scope2: self.prob_count_scope2[stype] = {} if self.prob_count_scope2[stype] is None: # inferred constraint doesn't hold, so we should ignore this column continue if compare_result: # col0 > col1 if len(self.prob_count_scope2[stype]) != 0: assert self.prob_count_scope2[stype][parent_types[ 0]] == 1 and self.prob_count_scope2[stype][ parent_types[1]] == 0 self.prob_count_scope2[stype] = { parent_types[0]: 1, parent_types[1]: 0 } else: if len(self.prob_count_scope2[stype]) != 0: assert self.prob_count_scope2[stype][parent_types[ 0]] == 0 and self.prob_count_scope2[stype][ parent_types[1]] == 1 self.prob_count_scope2[stype] = { parent_types[0]: 0, parent_types[1]: 1 } for key in list(self.prob_count_scope1.keys()): if self.prob_count_scope1[key] is None: del self.prob_count_scope1[key] for key in list(self.prob_count_scope2.keys()): if self.prob_count_scope2[key] is None: del self.prob_count_scope2[key] # we also cache column comparison (to speed to evaluation time) for tbl in data_tables.values(): useful_cols = [ col for col in tbl.columns if col in col2useful_type ] tbl_comparison: Dict[Tuple[bytes, bytes], Optional[float]] = {} col2idx: Dict[Column, int] = {col: i for i, col in enumerate(tbl.columns)} # TODO: can speed up by half for col in useful_cols: col_name = col.name.encode("utf-8") for col2 in useful_cols: if col2 != col: if col2useful_type[col] != col2useful_type[col2]: tbl_comparison[(col_name, col2.name.encode("utf-8"))] = None else: tbl_comparison[(col_name, col2.name.encode("utf-8") )] = self._compare_col( tbl, col2idx, col, col2) self.cached_compared_cols[tbl.name] = tbl_comparison def extract_feature(self, sm_id: str, g: Graph, attr_id: int, link2label: Optional[Dict[int, bool]] = None) -> dict: return { "local": self.compute_prob_scope1(sm_id, g, attr_id, link2label), "global": self.compute_prob_scope2(sm_id, g, attr_id, link2label), } def compute_prob_scope1( self, sm_id: str, g: Graph, attr_id: int, link2label: Optional[Dict[int, bool]] = None) -> Optional[float]: if link2label is None: # use default dict to reduce code size link2label = {} dnode = g.get_node_by_id(attr_id) dlink = dnode.get_first_incoming_link() pnode = dlink.get_source_node() stype = (pnode.label, dlink.label) if stype not in self.prob_count_scope1 or not link2label.get( dlink.id, True): return None assert len(self.prob_count_scope1[stype]) == 2 another_stype = [ x for x in self.prob_count_scope1[stype].keys() if x != stype ][0] another_dnodes = [ e.get_target_node() for e in pnode.iter_outgoing_links() if e.label == another_stype[1] and link2label.get(e.id, True) ] if len(another_dnodes) == 0: return None dnode_stype_idx = 
self.prob_count_scope1[stype][stype] another_dnode_stype_idx = self.prob_count_scope1[stype][another_stype] tbl_comparison = self.cached_compared_cols[sm_id] result = None for another_dnode in another_dnodes: if (dnode.label, another_dnode.label) not in tbl_comparison: continue result = tbl_comparison[(dnode.label, another_dnode.label)] if result is None: continue if result: # attr > another_attr, attr_stype_idx should > another_attr_stype_idx with high prob. if dnode_stype_idx > another_dnode_stype_idx: return self.valid_threshold return 1 - self.valid_threshold else: # opposite case of above if dnode_stype_idx > another_dnode_stype_idx: return 1 - self.valid_threshold return self.valid_threshold if result is None: # the constraint said that we should be able to compare, but we cannot, it should have low probability return 1 - self.valid_threshold def compute_prob_scope2( self, sm_id: str, g: Graph, attr_id: int, link2label: Optional[Dict[int, bool]] = None) -> Optional[float]: """Give a probability whether mapping of an attribute statistic data constraints We can mark some part of graph as false """ dnode = g.get_node_by_id(attr_id) dlink = dnode.get_first_incoming_link() stype = (dlink.get_source_node().label, dlink.label) if stype not in self.prob_count_scope2: return None slink = dlink.get_source_node().get_first_incoming_link() if slink is None: # root nodes return None dnode_parent_type = (slink.get_source_node().label, slink.label) if dnode_parent_type not in self.prob_count_scope2[stype] or ( link2label is not None and not link2label[slink.id]): return None dnode_stype_idx = self.prob_count_scope2[stype][dnode_parent_type] # get other class nodes in the graph that an attr can be mapped to (same semantic type). # notice that the constraint is represent as binary-function, so we only keep the class nodes # that have another attribute, which is mapped with the same semantic type snodes = [ node for node in g.iter_nodes_by_label(stype[0]) if node.id != dlink.source_id ] if len(snodes) == 0: # if we don't have any other source nodes (i.e: only one possible mapping) return None tbl_comparison = self.cached_compared_cols[sm_id] another_dnodes = [] another_dnodes_stype_idx = [] for snode in snodes: # check if this source node have another attribute that is mapped by same semantic type for link in snode.iter_outgoing_links(): if link.label == dlink.label: another_dnode = link.get_target_node() break else: another_dnode = None if another_dnode is not None and ( dnode.label, another_dnode.label) in tbl_comparison: slink = snode.get_first_incoming_link() parent_type = (slink.get_source_node().label, slink.label) if parent_type in self.prob_count_scope2[stype] and ( link2label is None or link2label[slink.id] is True): # if its parent_type is not in the constraint or its link is false, then we should ignore it another_dnodes.append(another_dnode) another_dnodes_stype_idx.append( self.prob_count_scope2[stype][parent_type]) # do compare between attr and another_attrs if len(another_dnodes) + 1 > len(self.prob_count_scope2[stype]): self.logger.warning( "There is a model that have more attributes than the inferred constraint.. trace: %s -- %s", sm_id, stype) return None # let's see if we can compare the given attribute with other attributes if len(another_dnodes ) == 0 or dnode_stype_idx in another_dnodes_stype_idx: # how about this case? return None assert len(self.prob_count_scope2[stype] ) == 2, "Doesn't handle > 2 attributes now..." 
# now we can compare with other attributes another_dnode, another_dnode_stype_idx = another_dnodes[ 0], another_dnodes_stype_idx[0] result = tbl_comparison[(dnode.label, another_dnode.label)] if result is None: # the constraint said that we should be able to compare, but we cannot, it should have low probability return 1 - self.valid_threshold if result: # attr > another_attr, attr_stype_idx should > another_attr_stype_idx with high prob. if dnode_stype_idx > another_dnode_stype_idx: return self.valid_threshold return 1 - self.valid_threshold else: # opposite case of above if dnode_stype_idx > another_dnode_stype_idx: return 1 - self.valid_threshold return self.valid_threshold def _compare_col(self, tbl: ColumnBasedTable, col2idx, col1: Column, col2: Column) -> Optional[bool]: # any mixed-type should be handled before.. n_gt, n_eq, n_lt = 0, 0, 0 count = 0 if col1.type == ColumnType.NUMBER: for row in tbl.rows: val1 = row[col2idx[col1]] val2 = row[col2idx[col2]] if not isinstance(val1, (int, float)) or not isinstance( val2, (int, float)) or val1 is None or val2 is None: continue if val1 == val2: n_eq += 1 elif val1 > val2: n_gt += 1 else: n_lt += 1 count += 1 if count == self.n_comparison_sample: break else: for row in tbl.rows: val1 = row[col2idx[col1]] val2 = row[col2idx[col2]] if not isinstance(val1, (str, bytes)) or not isinstance( val2, (str, bytes)) or val1 is None or val2 is None: continue try: # TODO: need to detect it is val1 = parse_date(val1, dayfirst=False, yearfirst=False) val2 = parse_date(val2, dayfirst=False, yearfirst=False) except ValueError: continue if val1 == val2: n_eq += 1 elif val1 > val2: n_gt += 1 else: n_lt += 1 count += 1 if count == 50: break if n_gt > 0 and ((n_gt + n_eq) / count) >= self.valid_threshold: return True if n_lt > 0 and ((n_lt + n_eq) / count) >= self.valid_threshold: return False # not decidable (also for equal-case) return None def _guess_detail_type(self, col: Column): if col.type == ColumnType.NUMBER: return ColumnType.NUMBER if col.type == ColumnType.NULL: return None # trying to guess if this is DateTime # just get first 100 values to reduce computing time values = [val for val in col.get_textual_data() if val.strip() != ""][:50] n_success = 0 for val in values: try: parse_date(val) n_success += 1 except ValueError: pass if (n_success / len(values)) > self.guess_datetime_threshold: # consider this is a datetime column return ColumnType.DATETIME return None
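# Usage sketch (assumption, not in the original source): build the data-constraint weak model from
# the training semantic models and their tables, then score how well the mapping of one attribute
# agrees with the inferred value ordering. `train_sms`, `data_tables`, `sm`, `candidate_graph` and
# `attr_id` are placeholders for objects produced elsewhere in the pipeline.
settings = Settings.get_instance()
data_constraint = DataConstraint(train_sms, data_tables,
                                 settings.data_constraint_valid_threshold,
                                 settings.data_constraint_guess_datetime_threshold,
                                 settings.data_constraint_n_comparison_samples)
features = data_constraint.extract_feature(sm.id, candidate_graph, attr_id)
# features["local"] / features["global"] are probabilities in [0, 1], or None when no constraint applies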
    # if self.value is not None else b"")

    def is_terminal(self, args: BeamSearchArgs) -> bool:
        return len(self.remained_terminals) == 0

    def get_value(self) -> PGMSearchNodeValue:
        return self.value

    def get_score(self) -> float:
        return self.G_scored[self.working_terminal]

    def get_hashing_id(self) -> bytes:
        return self.hashing_id


_logger = get_logger('app.assembling.search_discovery')


def filter_unlikely_graph(g: MergeGraph) -> bool:
    settings = Settings.get_instance()
    max_n_duplications = settings.mrf_max_n_duplications
    max_n_duplication_types = settings.mrf_max_n_duplication_types

    for n in g.iter_class_nodes():
        # FILTER middle nodes
        if n.n_incoming_links == 1 and n.n_outgoing_links == 1:
            link = next(iter(n.iter_outgoing_links()))
            if link.get_target_node().is_class_node():
                return False

    # FILTER: max_size_duplication_group <= 7 and max_n_duplications <= 4
class MohsenSemanticModeling(object): logger = get_logger("app.mohsen_jws2015") def __init__(self, dataset: str, use_correct_type: bool, use_old_semantic_typer: bool, train_sm_ids: List[str], exec_dir: Optional[Union[str, Path]] = None, sm_type_dir: Optional[Union[str, Path]] = None): self.dataset: str = dataset self.train_sm_ids = train_sm_ids self.ont = get_ontology(dataset) self.karma_models: Dict[str, KarmaModel] = {km.id: km for km in get_karma_models(dataset)} # can only run once time, trying re-invoke will generate an error self.__has_run_modeling = False if exec_dir is None: exec_dir = get_cache_dir(dataset, train_sm_ids) / "mohsen_jws2015" self.exec_dir: Path = Path(exec_dir) self.sm_type_dir = sm_type_dir # parameters for mohsen's algorithm self.use_old_semantic_typer = use_old_semantic_typer self.use_correct_type = use_correct_type assert Settings.get_instance().semantic_labeling_top_n_stypes <= 4 self.num_candidate_semantic_type = 4 self.multiple_same_property_per_node = True self.coherence = 1.0 self.confidence = 1.0 self.size_reduction = 0.5 self.num_candidate_mappings = 50 self.mapping_branching_factor = 50 self.topk_steiner_tree = 10 # take all, not cut off everything self.cut_off = int(1e6) self.our_and_karma_sm_alignments = {} def get_meta(self, train_source_names: List[str], test_source_names: List[str]) -> Dict: return { "dataset": self.dataset, "use_correct_type": self.use_correct_type, "use_old_semantic_typer": self.use_old_semantic_typer, "num_candidate_semantic_type": self.num_candidate_semantic_type, "multiple_same_property_per_node": self.multiple_same_property_per_node, "coherence": self.coherence, "confidence": self.confidence, "size_reduction": self.size_reduction, "num_candidate_mappings": self.num_candidate_mappings, "mapping_branching_factor": self.mapping_branching_factor, "topk_steiner_tree": self.topk_steiner_tree, "train_source_names": train_source_names, "test_source_names": test_source_names, "cut_off": self.cut_off } def init(self, train_source_names: List[str], test_source_names: List[str]): if self.__has_run_modeling: raise Exception("Cannot call init twice!!") train_source_names = sorted(train_source_names) test_source_names = sorted(test_source_names) assert self.train_sm_ids == train_source_names execution_meta_file = self.exec_dir / "execution-meta.json" lock_file = self.exec_dir / "lock.pid" if lock_file.exists(): raise Exception("Cannot run mohsen method because another process is running") if execution_meta_file.exists(): # only have this file when previous execution is success! 
self.logger.debug("Load information from previous run...") re_executing = False with open(execution_meta_file, 'r') as f: try: meta = ujson.load(f) except ValueError: re_executing = True if re_executing is False: test_source_names = set(meta.pop("test_source_names")) new_meta = self.get_meta(train_source_names, test_source_names) if test_source_names.difference(set(new_meta.pop("test_source_names"))): re_executing = True else: re_executing = meta != new_meta else: re_executing = True if re_executing: self.logger.info("Going to re-execute karma code") self.exec_dir.mkdir(exist_ok=True) with open(lock_file, 'w') as f: f.write(str(os.getpid())) setup_karma(self.dataset, self.exec_dir, self.sm_type_dir) execute_karma_code(self.dataset, self.exec_dir, self.use_correct_type, self.use_old_semantic_typer, self.num_candidate_semantic_type, self.multiple_same_property_per_node, self.coherence, self.confidence, self.size_reduction, self.num_candidate_mappings, self.mapping_branching_factor, self.topk_steiner_tree, self.cut_off, train_source_names, test_source_names) # only have this file when previous execution is success! with open(execution_meta_file, 'w') as f: ujson.dump(self.get_meta(train_source_names, test_source_names), f, indent=4) self.__has_run_modeling = True def karma_model_candidate_generation(self, train_sms: List[SemanticModel], test_sms: List[SemanticModel], n_candidate: int = 1000) -> List[List[KarmaModel]]: if not self.__has_run_modeling: self.init([s.id for s in train_sms], [s.id for s in test_sms]) self.logger.debug("Load previous result...") results = [] karma_models_dir = Path(config.datasets[self.dataset].karma_version.as_path()) / "models-json" for test_sm in test_sms: file_name = "source--%s.json" % test_sm.id predicted_models: List[KarmaModel] = [] if self.use_old_semantic_typer: karma_sm = KarmaModel.load_from_file(self.ont, self.exec_dir / "output" / f"source--{test_sm.id}.original.json") else: karma_sm = KarmaModel.load_from_file(self.ont, karma_models_dir / f"{test_sm.id}-model.json") sm_alignment: SemanticModelAlignment = SemanticModelAlignment(test_sm, karma_sm) with open(self.exec_dir / "output" / file_name, 'r') as f: for i, serialized_sm in enumerate(f): pred_sm = sm_alignment.load_and_align(self.ont, serialized_sm) pred_sm.id = f"{test_sm.id}:::{i}" predicted_models.append(pred_sm) if (i + 1) >= n_candidate: break if len(predicted_models) == 0: karma_graph = KarmaGraph(True, True, True) for dnode in karma_sm.karma_graph.iter_data_nodes(): karma_graph.real_add_new_node(KarmaGraphNode([], [], dnode.literal_type, dnode.is_literal_node), GraphNodeType.DATA_NODE, dnode.label) karma_model = KarmaModel(karma_sm.id, karma_sm.description, karma_sm.source_columns, karma_sm.mapping_to_source_columns, karma_graph) predicted_models = [karma_model] assert len(predicted_models) == len({m.id for m in predicted_models}), "No id duplication" results.append(predicted_models) return results def sm_candidate_generation(self, training_sources: List[SemanticModel], testing_sources: List[SemanticModel]) -> List[List[SemanticModel]]: results = self.karma_model_candidate_generation(training_sources, testing_sources) return [[m.get_semantic_model() for m in predicted_models] for predicted_models in results] def semantic_labeling(self, training_sources: List[SemanticModel], testing_sources: List[SemanticModel]) -> List[ Dict[str, List[KarmaSemanticType]]]: """This method perform """ results = self.karma_model_candidate_generation(training_sources, testing_sources, n_candidate=1) node2stypes 
= [] for test_sm, predicted_models in zip(testing_sources, results): node2stypes.append({node.label.decode("utf-8"): node.learned_semantic_types for node in predicted_models[0].karma_graph.iter_data_nodes()}) return node2stypes def sm_prediction(self, training_sources: List[SemanticModel], testing_sources: List[SemanticModel]) -> List[ SemanticModel]: return [pred_sms[0] for pred_sms in self.sm_candidate_generation(training_sources, testing_sources)]
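# ---------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). `modeler` stands for an
# instance of the modeling class above; `train_sms` and `test_sms` are hypothetical
# lists of SemanticModel objects.
#
#   node2stypes = modeler.semantic_labeling(train_sms, test_sms)
#   # node2stypes[i] maps each attribute label of test_sms[i] to the KarmaSemanticTypes
#   # learned for its data node, e.g. node2stypes[0]["birthDate"].
#
#   predictions = modeler.sm_prediction(train_sms, test_sms)
#   # predictions[i] is the top-ranked candidate SemanticModel for test_sms[i].
# ---------------------------------------------------------------------------------------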
class SemanticTypeDB(object): logger = get_logger('app.semantic_labeling.stype_db') SIMILARITY_METRICS = [ "label_jaccard", "stype_jaccard", "num_ks_test", "num_mann_whitney_u_test", "num_jaccard", "text_jaccard", "text_tf-idf" ] instance = None def __init__(self, dataset: str, train_tables: List[ColumnBasedTable], test_tables: List[ColumnBasedTable]): self.dataset = dataset self.train_tables = train_tables self.test_tables = test_tables self.similarity_matrix: numpy.ndarray = None self.tfidf_db: TfidfDatabase = None self._init() def _init(self): self.source_mappings: Dict[str, SemanticModel] = { s.id: s for s in get_semantic_models(self.dataset) } self.train_columns = [ col for tbl in self.train_tables for col in tbl.columns ] self.train_column_stypes: List[str] = [] for tbl in self.train_tables: sm = self.source_mappings[tbl.id] for col in tbl.columns: dnode = sm.graph.get_node_by_id( sm.get_attr_by_label(col.name).id) dlink = dnode.get_first_incoming_link() self.train_column_stypes.append(dlink.label.decode("utf-8")) self.test_columns = [ col for tbl in self.test_tables for col in tbl.columns ] self.name2table: Dict[str, ColumnBasedTable] = { tbl.id: tbl for tbl in chain(self.train_tables, self.test_tables) } self.col2idx: Dict[str, int] = { col.id: i for i, col in enumerate( chain(self.train_columns, self.test_columns)) } self.col2types: Dict[str, Tuple[str, str]] = {} self.col2dnodes: Dict[str, GraphNode] = {} col: Column for col in chain(self.train_columns, self.test_columns): sm = self.source_mappings[col.table_name] attr = sm.get_attr_by_label(col.name) dnode = sm.graph.get_node_by_id(attr.id) link = dnode.get_first_incoming_link() self.col2types[col.id] = (link.get_source_node().label, link.label) self.col2dnodes[col.id] = dnode assert len(self.col2types) == len(self.train_columns) + len( self.test_columns), "column name must be unique" @staticmethod def create(dataset: str, train_source_ids: List[str]) -> 'SemanticTypeDB': tables = get_sampled_data_tables(dataset) train_source_ids = set(train_source_ids) train_tables = [ ColumnBasedTable.from_table(tbl) for tbl in tables if tbl.id in train_source_ids ] test_tables = [ ColumnBasedTable.from_table(tbl) for tbl in tables if tbl.id not in train_source_ids ] return SemanticTypeDB(dataset, train_tables, test_tables) @staticmethod def get_stype_db(dataset: str, train_source_ids: List[str], cache_dir: Path) -> 'SemanticTypeDB': if SemanticTypeDB.instance is None: cache_file = cache_dir / 'stype_db.pkl' if cache_file.exists(): SemanticTypeDB.logger.debug( "Load SemanticTypeDB from cache file...") stype_db: SemanticTypeDB = deserialize(cache_file) if set(train_source_ids) != { tbl.id for tbl in stype_db.train_tables } or stype_db.dataset != dataset: stype_db = None else: stype_db = None if stype_db is None: SemanticTypeDB.logger.debug( "Have to re-create SemanticTypeDB...") stype_db = SemanticTypeDB.create(dataset, train_source_ids) stype_db._build_db() serialize(stype_db, cache_file) SemanticTypeDB.instance = stype_db return SemanticTypeDB.instance def get_table_by_name(self, name: str) -> ColumnBasedTable: return self.name2table[name] def _build_db(self) -> None: """Build semantic types database from scratch""" n_train_columns = len(self.train_columns) self.logger.debug("Build tfidf database...") self.similarity_matrix = numpy.zeros( (n_train_columns + len(self.test_columns), n_train_columns, len(self.SIMILARITY_METRICS)), dtype=float) self.tfidf_db = TfidfDatabase.create(textual.get_tokenizer(), self.train_columns) 
self.logger.debug("Pre-build tf-idf for all columns") self.tfidf_db.cache_tfidf(self.test_columns) self.logger.debug("Computing similarity matrix...") # loop through train source ids and compute similarity between columns for idx, col in enumerate(self.train_columns): self.logger.trace(" + working on col: %s", col.id) sim_features = self._compute_feature_vectors( col, self.train_columns, self.train_column_stypes) self.similarity_matrix[idx, :, :] = numpy.asarray( sim_features).reshape((n_train_columns, -1)) for idx, col in enumerate(self.test_columns): self.logger.trace(" + working on col: %s", col.id) sim_features = self._compute_feature_vectors( col, self.train_columns, self.train_column_stypes) self.similarity_matrix[idx + n_train_columns, :, :] = numpy.asarray( sim_features).reshape( (n_train_columns, -1)) def _compute_feature_vectors(self, col: Column, refcols: List[Column], refcol_stypes: List[str]): features = [] for i, refcol in enumerate(refcols): features.append([ # name features column_name.jaccard_sim_test(refcol.name, col.name, lower=True), column_name.jaccard_sim_test(refcol_stypes[i], col.name, lower=True), # numeric features numeric.ks_test(refcol, col), numeric.mann_whitney_u_test(refcol, col), numeric.jaccard_sim_test(refcol, col), # text features textual.jaccard_sim_test(refcol, col), textual.cosine_similarity(self.tfidf_db.compute_tfidf(refcol), self.tfidf_db.compute_tfidf(col)), ]) return features # implement pickling def __getstate__(self): return self.dataset, self.train_tables, self.test_tables, self.similarity_matrix def __setstate__(self, state): self.dataset = state[0] self.train_tables = state[1] self.test_tables = state[2] self.similarity_matrix = state[3] self._init()
class TfidfDatabase(object):

    logger = get_logger('app.semantic_labeling.tfidf_db')

    def __init__(self, tokenizer, vocab: Dict[str, int],
                 invert_token_idx: Dict[str, int],
                 col2tfidf: Dict[str, numpy.ndarray]) -> None:
        self.vocab = vocab
        self.invert_token_idx = invert_token_idx
        self.tokenizer = tokenizer
        self.n_docs = len(col2tfidf)
        self.cache_col2tfidf = col2tfidf

    @staticmethod
    def create(tokenizer, columns: List[Column]) -> 'TfidfDatabase':
        vocab = {}
        invert_token_idx: Dict[str, int] = defaultdict(lambda: 0)
        col2tfidf = {}
        token_count = defaultdict(lambda: 0)
        n_docs = len(columns)

        # compute term frequencies first
        with Pool() as p:
            tf_cols = p.map(TfidfDatabase._compute_tf,
                            [(tokenizer, col) for col in columns])

        # then build the vocabulary and the document frequencies for idf
        for tf_col in tf_cols:
            for w in tf_col:
                invert_token_idx[w] += 1
                token_count[w] += 1

        # reduce the vocabulary size: drop rare, purely numeric tokens
        for w in token_count:
            if token_count[w] < 2 and w.isdigit():
                del invert_token_idx[w]
            else:
                vocab[w] = len(vocab)

        # revisit the columns and compute their tf-idf vectors
        for col, tf_col in zip(columns, tf_cols):
            tfidf = numpy.zeros((len(vocab)))
            for w, tf in tf_col.items():
                if w in vocab:
                    tfidf[vocab[w]] = tf * numpy.log(n_docs / (1 + invert_token_idx[w]))
            col2tfidf[col.id] = tfidf

        return TfidfDatabase(tokenizer, vocab, invert_token_idx, col2tfidf)

    def compute_tfidf(self, col: Column):
        if col.id in self.cache_col2tfidf:
            return self.cache_col2tfidf[col.id]

        tfidf = numpy.zeros(len(self.vocab))
        for w, tf in self._compute_tf((self.tokenizer, col)).items():
            if w in self.vocab:
                tfidf[self.vocab[w]] = tf * numpy.log(
                    self.n_docs / (1 + self.invert_token_idx[w]))
        return tfidf

    def cache_tfidf(self, cols: List[Column]):
        cols = [col for col in cols if col.id not in self.cache_col2tfidf]
        with Pool() as p:
            tf_cols = p.map(TfidfDatabase._compute_tf,
                            [(self.tokenizer, col) for col in cols])

        for col, tf_col in zip(cols, tf_cols):
            tfidf = numpy.zeros(len(self.vocab))
            for w, tf in tf_col.items():
                if w in self.vocab:
                    tfidf[self.vocab[w]] = tf * numpy.log(
                        self.n_docs / (1 + self.invert_token_idx[w]))
            self.cache_col2tfidf[col.id] = tfidf

    @staticmethod
    def _compute_tf(args):
        tokenizer, col = args
        counter = Counter()
        sents = (subsent for sent in col.get_textual_data()
                 for subsent in sent.decode('utf-8').split("/"))
        for doc in tokenizer.pipe(sents, batch_size=50, n_threads=4):
            counter.update((str(w) for w in doc))

        # normalize raw counts into term frequencies
        number_of_token = sum(counter.values())
        for token, val in counter.items():
            counter[token] = val / number_of_token
        return counter
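# Illustrative sketch (not part of the original class): comparing two columns via the
# tf-idf vectors produced above. The inline cosine uses plain numpy; the pipeline itself
# relies on textual.cosine_similarity for the same purpose.
def tfidf_cosine(tfidf_db: 'TfidfDatabase', col_a: Column, col_b: Column) -> float:
    a = tfidf_db.compute_tfidf(col_a)
    b = tfidf_db.compute_tfidf(col_b)
    denom = numpy.linalg.norm(a) * numpy.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(a.dot(b) / denom)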
    ParallelBatchExample, Tensor1AccumulatorDict
from gmtk.optimize.numerical_gradient import NumericalGradient
from gmtk.optimize.optimizer import PyTorchOptimizer
from gmtk.tensors import DenseTensorFunc
from semantic_modeling.assembling.learning.shared_models import Example, TrainingArgs
from semantic_modeling.assembling.undirected_graphical_model.model_core import ExampleAnnotator
from semantic_modeling.assembling.undirected_graphical_model.templates.triple_template import TripleFactorTemplate
from semantic_modeling.assembling.undirected_graphical_model.model_extra import TensorBoard, evaluate, \
    get_latest_model_id, move_current_files, save_evaluation_result
from semantic_modeling.assembling.undirected_graphical_model.templates.substructure_template import \
    SubstructureFactorTemplate
from semantic_modeling.config import get_logger
from semantic_modeling.utilities.parallel_util import sequential_map
from semantic_modeling.utilities.serializable import serialize, serializeJSON

logger = get_logger('app.persistent.assembling.train_model')


def nll_func(example: NegativeLogLikelihoodExample):
    loss_val_accum = ValueAccumulator()
    example.accumulate_value_and_gradient(loss_val_accum, None)
    return loss_val_accum.get_value()


def train_model(dataset: str, train_sids: List[str], manual_seed: int,
                train_examples: List[Example], test_examples: List[Example],
                args: TrainingArgs, basedir: Path):
    DenseTensorFunc.manual_seed(manual_seed)
    tf_domain = GrowableBinaryVectorDomain()
class MinhptxSemanticLabeling(object): logger = get_logger("app.minhptx_iswc2016") def __init__(self, dataset: str, max_n_records: int = float('inf'), is_sampling: bool = False, exec_dir: Optional[Union[Path, str]] = None) -> None: self.dataset: str = dataset self.ont: Ontology = get_ontology(dataset) self.max_n_records: int = max_n_records self.is_sampling: bool = is_sampling assert not is_sampling, "Not implemented" self.source_ids: Set[str] = { file.stem for file in Path( config.datasets[dataset].data.as_path()).iterdir() if file.is_file() and not file.name.startswith(".") } if exec_dir is None: exec_dir = Path( config.fsys.debug.as_path()) / dataset / "minhptx_iswc2016" self.exec_dir: Path = Path(exec_dir) self.meta_file: Path = self.exec_dir / "execution-meta.json" self.input_dir: Path = self.exec_dir / "input" self.input_dir.mkdir(parents=True, exist_ok=True) self.output_dir: Path = self.exec_dir / "output" self.output_dir.mkdir(parents=True, exist_ok=True) def get_meta(self, train_source_ids: Set[str], test_source_ids: Set[str]): return { "dataset": self.dataset, "max_n_records": self.max_n_records, "is_sampling": self.is_sampling, "source_ids": self.source_ids, "input_dir": str(self.input_dir), "output_dir": str(self.output_dir), "training_sources": train_source_ids, "testing_sources": test_source_ids } def _semantic_labeling( self, train_source_ids: Set[str], test_source_ids: Set[str] ) -> Dict[str, MinhptxSemanticLabelingResult]: """Generate semantic labeling for test_sources using train_sources""" need_reexec = True if Path(self.meta_file).exists(): # read meta and compare if previous run is compatible with current run self.logger.debug("Load information from previous run...") meta = deserializeJSON(self.meta_file) meta["training_sources"] = set(meta["training_sources"]) meta["testing_sources"] = set(meta["testing_sources"]) meta["source_ids"] = set(meta['source_ids']) new_meta = self.get_meta(train_source_ids, test_source_ids) if len( new_meta.pop("testing_sources").difference( meta.pop("testing_sources"))) == 0: if new_meta == meta: need_reexec = False if need_reexec: self.logger.debug("Re-execute semantic labeling...") try: # preparing data, want to compute semantic models for all sources in dataset data_dir = Path(config.datasets[self.dataset].data.as_path()) model_dir = Path( config.datasets[self.dataset].models_json.as_path()) shutil.rmtree(str(self.input_dir)) for fpath in self.output_dir.iterdir(): os.remove(fpath) [(self.input_dir / x / y).mkdir(parents=True, exist_ok=True) for x in ["%s_train" % self.dataset, "%s_test" % self.dataset] for y in ["data", "model"]] input_train_dir = self.input_dir / ("%s_train" % self.dataset) input_test_dir = self.input_dir / ("%s_test" % self.dataset) for fpath in sorted(data_dir.iterdir()): model_fname = fpath.stem + "-model.json" if fpath.stem in train_source_ids: self._copy_data(fpath, input_train_dir / "data" / fpath.name) # seriaalize the model instead of copied because we want to convert uri to simplified uri # instead of full uri (e.g karma:classLink). 
Full URI doesn't work in this app serializeJSON(KarmaModel.load_from_file( self.ont, model_dir / model_fname).to_normalized_json_model(), input_train_dir / "model" / f"{fpath.name}.model.json", indent=4) if fpath.stem in test_source_ids: self._copy_data(fpath, input_test_dir / "data" / fpath.name) # same reason like above serializeJSON(KarmaModel.load_from_file( self.ont, model_dir / model_fname).to_normalized_json_model(), input_test_dir / "model" / f"{fpath.name}.model.json", indent=4) invoke_command(" ".join([ config.previous_works.minhptx_iswc2016.cli.as_path(), str(self.input_dir), str(self.output_dir), "--train_dataset", "%s_train" % self.dataset, "--test_dataset", "%s_test" % self.dataset, "--evaluate_train_set", "True", "--reuse_rf_model", "False" ]), output2file=self.exec_dir / "execution.log") except Exception: sys.stdout.flush() self.logger.exception( "Error while preparing and invoking semantic labeling api..." ) raise serializeJSON(self.get_meta(train_source_ids, test_source_ids), self.meta_file, indent=4) # load result self.logger.debug("Load previous result...") output_files = [ fpath for fpath in self.output_dir.iterdir() if fpath.suffix == ".json" ] assert len(output_files) == 2 app_result: Dict[str, MinhptxSemanticLabelingResult] = deserializeJSON( output_files[0], Class=MinhptxSemanticLabelingResult) app_result.update( deserializeJSON(output_files[1], Class=MinhptxSemanticLabelingResult)) return { source_id: app_result[source_id] for source_id in chain(test_source_ids, train_source_ids) } def _copy_data(self, fsource: Path, fdest: Path) -> None: if self.max_n_records == float('inf'): shutil.copyfile(str(fsource), str(fdest)) return if fsource.suffix == ".csv": with open(fsource, "r") as f, open(fdest, "w") as g: for i, line in enumerate(f): if i > self.max_n_records: break g.write(line) else: assert False, "Not support file type: %s" % fsource.suffix def semantic_labeling(self, train_sources: List[SemanticModel], test_sources: List[SemanticModel], top_n: int) -> None: """Generate semantic labeling, and store it in test_sources""" train_source_ids = {s.id for s in train_sources} test_source_ids = {s.id for s in test_sources} assert len(train_source_ids.intersection(test_source_ids)) == 0 result = self._semantic_labeling(train_source_ids, test_source_ids) # dump result into test_sources for source in chain(train_sources, test_sources): for col in source.attrs: try: if col.label not in result[source.id].columns: # this column is ignored stypes = [] else: stypes = result[source.id].columns[col.label] col.semantic_types = [ KarmaSemanticType(col.id, stype.domain, stype.type, "Minhptx-ISWC2016-SemanticLabeling", stype.weight) for stype in stypes ][:top_n] except Exception: self.logger.exception( "Hit exception for source: %s, col: %s", source.get_id(), col.id) raise
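# ---------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module); `dataset`, `train_sms`
# and `test_sms` are hypothetical placeholders.
#
#   labeler = MinhptxSemanticLabeling(dataset)
#   labeler.semantic_labeling(train_sms, test_sms, top_n=4)
#   # The call mutates the sources in place: every attribute `col` of the train and test
#   # sources now carries `col.semantic_types`, a list of at most `top_n`
#   # KarmaSemanticType objects built from the external labeler's output weights.
# ---------------------------------------------------------------------------------------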
class SemanticTypeAssistant(object): """We use semantic type to help justify if class C (not data node) should link to class A or class B Score is a potential gain if switching to another class (for example: potential gain if C link to B instead of A (currently C link to A)) """ logger = get_logger("app.weak_models.stype_assistant") def __init__(self, train_sms: List[SemanticModel], typer: SemanticTyper, triple_adviser: TripleAdviser): self.train_sms = {sm.id: sm for sm in train_sms} self.stype_db = typer.stype_db self.triple_adviser = triple_adviser # # contain a mapping from (s, p, o) => table.id, and node which are mounted in SM by o # self.parent_stype_index: Dict[Tuple[bytes, bytes, bytes], List[Tuple[str, int]]] = {} # for train_sm in train_sms: # for n in train_sm.graph.iter_nodes(): # for e in n.iter_outgoing_links(): # target = e.get_target_node() # index_key = (n.label, e.label, target.label) # if index_key not in self.parent_stype_index: # self.parent_stype_index[index_key] = [] # self.parent_stype_index[index_key].append((train_sm.id, target.id)) # contain a mapping from (semantic types & parent stypes (s, p, o) to columns self.column_stype_index: Dict[bytes, Dict[Tuple[bytes, bytes, bytes], List[Column]]] = {} for train_sm in train_sms: table = self.stype_db.get_table_by_name(train_sm.id) for dnode in train_sm.graph.iter_data_nodes(): dlink = dnode.get_first_incoming_link() pnode = dlink.get_source_node() # stype = (pnode.label, dlink.label) plink = pnode.get_first_incoming_link() if plink is None: # this is a root node continue parent_stype = (plink.get_source_node().label, plink.label, pnode.label) if pnode.label not in self.column_stype_index: self.column_stype_index[pnode.label] = {} if parent_stype not in self.column_stype_index[pnode.label]: self.column_stype_index[pnode.label][parent_stype] = [] column = table.get_column_by_name(dnode.label.decode("utf-8")) self.column_stype_index[pnode.label][parent_stype].append(column) # possible_mount of a node self.possible_mounts: Dict[bytes, List[Tuple[bytes, bytes]]] = {} for train_sm in train_sms: for n in train_sm.graph.iter_class_nodes(): if n.label not in self.possible_mounts: self.possible_mounts[n.label] = self.triple_adviser.get_subj_preds(n.label) # contains the likelihood between 2 columns X = self.stype_db.similarity_matrix.reshape((-1, self.stype_db.similarity_matrix.shape[-1])) similarity_matrix = typer.model.predict_proba(X)[:, 1] self.similarity_matrix = similarity_matrix.reshape(self.stype_db.similarity_matrix.shape[:-1]) # mapping from column's name to column's index self.name2cols: Dict[bytes, Dict[bytes, int]] = {} tbl: ColumnBasedTable for tbl in chain(self.stype_db.train_tables, self.stype_db.test_tables): self.name2cols[tbl.id] = {} for col in tbl.columns: self.name2cols[tbl.id][col.name.encode('utf-8')] = self.stype_db.col2idx[col.id] self.logger.debug("Finish building index for semantic type assistant...") def compute_prob(self, sm_id: str, g: Graph) -> Dict[int, float]: link2features = {} graph_observed_mounts = set() graph_observed_class_lbls = set() name2col_idx = self.name2cols[sm_id] parent_nodes: Dict[int, Tuple[GraphNode, Tuple[bytes, bytes]]] = {} for dnode in g.iter_data_nodes(): dlink = dnode.get_first_incoming_link() col_idx = name2col_idx[dnode.label] if dlink.source_id not in parent_nodes: pnode = dlink.get_source_node() plink = pnode.get_first_incoming_link() if plink is None: continue pstype = (plink.get_source_node().label, plink.label) # add pstype to observed mounts 
graph_observed_mounts.add(pstype) parent_nodes[dlink.source_id] = (pnode, plink, pstype, [dlink], [col_idx]) else: parent_nodes[dlink.source_id][-2].append(dlink) parent_nodes[dlink.source_id][-1].append(col_idx) for pnode in g.iter_class_nodes(): graph_observed_class_lbls.add(pnode.label) for pnode, plink, pstype, dlinks, col_idxs in parent_nodes.values(): # map from possible mount => scores of each columns parent_stype_score: Dict[Tuple[bytes, bytes], List[float]] = {} # filter out all possible mounts that present in the graph (except the current one), # but the domain of the mounts are not in the graph possible_mounts = [ possible_mount for possible_mount in self.possible_mounts.get(pnode.label, []) if not ((possible_mount in graph_observed_mounts and possible_mount != pstype) or possible_mount[0] not in graph_observed_class_lbls) ] if len(possible_mounts) > 1: # the number only make sense if there are another place to mount this object to for possible_mount in possible_mounts: spo = (possible_mount[0], possible_mount[1], pnode.label) scores = [] for i, col_idx in enumerate(col_idxs): # stype = (pnode.label, dlinks[i].label) refcols = self.column_stype_index[pnode.label][spo] best_score = max( self.similarity_matrix[col_idx, self.stype_db.col2idx[refcol.id]] for refcol in refcols) scores.append(best_score) parent_stype_score[possible_mount] = scores aggregation_score = {mount: sum(scores) / len(scores) for mount, scores in parent_stype_score.items()} else: aggregation_score = {} if pstype not in aggregation_score: link2features[plink.id] = None else: link2features[plink.id] = aggregation_score.pop(pstype) - max(aggregation_score.values()) return link2features
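# ---------------------------------------------------------------------------------------
# Worked toy example of the score computed above (illustrative, made-up numbers).
# Suppose a class node C is currently mounted under (A, p1) and could also be mounted
# under (B, p2), and the per-mount averaged column similarities are:
#
#   aggregation_score = {(A, p1): 0.42, (B, p2): 0.67}
#
# The feature stored for C's incoming link is 0.42 - 0.67 = -0.25: a negative value
# means the columns under C look more like training columns mounted at (B, p2) than at
# the current mount, i.e. there is a potential gain from switching. If the current mount
# never appears among the indexed training mounts, the feature is None.
# ---------------------------------------------------------------------------------------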
class ColumnBasedTable(object):

    logger = get_logger('app.semantic_labeling.data_table')

    def __init__(self, id: str, columns: List[Column]) -> None:
        self.id = id
        self.columns: List[Column] = columns
        self.name2colidx: Dict[str, int] = {
            col.name: idx for idx, col in enumerate(columns)
        }

    def get_column_by_name(self, name: str):
        return self.columns[self.name2colidx[name]]

    @staticmethod
    def from_table(tbl: DataTable) -> 'ColumnBasedTable':
        columns = []
        for cname in tbl.schema.get_attr_paths():
            type_stats = {
                ctype: 0.0
                for ctype in [ColumnType.NUMBER, ColumnType.STRING, ColumnType.NULL]
            }
            col_values = []
            for row in tbl.rows:
                get_col_values(cname.split(Schema.PATH_DELIMITER), row, col_values)
            col_values = [norm_val(val, empty_as_null=True) for val in col_values]

            # compute the fraction of values falling into each type
            for val in col_values:
                type_stats[get_type(val)] += 1
            for key, val in type_stats.items():
                type_stats[key] = val / len(col_values)

            # decide the type of this column using a simple heuristic
            if type_stats[ColumnType.STRING] > type_stats[ColumnType.NUMBER]:
                col_type = ColumnType.STRING
            else:
                if type_stats[ColumnType.NULL] < 0.7 and (
                        type_stats[ColumnType.NUMBER] +
                        type_stats[ColumnType.NULL]) < 0.9:
                    col_type = ColumnType.STRING
                elif type_stats[ColumnType.NUMBER] > type_stats[
                        ColumnType.STRING] and (
                            type_stats[ColumnType.NUMBER] +
                            type_stats[ColumnType.NULL]) > 0.9:
                    col_type = ColumnType.NUMBER
                else:
                    if all(val is None for val in col_values):
                        col_type = ColumnType.NULL
                    else:
                        ColumnBasedTable.logger.error(
                            "Cannot decide type with the stats: %s",
                            ujson.dumps(type_stats, indent=4))
                        raise Exception(
                            f"Cannot decide type of column: {cname} in {tbl.id}")

            column = Column(tbl.id, cname, col_type, len(col_values), type_stats)
            column.value = ColumnData(col_values)
            columns.append(column)

        return ColumnBasedTable(tbl.id, columns)

    def to_dict(self):
        return {
            "id": self.id,
            "columns": [col.to_dict() for col in self.columns]
        }

    @staticmethod
    def from_dict(val) -> 'ColumnBasedTable':
        return ColumnBasedTable(
            val["id"], [Column.from_dict(col) for col in val["columns"]])

    # implement pickling
    def __getstate__(self):
        return self.to_dict()

    def __setstate__(self, state):
        obj = ColumnBasedTable.from_dict(state)
        self.__dict__ = obj.__dict__
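# ---------------------------------------------------------------------------------------
# Worked toy example of the column-type heuristic above (illustrative numbers only).
# For a column with type_stats = {NUMBER: 0.85, STRING: 0.05, NULL: 0.10}:
#   - STRING (0.05) is not greater than NUMBER (0.85), so we take the else branch;
#   - NULL < 0.7 but NUMBER + NULL = 0.95 is not < 0.9, so it is not forced to STRING;
#   - NUMBER > STRING and NUMBER + NULL > 0.9, so the column is typed as NUMBER.
# A column whose values are all None becomes NULL; anything the rules cannot classify
# raises an exception so the ambiguity is surfaced early.
# ---------------------------------------------------------------------------------------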
from semantic_modeling.karma.karma import KarmaModel
from semantic_modeling.karma.semantic_model import SemanticModel
from semantic_modeling.utilities.ontology import Ontology
from semantic_modeling.utilities.serializable import deserialize, deserializeJSON, serialize, serializeJSON
from transformation.models.data_table import DataTable
from transformation.r2rml.r2rml import R2RML

_data_io_vars = {
    "ont": {},
    "karma_models": {},
    "semantic_models": {},
    "data_tables": {},
    "raw_data_tables": {},
    "sampled_data_tables": {}
}
_logger = get_logger("app.data_io")


def get_ontology(dataset: str) -> Ontology:
    """Get the ontology of a given dataset"""
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # not loaded in this process yet; check the on-disk cache first
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont
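# Illustrative usage (the dataset name is a hypothetical placeholder):
#
#   ont = get_ontology("museum_crm")
#
# The first call for a dataset builds (or unpickles) the Ontology and keeps it in the
# module-level _data_io_vars["ont"] cache, so repeated calls within one process are
# cheap; the pickle under get_cache_dir(dataset) avoids re-parsing across runs.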
        self.source_id = source_id
        self.n_attrs = n_attrs
        self.discovering_func = discovering_func
        self.tracker: Tracker = tracker
        self.early_terminate_func: Callable[[int, Iterable[SearchNode]], bool] = early_terminate_func

    def should_stop(self, n_iter: int, current_nodes: Iterable[SearchNode]) -> bool:
        if self.early_terminate_func is None:
            return False
        return self.early_terminate_func(n_iter, current_nodes)


_logger = get_logger('app.assembling.beam_search')


# @profile
def beam_search(starts: List[SearchNode], beam_width: int, n_results: int,
                args: BeamSearchArgs) -> List[SearchNode]:
    global _logger
    assert beam_width >= len(starts)

    # store the search results: a map from the id of a node's value => node,
    # used to eliminate duplicated results
    results: Dict[str, SearchNode] = {}

    # ##############################################
    # Add the very first nodes to kick off BEAM SEARCH
    current_exploring_nodes: Dict[str, SearchNode] = OrderedDict()
    for n in starts:
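# ---------------------------------------------------------------------------------------
# The function body continues beyond this excerpt. For orientation only, a generic beam
# search with this interface proceeds roughly as follows (a sketch, not the original
# implementation; accessor names such as get_score() are assumptions):
#
#   repeat until current_exploring_nodes is empty or args.should_stop(n_iter, ...) fires:
#       expand every frontier node with args.discovering_func, deduplicate children by
#       the id of their value, keep the best `beam_width` children as the next frontier,
#       and move completed nodes into `results`;
#   finally return the top `n_results` entries of `results` ordered by get_score().
# ---------------------------------------------------------------------------------------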