def main():
    """Build the PSL model, run inference, and persist the results."""
    model = Model(MODEL_NAME)

    # Declare the model's predicates and rules before inferring.
    add_predicates(model)
    add_rules(model)

    # Infer values for the open predicates and dump them to disk.
    inferred = infer(model)
    write_results(inferred, model)
def main():
    """Learn rule weights, report the learned rules, then run inference."""
    model = Model(MODEL_NAME)

    # The predicates are returned so weight learning can reference them.
    link_pred, hascat_pred = add_predicates(model)
    add_rules(model)

    # Weight Learning
    learn(model, link_pred, hascat_pred)

    print('Learned Rules:')
    for learned_rule in model.get_rules():
        print(' ' + str(learned_rule))

    # Inference with the freshly learned weights.
    inferred = infer(model, link_pred, hascat_pred)
    write_results(inferred, model)
def create_model_from_config_dir(name, dir_path):
    """Assemble a Model named *name* from predicate.txt / rule.txt in *dir_path*."""
    config_dir = Path(dir_path)

    model = Model(name)

    # Predicates first: the rules parsed below may refer to them.
    for pred in PSLUtils.read_predicates_from_file(config_dir / "predicate.txt"):
        model.add_predicate(pred)
    for parsed_rule in PSLUtils.read_rules_from_file(config_dir / "rule.txt"):
        model.add_rule(parsed_rule)

    return model
    # NOTE(review): stray tail of a function whose `def` line is outside this
    # chunk -- left untouched.
    return df


def write_results(results, model):
    """Write one TSV of inferred values per open predicate into out_dir."""
    out_dir = "."
    os.makedirs(out_dir, exist_ok=True)
    for predicate in model.get_predicates().values():
        # Closed predicates are fully observed; only open ones have results.
        if not predicate.closed():
            out_path = os.path.join(out_dir, "%s.txt" % (predicate.name()))
            results[predicate].to_csv(out_path, sep="\t", header=False, index=False)


if __name__ == "__main__":
    model = Model("simple-acquaintances")
    add_predicates(model)
    add_rules(model)

    # inference
    results = infer(model)
    write_results(results, model)

    # save intermediate results
    # NOTE(review): ic() is icecream's debug printer -- presumably imported at
    # the top of the full file; confirm.
    out_path = pathlib.Path("knows_obs.tsv")
    ic(out_path)
    df = trace_predicate("Knows", Partition.OBSERVATIONS, out_path)
    ic(df)
class PSL:
    """
    Class that performs inference using PSL.
    """

    def __init__(self, relations, data_dir='data', working_dir='.temp',
                 learner='mle', logger=None):
        """
        Initialization of joint inference class.

        Parameters
        ----------
        relations : list
            Relations to use for relational modeling.
        data_dir : str (default='data')
            Directory with input relational data (read via
            util.get_relational_entities).
        working_dir : str (default='.temp')
            Temporary directory to store intermediate files.
        learner : str (default='mle')
            Weight learning optimizer,
            'mle': Maximum Likelihood,
            'gpp': Gaussian Process Prior (uses the Ranking Estimator).
        logger : object (default=None)
            Logger for logging output.
        """
        self.relations = relations
        self.data_dir = data_dir
        self.working_dir = working_dir
        self.learner = learner
        self.logger = logger

    # public
    def fit(self, y, y_hat, target_ids):
        """
        Train a PSL model.
            y: true labels for target nodes. shape: (n_samples,).
            y_hat: priors for target nodes. shape: (n_samples,).
            target_ids: list of target ids. shape: (n_samples,).
        Returns self.
        """
        # create model
        self.model_ = Model('spam')
        self._add_predicates(self.model_, self.relations)
        self._add_rules(self.model_, self.relations)

        # add data
        result = util.get_relational_entities(y_hat=y_hat,
                                              target_ids=target_ids,
                                              relations=self.relations,
                                              data_dir=self.data_dir,
                                              logger=self.logger)
        target_priors, relations_dict, target_col = result
        self._add_data(self.model_, target_col=target_col,
                       target_priors=target_priors,
                       relations_dict=relations_dict, y=y)

        # start timing
        start = time.time()

        if self.logger:
            self.logger.info('[PSL] training...')

        # learning settings
        optimizer = ''
        # BUG FIX: the -D config key was misspelled 'parallel.numthreahds',
        # so the single-thread setting was silently ignored during learning;
        # _group_inference already uses the correct 'parallel.numthreads'.
        additional_cli_options = ['--h2path', os.path.abspath(self.working_dir),
                                  '-D', 'parallel.numthreads=1']

        if self.learner == 'gpp':
            optimizer = 'GaussianProcessPrior'
            additional_cli_options = ['-e', 'RankingEvaluator']

        # train
        # NOTE: 'additional_cli_optons' (sic) matches pslpython's own spelling
        # of this keyword argument -- do not "fix" it.
        self.model_.learn(method=optimizer,
                          additional_cli_optons=additional_cli_options,
                          temp_dir=self.working_dir,
                          jvm_options=JVM_OPTIONS,
                          logger=self.logger)

        if self.logger:
            self.logger.info('[PSL] time: {:.3f}s'.format(time.time() - start))
            self.logger.info('[PSL] learned rules:')
            for rule in self.model_.get_rules():
                self.logger.info(' ' + str(rule))

        return self

    def inference(self, y_hat, target_ids):
        """
        Joint inference using PSL.
            y_hat: priors for target nodes. shape: (n_samples,).
            target_ids: list of target ids. shape: (n_samples,).
        Returns predictions with shape=(len(y_hat),).
        """
        assert self.model_

        # add data
        target_priors, relations_dict, target_col = util.get_relational_entities(
            y_hat=y_hat, target_ids=target_ids, relations=self.relations,
            data_dir=self.data_dir, logger=self.logger)

        y_score = self._group_inference(target_priors, relations_dict)
        return y_score

    # private
    def _group_inference(self, target_priors, relations_dict,
                         target_col='com_id', max_size=7500, max_edges=40000):
        """
        Run inference over clusters of connected components
        to reduce memory and runtime.

        NOTE(review): the max_size/max_edges parameters are never read; the
        module-level MAX_CLUSTER_SIZE / MAX_EDGES constants are used instead.
        Left as-is to avoid changing behavior -- confirm which is intended.
        """
        result_df = pd.DataFrame(target_priors, columns=[target_col, 'ind_yhat'])

        clusters = connections.create_clusters(target_priors, relations_dict,
                                               max_size=MAX_CLUSTER_SIZE,
                                               max_edges=MAX_EDGES,
                                               logger=self.logger)

        # Run inference over each cluster
        results = []
        for i, (msg_nodes, hub_nodes, relations, n_edges) in enumerate(clusters):
            start = time.time()

            # filter target IDs: msg node names look like '<prefix>-<id>'
            cluster_target_ids = [int(x.split('-')[1]) for x in msg_nodes]
            temp_df = result_df[result_df[target_col].isin(cluster_target_ids)]
            cluster_target_priors = list(zip(temp_df[target_col],
                                             temp_df['ind_yhat']))

            self._add_data(self.model_, target_col=target_col,
                           target_priors=cluster_target_priors,
                           relations_dict=relations_dict)

            additional_cli_options = ['--h2path',
                                      os.path.abspath(self.working_dir),
                                      '-D', 'parallel.numthreads=1']

            # 'additional_cli_optons' (sic) matches pslpython's own spelling.
            result_dict = self.model_.infer(
                temp_dir=self.working_dir,
                additional_cli_optons=additional_cli_options,
                logger=self.logger,
                jvm_options=JVM_OPTIONS)

            # get updated scores
            yhat_df = result_dict[self.model_.get_predicate('spam_msg')]
            yhat_df.columns = [target_col, 'pgm_yhat']
            results.append(yhat_df)

            if self.logger:
                s = '[CLUSTER {} / {}] msgs: {}, hubs: {}, edges: {}, time: {:.3f}s'
                self.logger.info(s.format(i + 1, len(clusters), len(msg_nodes),
                                          len(hub_nodes), n_edges,
                                          time.time() - start))

        # put updated scores in order of target IDs; a target appearing in
        # several clusters gets the mean of its per-cluster scores
        yhat_df = pd.concat(results)
        yhat_df = yhat_df.groupby(target_col).mean().reset_index()
        result_df = result_df.merge(yhat_df, on=target_col, how='left')

        # fill independent target ID nodes with independent predictions
        result_df['pgm_yhat'] = result_df['pgm_yhat'].fillna(
            result_df['ind_yhat'])

        # put scores into sklearn format: columns [P(neg), P(pos)]
        result_df['pgm_yhat_neg'] = 1 - result_df['pgm_yhat']
        y_score = np.hstack([
            result_df['pgm_yhat_neg'].values.reshape(-1, 1),
            result_df['pgm_yhat'].values.reshape(-1, 1)
        ])
        assert len(y_score) == len(target_priors)

        return y_score

    def _add_predicates(self, model, relations):
        """
        Add predicates based on the given relations.
        """
        model.add_predicate(Predicate('spam_msg', closed=False, size=1))
        model.add_predicate(Predicate('prior_msg', closed=True, size=1))
        for relation in relations:
            model.add_predicate(
                Predicate('spam_{}'.format(relation), closed=False, size=1))
            model.add_predicate(
                Predicate('has_{}'.format(relation), closed=True, size=2))

    def _add_rules(self, model, relations):
        """
        Add rules connecting entities together.
        """
        # negative prior on spam, plus the independent-prior rule
        model.add_rule(Rule('1.0: ~spam_msg(M) ^2'))
        model.add_rule(Rule('1.0: prior_msg(M) -> spam_msg(M) ^2'))
        for relation in relations:
            var = relation[0].upper()
            # spam propagates in both directions between a message and any
            # hub entity (user, link, ...) it shares a relation with
            r1 = '1.0: has_{}(M, {}) & spam_{}({}) -> spam_msg(M) ^2'
            r2 = '1.0: has_{}(M, {}) & spam_msg(M) -> spam_{}({}) ^2'
            model.add_rule(Rule('1.0: ~spam_{}({}) ^2'.format(relation, var)))
            model.add_rule(Rule(r1.format(relation, var, relation, var)))
            model.add_rule(Rule(r2.format(relation, var, relation, var)))

    def _add_data(self, model, target_col, target_priors, relations_dict,
                  y=None, sep='\t'):
        """
        Add predicate data.
            Observations: observed values for closed predicates and open predicates.
            Targets: Predicate targets we want to infer values for.
            Truth: Labels of some target predicates for training.
        """
        # clear any data
        for predicate in model.get_predicates().values():
            predicate.clear_data()

        # organize targets
        target_df = pd.DataFrame(target_priors, columns=[target_col, 'y_hat'])

        # filepaths
        # NOTE(review): prior_msg data is written to 'spam_msg.tsv' -- the
        # variable/file naming mismatch looks accidental; confirm.
        prior_msg_fp = os.path.join(self.working_dir, 'spam_msg.tsv')
        spam_msg_nolabel_fp = os.path.join(self.working_dir,
                                           'spam_msg_nolabel.tsv')

        # create data files
        target_df.to_csv(prior_msg_fp, columns=[target_col, 'y_hat'],
                         sep=sep, header=None, index=None)
        target_df.to_csv(spam_msg_nolabel_fp, columns=[target_col],
                         sep=sep, header=None, index=None)

        # add data to the model
        model.get_predicate('prior_msg').add_data_file(Partition.OBSERVATIONS,
                                                       prior_msg_fp)
        model.get_predicate('spam_msg').add_data_file(Partition.TARGETS,
                                                      spam_msg_nolabel_fp)

        # add relational data to the model
        for relation_id, relation_list in relations_dict.items():
            relation = relation_id.split('_')[0]

            # organize data: keep only edges touching a current target
            relation_df = pd.DataFrame(relation_list,
                                       columns=[relation_id, target_col],
                                       dtype=int)
            relation_df = relation_df[relation_df[target_col].isin(
                target_df[target_col])]
            hub_df = relation_df.drop_duplicates(subset=[relation_id])

            if len(relation_df) == 0:
                continue

            # filepaths
            relation_fp = os.path.join(self.working_dir,
                                       'has_{}.tsv'.format(relation))
            hub_fp = os.path.join(self.working_dir,
                                  'spam_{}.tsv'.format(relation))

            # create files
            relation_df.to_csv(relation_fp, columns=[target_col, relation_id],
                               sep=sep, header=None, index=None)
            hub_df.to_csv(hub_fp, columns=[relation_id],
                          sep=sep, header=None, index=None)

            # add data
            model.get_predicate('has_{}'.format(relation)).add_data_file(
                Partition.OBSERVATIONS, relation_fp)
            model.get_predicate('spam_{}'.format(relation)).add_data_file(
                Partition.TARGETS, hub_fp)

        # add labeled data for weight learning
        if y is not None:
            spam_msg_label_fp = os.path.join(self.working_dir,
                                             'spam_msg_label.tsv')
            label_df = pd.DataFrame(list(zip(target_df[target_col], y)),
                                    columns=[target_col, 'y'])
            label_df.to_csv(spam_msg_label_fp, columns=[target_col, 'y'],
                            sep=sep, header=None, index=None)
            model.get_predicate('spam_msg').add_data_file(
                Partition.TRUTH, spam_msg_label_fp)
def run():
    """Build the acquaintances model, load its data, and run inference."""
    model = Model(MODEL_NAME)

    # Declare predicates: Knows is open (to be inferred), the rest observed.
    knows = Predicate('Knows', closed=False, size=2)
    likes = Predicate('Likes', closed=True, size=2)
    lived = Predicate('Lived', closed=True, size=2)
    for pred in (knows, likes, lived):
        model.add_predicate(pred)

    # Attach data files, preserving the original load order.
    data_files = (
        (knows, Partition.OBSERVATIONS, 'knows_obs.txt'),
        (lived, Partition.OBSERVATIONS, 'lived_obs.txt'),
        (likes, Partition.OBSERVATIONS, 'likes_obs.txt'),
        (knows, Partition.TARGETS, 'knows_targets.txt'),
        (knows, Partition.TRUTH, 'knows_truth.txt'),
    )
    for pred, partition, filename in data_files:
        pred.add_data_file(partition, os.path.join(DATA_DIR, filename))

    # Weighted rules plus one hard symmetry constraint (the '.' rule).
    rule_texts = (
        '20: Lived(P1, L) & Lived(P2, L) & (P1 != P2) -> Knows(P1, P2) ^2',
        '5: Lived(P1, L1) & Lived(P2, L2) & (P1 != P2) & (L1 != L2) -> !Knows(P1, P2) ^2',
        '10: Likes(P1, L) & Likes(P2, L) & (P1 != P2) -> Knows(P1, P2) ^2',
        '5: Knows(P1, P2) & Knows(P2, P3) & (P1 != P3) -> Knows(P1, P3) ^2',
        'Knows(P1, P2) = Knows(P2, P1) .',
        '5: !Knows(P1, P2) ^2',
    )
    for text in rule_texts:
        model.add_rule(Rule(text))

    # Run Inference
    return model.infer(psl_config=ADDITIONAL_PSL_OPTIONS)
def makeModel(model_name, addPrior=True, sim=False):
    """Create a trust model whose predicate set depends on *model_name*.

    NOTE(review): addPrior and sim are accepted but never read in this
    function -- confirm whether callers still need them.
    """
    model = Model(model_name)

    # Core predicates shared by every model variant.
    for base_pred in (Predicate("Trusts", size=2, closed=False),
                      Predicate("Knows", size=2, closed=True),
                      Predicate("Prior", size=1, closed=True)):
        model.add_predicate(base_pred)

    # Personality variants get open trusting/trustworthy predicates.
    personality_variants = {"triad-personality", "personality-similarity",
                            "triad-pers-sim", "personality"}
    if model_name in personality_variants:
        model.add_predicate(Predicate("Trusting", size=1, closed=False))
        model.add_predicate(Predicate("TrustWorthy", size=1, closed=False))

    # Similarity variants get the observed taste-similarity predicate.
    similarity_variants = {"similarity", "triad-similarity",
                           "personality-similarity", "triad-pers-sim"}
    if model_name in similarity_variants:
        model.add_predicate(Predicate("SameTastes", size=2, closed=True))

    return model
    # NOTE(review): tail of a function whose `def` line is outside this chunk
    # -- left untouched. 'additional_cli_optons' (sic) is pslpython's own
    # spelling of the keyword.
    df = model.infer(additional_cli_optons=ADDITIONAL_CLI_OPTIONS,
                     psl_config=ADDITIONAL_PSL_OPTIONS)
    return df


def write_results(results, model):
    """Write one TSV of inferred values per open predicate into out_dir."""
    out_dir = "."
    os.makedirs(out_dir, exist_ok=True)
    for predicate in model.get_predicates().values():
        # Closed predicates are fully observed; only open ones have results.
        if not predicate.closed():
            out_path = os.path.join(out_dir, "%s.txt" % (predicate.name()))
            results[predicate].to_csv(out_path, sep="\t", header=False,
                                      index=False)


if (__name__ == "__main__"):
    model = Model("simple-acquaintances")
    add_predicates(model)
    add_rules(model)
    results = infer(model)
    write_results(results, model)
    # NOTE(review): reaches into the predicate's private `_data` attribute --
    # fragile against pslpython API changes.
    print(model.get_predicate("Likes")._data[Partition.OBSERVATIONS])