def segmented_evaluation(file_path, categorize=None):
    queries = []
    with open(file_path, 'r') as f:
        for line in util.verboserate(f):
            items = line.split('\t')
            s, r, t = items[0], tuple(items[1].split(',')), items[2]
            q = PathQuery(s, r, t)
            q.quantile = float(items[3])
            q.num_candidates = int(items[4])
            queries.append(q)

    def single_relation(query):
        if len(query.r) != 1:
            return False
        r = query.r[-1]
        if inverted(r):
            return False
        return r

    # group queries
    if categorize is None:
        categorize = single_relation
    groups = util.group(queries, categorize)

    def inv_sigmoid(y):
        return -np.log(1.0 / y - 1)

    print 'computing grouped stats'
    stats = {}
    for key, group in util.verboserate(groups.iteritems()):
        scores = [q.quantile for q in group]
        score = np.nanmean(scores)
        stats[key] = {
            'score': score,
            'score2': inv_sigmoid(score),
            'total_eval': len(scores),
            'nontrivial_eval': np.count_nonzero(~np.isnan(scores)),
        }
    stats.pop(False, None)
    return pd.DataFrame(stats).transpose()
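# Usage sketch (file name assumed, matching the TSV written by final_evaluation
# below): group per-query quantiles by relation and inspect per-relation means.
# Note that inv_sigmoid maps the mean quantile onto the logit scale, e.g. a
# mean quantile of 0.5 gives -log(1/0.5 - 1) = 0.
#
#   df = segmented_evaluation('results.tsv')
#   print df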
def run_experiment(self):
    print 'Stochastic Gradient Descent: Examples %d' % len(self.train)
    self.steps = 0
    self.epochs = 0
    while True:
        # reshuffle training data
        train_copy = list(self.train)
        random.shuffle(train_copy)
        # TODO: Figure out minibatches / accumulate gradients
        # TODO: Specific to current dataset format!
        for ex in util.verboserate(train_copy):
            self.model.backprop(ex.sentences, ex.mask, ex.question,
                                ex.answer[0], ex.hints)
            for controller in self.controllers:
                controller.control(self)
            self.track()
            self.steps += 1
        self.epochs += 1
        if self.halt:
            return
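# A minimal controller sketch (illustrative, not from the codebase): the loop
# above only assumes each controller exposes control(experiment) and may set
# experiment.halt. This hypothetical one halts training after a fixed number
# of gradient steps.
class MaxStepsController(object):
    def __init__(self, max_steps):
        self.max_steps = max_steps

    def control(self, experiment):
        if experiment.steps >= self.max_steps:
            experiment.halt = True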
def get_examples(name):
    # data_path, maximum_examples, entities and relations come from the
    # enclosing scope
    filename = join(data_path, name)
    if not isfile(filename):
        print 'Warning:', filename, 'not found. Skipping...'
        return None
    examples_arr = []
    with open(filename, 'r') as f:
        num_examples = 0
        for line in util.verboserate(f):
            if num_examples >= maximum_examples:
                break
            items = line.split()
            s, path, t = items[:3]
            rels = tuple(path.split(','))
            entities.add(s)
            entities.add(t)
            relations.update(rels)
            label = items[3] if len(items) >= 4 else '1'  # if no label, assume positive
            # only add positive examples
            if label == '1':
                examples_arr.append(PathQuery(s, rels, t))
                num_examples += 1
    return examples_arr
def objective_mean(dset):
    # average model objective over a subsample of the dataset
    sample = util.sample_if_large(dset, self.dset_samples)
    vals = []
    for ex in util.verboserate(sample):
        vals.append(experiment.model.objective(ex.sentences, ex.mask,
                                               ex.question, ex.answer[0],
                                               ex.hints))
    return np.mean(vals)
def difference_evaluation(name):
    queries = []
    with open(join(data_directory, name + '.tsv'), 'r') as f:
        for line in util.verboserate(f):
            items = line.split('\t')
            s, r, t = items[0], tuple(items[1].split(',')), items[2]
            q = PathQuery(s, r, t)
            q.aqs = [float(val) for val in items[3].split(',')]
            queries.append(q)

    aq_deltas = defaultdict(list)
    for q in queries:
        aqs = [1.0] + q.aqs
        for i in range(1, len(aqs)):
            r = q.r[i - 1]
            aq, prev_aq = aqs[i], aqs[i - 1]
            if prev_aq == 1.0:
                delta = 1.0  # no ground to gain
            elif prev_aq == 0.0:
                delta = np.nan  # no ground to lose
            else:
                diff = aq - prev_aq
                if diff >= 0:
                    delta = diff / (1.0 - prev_aq)  # portion recovered
                else:
                    delta = diff / prev_aq  # portion lost
            if not np.isnan(delta):
                aq_deltas[r].append(delta)

    return pd.DataFrame({
        'mean(aq_diff)': dict((r, np.nanmean(deltas))
                              for r, deltas in aq_deltas.iteritems())
    })
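# A small worked example of the delta convention above (illustrative numbers,
# not from the codebase): if the average quantile rises from prev_aq = 0.4 to
# aq = 0.7, then delta = (0.7 - 0.4) / (1.0 - 0.4) = 0.5, i.e. half of the
# remaining headroom is recovered; if it falls from 0.4 to 0.2, then
# delta = (0.2 - 0.4) / 0.4 = -0.5, i.e. half of the existing ground is lost.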
def group_queries_by_difficulty(train_graph, full_graph, queries,
                                existence=True, epsilon=5e-1):
    print "Filtering queries contained in train graph"
    easy_queries = []
    hard_queries = []
    for query in util.verboserate(queries):
        if existence:
            # a query is easy if its target is reachable in the train graph
            if isinstance(query, (PathQuery, data.PathQuery)):
                easy = query.t in train_graph.walk_all(query.s, query.r)
            else:
                raise TypeError(type(query))
            if easy:
                easy_queries.append(query)
            else:
                hard_queries.append(query)
        else:
            # a query is easy if the train graph's Monte Carlo estimate of the
            # random-walk probability is close to the true value
            mc_estimates = train_graph.random_walk_probs(query.s, query.r)
            if query.t in mc_estimates:
                approx = mc_estimates[query.t]
            else:
                approx = 0.
            true = full_graph.random_walk_probs(query.s, query.r)[query.t]
            if abs(true - approx) < epsilon:
                easy_queries.append(query)
            else:
                hard_queries.append(query)
    print "Number of easy queries:", len(easy_queries)
    print "Number of hard queries:", len(hard_queries)
    return easy_queries, hard_queries
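# Usage sketch (the dataset attributes here are assumptions): split held-out
# queries by whether the training graph alone can already answer them.
#
#   easy, hard = group_queries_by_difficulty(dset.train_graph,
#                                            dset.full_graph, dset.test)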
def accuracy_mean(dset):
    # fraction of correct predictions over a subsample of the dataset
    sample = util.sample_if_large(dset, self.dset_samples)
    vals = []
    for ex in util.verboserate(sample):
        correct = ex.answer == experiment.model.predict(ex.sentences, ex.mask,
                                                        ex.question)
        vals.append(correct)
    return np.mean(vals)
def compute_best_thresholds(examples, debug=False):
    # per-relation thresholds
    ex_by_rel = util.group(examples, lambda q: q.r[0])
    thresholds = {}
    for relation, examples_r in util.verboserate(ex_by_rel.items()):
        if debug:
            print relation
        scores = [ex.score for ex in examples_r]
        labels = [ex.label for ex in examples_r]
        thresholds[relation] = util.best_threshold(scores, labels, debug)
    return thresholds
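# Usage sketch (hypothetical inputs): examples are assumed to carry a model
# score, a binary label, and a relation path r, as used above. Assuming
# util.best_threshold returns a cutoff above which scores should be predicted
# positive:
#
#   thresholds = compute_best_thresholds(scored_examples)
#   preds = [ex.score >= thresholds[ex.r[0]] for ex in scored_examples]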
def satisfying_pairs(p, graph):
    # all (source, target) pairs connected by the relation path p
    pairs = set()
    sources = graph.type_matching_entities(p, 's')
    for s in util.verboserate(sources):
        if len(p) == 1:
            # single relation: read neighbors directly
            for t in graph.neighbors[s][p[0]]:
                pairs.add((s, t))
        else:
            # longer path: enumerate all reachable targets
            for t in graph.walk_all(s, p):
                pairs.add((s, t))
    return pairs
def stats(pqs):
    # frequency counts of entities, relations, paths, and path lengths
    ents = Counter()
    rels = Counter()
    paths = Counter()
    lengths = Counter()
    for pq in util.verboserate(pqs):
        ents[pq.s] += 1
        ents[pq.t] += 1
        path = pq.r
        paths[path] += 1
        lengths[len(path)] += 1
        for r in path:
            rels[r] += 1
    return ents, rels, paths, lengths
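# Usage sketch: the returned objects are plain collections.Counter instances,
# so the usual inspection methods apply.
#
#   ents, rels, paths, lengths = stats(path_queries)
#   print ents.most_common(10)      # ten most frequent entities
#   print sorted(lengths.items())   # distribution of path lengths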
def get_questions_stats(train_data_file, dev_data_file):
    print '1. Getting the number of blanks'
    blank_str = '_blank_'
    num_blanks_map = defaultdict(int)
    word_freq_train = defaultdict(int)
    with open(train_data_file) as train_file:
        for line in util.verboserate(train_file):
            q_json = json.loads(line.strip())
            q = q_json['sentence']
            num_blanks_map[q.count(blank_str)] += 1
            for word in q.split(' '):
                word_freq_train[word.strip()] += 1
            for a in q_json['answerSubset']:
                word_freq_train[a] += 1  # count answers as occurrences too
    print num_blanks_map
    print '2. Number of word types in the train set: {}'.format(
        len(word_freq_train))

    print '3. Checking overlap with the dev answers'
    dev_answers_present = set()
    dev_answers_oov = set()
    dev_answers = set()
    with open(dev_data_file) as dev_file:
        for line in dev_file:
            dev_json = json.loads(line.strip())
            for a in dev_json['answerSubset']:
                if a in word_freq_train:
                    dev_answers_present.add(a)
                else:
                    dev_answers_oov.add(a)
                dev_answers.add(a)
    print 'Number of unique dev answer strings: {}'.format(len(dev_answers))
    print 'Number of OOV answer strings in the dev set: {}'.format(
        len(dev_answers_oov))
    print 'Number of dev answer strings with at least one occurrence in the train set: {}'.format(
        len(dev_answers_present))
def load_socher_test(test_set_path):
    examples = []
    with open(join(data_directory, test_set_path), 'r') as f:
        for line in util.verboserate(f):
            items = line.split()
            s, r, t, label = (items[0], tuple(items[1].split(',')),
                              items[2], items[3])
            ex = PathQuery(s, r, t)
            if label == '1':
                ex.label = True
            elif label == '-1':
                ex.label = False
            else:
                raise ValueError(label)
            examples.append(ex)
    return examples
def final_evaluation(dataset_path, model_name, params_path, eval_type,
                     eval_samples=float('inf'),
                     max_negative_samples=float('inf'),
                     type_matching_negs=True):
    dset = parse_dataset(dataset_path)
    model = CompositionalModel(None, path_model=model_name, objective='margin')
    params = load_params(params_path, model_name)
    neg_gen = NegativeGenerator(dset.full_graph, max_negative_samples,
                                type_matching_negs=type_matching_negs)
    queries = util.sample_if_large(dset.test, eval_samples, replace=False)

    # Define different evaluation functions
    # ----- ----- ----- ----- -----
    scores = lambda query: model.predict(params, query).ravel()

    def performance(query):
        s, r, t = query.s, query.r, query.t
        negatives = neg_gen(query, 't')
        pos_query = PathQuery(s, r, t)
        neg_query = PathQuery(s, r, negatives)

        # don't score queries with no negatives
        if len(negatives) == 0:
            query.quantile = np.nan
        else:
            query.quantile = util.average_quantile(scores(pos_query),
                                                   scores(neg_query))
        query.num_candidates = len(negatives) + 1

        attributes = (query.s, ','.join(query.r), query.t,
                      str(query.quantile), str(query.num_candidates))
        return '\t'.join(attributes)

    def report(queries):
        # filter out NaNs
        queries = [q for q in queries if not np.isnan(q.quantile)]
        util.metadata('mean_quantile',
                      np.mean([q.quantile for q in queries]))
        util.metadata('h10', np.mean(
            [1.0 if util.rank_from_quantile(q.quantile, q.num_candidates) <= 10
             else 0.0 for q in queries]))

    def average_quantile(s, p):
        negatives, positives = neg_gen(PathQuery(s, p, ''), 't',
                                       return_positives=True)
        pos_query = PathQuery(s, p, positives)
        neg_query = PathQuery(s, p, negatives)
        return util.average_quantile(scores(pos_query), scores(neg_query))

    def intermediate_aqs(query):
        s, path = query.s, query.r
        aqs = []
        for length in 1 + np.arange(len(path)):
            p = path[:length]
            aqs.append(average_quantile(s, p))
        attributes = (query.s, ','.join(query.r), query.t,
                      ','.join(str(aq) for aq in aqs))
        return '\t'.join(attributes)
    # ----- ----- ----- ----- -----

    if eval_type == 'mean_quantile':
        eval_fxn = performance
        eval_report = report
    elif eval_type == 'intermediate_aqs':
        eval_fxn = intermediate_aqs
        eval_report = lambda qs: None
    else:
        raise ValueError(eval_type)

    with open('results.tsv', 'w') as f:
        def progress(steps, elapsed):
            print '{} of {} processed ({} s)'.format(steps, len(queries),
                                                     elapsed)
            util.metadata('steps', steps)
            util.metadata('gb_used', util.gb_used())
            sys.stdout.flush()
            f.flush()

        for query in util.verboserate(queries, report=progress):
            f.write(eval_fxn(query))
            f.write('\n')

    eval_report(queries)

    with open('queries.cpkl', 'w') as f:
        pickle.dump(queries, f)
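# A rough sketch of the statistic computed by util.average_quantile, under the
# assumption that it returns the mean fraction of negative scores that each
# positive score beats (the exact tie-handling lives in util and is not shown
# here; relies on this module's existing `import numpy as np`):
def _average_quantile_sketch(pos_scores, neg_scores):
    neg = np.asarray(neg_scores, dtype=float)
    if len(neg) == 0:
        return np.nan
    return np.mean([np.mean(neg < p) for p in pos_scores])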
def mean_rank(self, maximizer, dset):
    # average rank over a subsample of the dataset (NaN ranks are ignored)
    sample = util.sample_if_large(dset, self.eval_samples)
    ranks = [self.rank(maximizer, ex) for ex in util.verboserate(sample)]
    return np.nanmean(ranks)
def accuracy_mean(dset):
    # fraction of correct predictions over the full dataset (no subsampling,
    # unlike the sampled variant above)
    vals = []
    for ex in util.verboserate(dset):
        correct = ex.answer == experiment.model.predict(ex.sentences, ex.mask,
                                                        ex.question)
        vals.append(correct)
    return np.mean(vals)
def sample_paths(graph, num_paths, max_path_length):
    paths = []
    for _ in util.verboserate(range(num_paths)):
        length = random.randint(2, max_path_length)  # don't include length-1 paths
        paths.append(graph.random_path_query(length))
    return paths
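# Usage sketch (numbers and the graph attribute are hypothetical): draw 10000
# random path queries of lengths 2 through 5.
#
#   paths = sample_paths(dset.train_graph, 10000, max_path_length=5)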
def objective_mean(dset):
    # average objective value over a subsample of the dataset
    sample = util.sample_if_large(dset, self.dset_samples)
    vals = [maximizer.objective.value(maximizer.params, ex)
            for ex in util.verboserate(sample)]
    return np.mean(vals)