def form_knowledge_base(self, id_vecs=True, unitary=False):
    """Build the vector knowledge base from ``self.corpus_dict``.

    Creates three structures (Python 2 code; HRR is the project's
    holographic-reduced-representation vector class):

    * ``self.relation_type_vectors`` -- one random HRR per symbol in
      ``self.relation_symbols`` (made unitary if ``unitary`` is True).
    * ``self.id_vectors`` -- if ``id_vecs`` is True, an independent random
      HRR per corpus key; otherwise it is the SAME OrderedDict object as
      ``self.semantic_pointers`` (an alias, not a copy).
    * ``self.semantic_pointers`` -- for each key, the sum of
      ``id_vector * relation_type_vector`` over that key's relations,
      plus ``self.sp_noise`` random noise HRRs, optionally normalized.

    When ``id_vecs`` is False the keys must be processed in dependency
    order, because each key's semantic pointer is built from the
    (aliased) pointers of the keys it refers to; a topological-sort-style
    loop computes that order and raises if the dependency graph cannot be
    fully resolved (e.g. contains a cycle).

    All HRRs are converted to bare numpy arrays (``.v``) before returning.

    :param id_vecs: use independent random ID vectors (True) or alias
        ID vectors to the semantic pointers themselves (False).
    :param unitary: make the relation-type vectors unitary HRRs.
    :raises Exception: if no corpus has been loaded, or dependency
        resolution fails when ``id_vecs`` is False.
    """
    # Check existence of corpus
    if self.corpus_dict is None:
        raise Exception("Attempted to form the knowledge "
                        "base without a corpus.")

    print "Number of items in knowledge base:", len(self.corpus_dict)

    if not id_vecs:
        # Corpus must be pre-processed (presumably pruned/closed over
        # relations) before dependency ordering -- TODO confirm what
        # processCorpus does; it is defined elsewhere in this class.
        print "Processing Corpus"
        self.processCorpus()

    print "Generating relation type vectors"
    print "Using relation types: ", self.relation_symbols

    # One random HRR per relation symbol.
    self.relation_type_vectors = {symbol: HRR(self.dimension)
                                  for symbol in self.relation_symbols}
    if unitary:
        # Unitary vectors preserve norm under circular convolution.
        for k, h in self.relation_type_vectors.iteritems():
            h.make_unitary()

    if id_vecs:
        # Order is irrelevant when ID vectors are independent.
        key_order = self.corpus_dict.keys()
    else:
        # Order words by the dependencies of their definitions
        # Only have to do this if we're not using ID-vectors
        key_order = []
        resolved = set(self.relation_symbols)

        # dependencies[key] = set of target keys this key's relations
        # point at (only relations whose tag is a known symbol count).
        dependencies = {}
        for key in self.corpus_dict.keys():
            dependencies[key] = set(
                [tag[1] for tag in self.corpus_dict[key]
                 if tag[0] in self.relation_symbols])

        # NOTE(review): only corpus keys are ever appended to key_order,
        # so this bound (corpus size + relation-symbol count) is never
        # reached; the loop actually terminates via the empty-resolvable
        # break below -- confirm whether the bound is intentional.
        while len(key_order) < (len(self.corpus_dict)
                                + len(self.relation_symbols)):
            resolvable = set()

            # A key is resolvable once everything it depends on is.
            for key in dependencies:
                if dependencies[key].issubset(resolved):
                    resolvable.add(key)

            # add the resolvable keys to the order list and resolved set
            key_order.extend(resolvable)
            resolved = resolved.union(resolvable)

            # remove resolved tags from the dependency dictionary
            for r in resolvable:
                del dependencies[r]

            # if no items are resolvable, we're stuck
            if len(resolvable) == 0:
                break

        del resolved
        del resolvable

        # Anything left unordered means a cycle or missing definition.
        if len(key_order) < len(self.corpus_dict):
            raise Exception("Dependency resolution failed.")

    self.semantic_pointers = collections.OrderedDict()

    print "Generating ID-vectors"

    if id_vecs:
        self.id_vectors = collections.OrderedDict()
        for key in key_order:
            self.id_vectors[key] = HRR(self.dimension)
    else:
        # Alias: ID vectors ARE the semantic pointers; the dependency
        # ordering above guarantees each lookup below finds a pointer
        # that has already been built.
        self.id_vectors = self.semantic_pointers

    print "Generating HRR vectors"
    for key in key_order:
        # Keep only relations whose tag is a known relation symbol.
        relations = filter(
            lambda x: x[0] in self.relation_symbols,
            self.corpus_dict[key])

        if len(relations) == 0:
            # No structure to encode: fall back to a random vector.
            self.semantic_pointers[key] = HRR(self.dimension)
            continue

        semantic_pointer = HRR(data=np.zeros(self.dimension))

        # Optional noise terms added before the structured bindings.
        for n in range(self.sp_noise):
            semantic_pointer += HRR(self.dimension)

        # Superpose one (id * relation-type) binding per relation.
        for relation in relations:
            id_vector = self.id_vectors[relation[1]]
            relation_type_vector = self.relation_type_vectors[relation[0]]
            pair = id_vector * relation_type_vector
            semantic_pointer += pair

        if self.normalize:
            semantic_pointer.normalize()

        self.semantic_pointers[key] = semantic_pointer

    # convert all vectors from hrrs to numpy ndarrays
    for k in key_order:
        h = self.semantic_pointers[k]
        self.semantic_pointers[k] = h.v

    # When id_vecs is False, id_vectors aliases semantic_pointers and
    # was already converted by the loop above.
    if id_vecs:
        for k in key_order:
            h = self.id_vectors[k]
            self.id_vectors[k] = h.v

    for k in self.relation_type_vectors:
        h = self.relation_type_vectors[k]
        self.relation_type_vectors[k] = h.v
def run(self):
    """Run the sentence test for ``self.num_trials`` trials (Python 2).

    Each trial generates a random sentence (a dict mapping role tuples
    to synset keys), encodes it as a single HRR by superposing
    ``role_binding * id_vector`` pairs, then queries every role binding
    back out with ``self.test_link`` and scores the answers per depth
    (depth = length of the role tuple; embedding a second sentence via
    ``self.deep`` produces depth-2 roles).

    Side effects: writes a per-trial transcript to ``self.output_file``,
    prints a summary per depth, and records ``sentence_score_<depth>``
    via ``self.add_data``.  Also sets ``self.current_start_key``,
    ``self.current_target_keys`` and ``self.current_num_relations``
    before each query -- presumably read by ``test_link``; confirm
    against that method's definition.
    """
    # Infer vector dimension from any stored ID vector.
    self.dimension = len(self.id_vectors.values()[0])

    self.role_hrrs = self.create_role_hrrs()
    self.pos_map = self.create_pos_map()

    # Accumulated per-depth percent-correct across trials.
    score = defaultdict(float)

    for i in range(self.num_trials):
        title = "New Sentence Test"
        if self.deep:
            title += "- Deep"

        tools.print_header(self.output_file, title)

        sentence = self.generate_sentence()

        if self.deep:
            # Replace one randomly chosen role with a whole embedded
            # sentence: its roles are concatenated onto the embedding
            # role, yielding longer (deeper) role tuples.
            embed = self.rng.sample(sentence.keys(), 1)[0]
            embedded_sentence = self.generate_sentence()
            del sentence[embed]
            for role in embedded_sentence.keys():
                sentence[embed + role] = embedded_sentence[role]

        tag_vectors = {}
        sentence_hrr = HRR(data=np.zeros(self.dimension))

        # Pick role-fillers and create HRR representing the sentence
        # Also store the hrr to use as the query to extract each synset
        # included in the sentence.
        for role in sentence:
            # Bind the role's component HRRs together (convolution),
            # then bind the result with the filler's ID vector.
            tag_hrr = [self.role_hrrs[x] for x in role]
            tag_hrr = reduce(lambda x, y: x * y, tag_hrr)

            synset = sentence[role]
            sentence_hrr += tag_hrr * HRR(data=self.id_vectors[synset])

            tag_vectors[role] = tag_hrr.v

        sentence_hrr.normalize()

        sentence_vector = sentence_hrr.v

        print >> self.output_file, "Roles in sentence:"
        print >> self.output_file, sentence

        # ask about parts of the sentence
        sentence_score = defaultdict(float)
        sentence_length = defaultdict(float)

        for role in sentence.keys():
            answer = sentence[role]

            # State consumed elsewhere (presumably by test_link).
            self.current_start_key = None
            self.current_target_keys = [answer]
            self.current_num_relations = len(sentence)

            print >> self.output_file, "\nTesting ", role

            result, correct, valid, exact = self.test_link(
                tag_vectors[role], sentence_vector, None, answer,
                output_file=self.output_file, return_vec=False,
                num_relations=len(sentence), answers=[answer])

            # Depth of this binding = number of role components.
            depth = len(role)

            if correct:
                sentence_score[depth] += 1
                print >> self.output_file, "Correct."
            else:
                print >> self.output_file, "Incorrect."

            sentence_length[depth] += 1

            # Short mode: query only one role per sentence.
            if self.short:
                break

        for d in sentence_length:
            # Both operands are floats (defaultdict(float)), so this is
            # true division even under Python 2.
            sentence_percent = sentence_score[d] / sentence_length[d]

            print >> self.output_file, \
                "Percent correct for current sentence at depth %d: %f" \
                % (d, sentence_percent)

            score[d] = score[d] + sentence_percent

    for d in score:
        print "Sentence test score at depth %d: %f out of %d" \
            % (d, score[d], self.num_trials)

        percent = score[d] / self.num_trials

        title = "Sentence Test Summary - Depth = %d" % d
        tools.print_header(self.output_file, title)
        print >> self.output_file, "Correct: ", score[d]
        print >> self.output_file, "Total: ", self.num_trials
        print >> self.output_file, "Percent: ", percent
        tools.print_footer(self.output_file, title)

        self.add_data("sentence_score_%d" % d, percent)
def form_knowledge_base(self, id_vecs=True, unitary=False):
    """Build the vector knowledge base from ``self.corpus_dict``.

    NOTE(review): this is a token-for-token duplicate of the
    ``form_knowledge_base`` defined earlier in this file (only the line
    wrapping differs) -- confirm whether both copies are needed or one
    should be removed / delegated to the other.

    Creates relation-type HRRs, ID vectors, and semantic pointers
    (superpositions of ``id_vector * relation_type_vector`` bindings,
    plus ``self.sp_noise`` noise terms).  When ``id_vecs`` is False,
    ``self.id_vectors`` is an alias of ``self.semantic_pointers`` and
    keys are processed in dependency order so each pointer only consumes
    already-built pointers.  All HRRs are converted to numpy arrays
    (``.v``) at the end.  Python 2 code.

    :param id_vecs: independent random ID vectors (True) or alias ID
        vectors to the semantic pointers (False).
    :param unitary: make relation-type vectors unitary HRRs.
    :raises Exception: if no corpus is loaded, or dependency resolution
        fails when ``id_vecs`` is False.
    """
    # Check existence of corpus
    if self.corpus_dict is None:
        raise Exception("Attempted to form the knowledge "
                        "base without a corpus.")

    print "Number of items in knowledge base:", len(self.corpus_dict)

    if not id_vecs:
        print "Processing Corpus"
        self.processCorpus()

    print "Generating relation type vectors"
    print "Using relation types: ", self.relation_symbols

    # One random HRR per relation symbol.
    self.relation_type_vectors = {
        symbol: HRR(self.dimension)
        for symbol in self.relation_symbols
    }
    if unitary:
        # Unitary vectors preserve norm under circular convolution.
        for k, h in self.relation_type_vectors.iteritems():
            h.make_unitary()

    if id_vecs:
        # Order is irrelevant when ID vectors are independent.
        key_order = self.corpus_dict.keys()
    else:
        # Order words by the dependencies of their definitions
        # Only have to do this if we're not using ID-vectors
        key_order = []
        resolved = set(self.relation_symbols)

        # dependencies[key] = set of target keys this key's relations
        # point at (only relations with a known symbol count).
        dependencies = {}
        for key in self.corpus_dict.keys():
            dependencies[key] = set([
                tag[1] for tag in self.corpus_dict[key]
                if tag[0] in self.relation_symbols
            ])

        # NOTE(review): only corpus keys are appended to key_order, so
        # this bound is never reached; the loop exits via the
        # empty-resolvable break below -- confirm intent.
        while len(key_order) < (len(self.corpus_dict)
                                + len(self.relation_symbols)):
            resolvable = set()

            # A key is resolvable once everything it depends on is.
            for key in dependencies:
                if dependencies[key].issubset(resolved):
                    resolvable.add(key)

            # add the resolvable keys to the order list and resolved set
            key_order.extend(resolvable)
            resolved = resolved.union(resolvable)

            # remove resolved tags from the dependency dictionary
            for r in resolvable:
                del dependencies[r]

            # if no items are resolvable, we're stuck
            if len(resolvable) == 0:
                break

        del resolved
        del resolvable

        # Anything left unordered means a cycle or missing definition.
        if len(key_order) < len(self.corpus_dict):
            raise Exception("Dependency resolution failed.")

    self.semantic_pointers = collections.OrderedDict()

    print "Generating ID-vectors"

    if id_vecs:
        self.id_vectors = collections.OrderedDict()
        for key in key_order:
            self.id_vectors[key] = HRR(self.dimension)
    else:
        # Alias: ID vectors ARE the semantic pointers; dependency order
        # guarantees each lookup below finds an already-built pointer.
        self.id_vectors = self.semantic_pointers

    print "Generating HRR vectors"
    for key in key_order:
        # Keep only relations whose tag is a known relation symbol.
        relations = filter(lambda x: x[0] in self.relation_symbols,
                           self.corpus_dict[key])

        if len(relations) == 0:
            # No structure to encode: fall back to a random vector.
            self.semantic_pointers[key] = HRR(self.dimension)
            continue

        semantic_pointer = HRR(data=np.zeros(self.dimension))

        # Optional noise terms added before the structured bindings.
        for n in range(self.sp_noise):
            semantic_pointer += HRR(self.dimension)

        # Superpose one (id * relation-type) binding per relation.
        for relation in relations:
            id_vector = self.id_vectors[relation[1]]
            relation_type_vector = self.relation_type_vectors[relation[0]]
            pair = id_vector * relation_type_vector
            semantic_pointer += pair

        if self.normalize:
            semantic_pointer.normalize()

        self.semantic_pointers[key] = semantic_pointer

    # convert all vectors from hrrs to numpy ndarrays
    for k in key_order:
        h = self.semantic_pointers[k]
        self.semantic_pointers[k] = h.v

    # When id_vecs is False, id_vectors aliases semantic_pointers and
    # was already converted by the loop above.
    if id_vecs:
        for k in key_order:
            h = self.id_vectors[k]
            self.id_vectors[k] = h.v

    for k in self.relation_type_vectors:
        h = self.relation_type_vectors[k]
        self.relation_type_vectors[k] = h.v