import logging
import mmap

import numpy as np

# Project-local module (assumed available on the path); provides
# remove_freebase_ns, which strips the Freebase namespace prefix.
import globals

# Module logger; the original presumably configures logging elsewhere.
logger = logging.getLogger(__name__)


def normalize_freebase_output(text):
    """Remove starting and ending quotes and the namespace prefix.

    :param text:
    :return:
    """
    if len(text) > 1 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return globals.remove_freebase_ns(text)
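
# A minimal usage sketch (not in the original source). The exact result
# depends on globals.remove_freebase_ns; the identifiers below are
# hypothetical, assuming that helper strips a namespace prefix such as
# "fb:":
#
#   normalize_freebase_output('"fb:m.0abc"')   # quotes and prefix removed
#   normalize_freebase_output('some literal')  # returned unchanged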
def build_index(self, index_file_prefix, facts_file):
    logger.info("Building new mediator index.")
    num_lines = 0
    entity_postings = {}
    # First pass: collect the vocabulary of all distinct identifiers.
    logger.info("Building vocabulary.")
    vocab_words_set = set()
    with open(facts_file, 'r') as f:
        mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
        line = mm.readline()
        while line:
            cols = line.strip().split('\t')
            if len(cols) != 4:
                logger.warn("Invalid line: %s" % line)
                line = mm.readline()
                num_lines += 1
                continue
            cols = [globals.remove_freebase_ns(x) for x in cols]
            vocab_words_set.update(cols)
            line = mm.readline()
            num_lines += 1
            if num_lines % 2000000 == 0:
                logger.info("Processed %s lines." % num_lines)
    vocabulary_words = sorted(vocab_words_set)
    # Map each word to its position in the sorted list. This is only
    # for fast reading.
    vocabulary = dict()
    for i, word in enumerate(vocabulary_words):
        vocabulary[word] = i
    # Second pass, this time with the vocabulary: build a posting list
    # of (mediator_id, relation_id) pairs for each value entity.
    logger.info("Building index.")
    num_lines = 0
    with open(facts_file, 'r') as f:
        mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
        line = mm.readline()
        while line:
            cols = line.strip().split('\t')
            if len(cols) != 4:
                logger.warn("Invalid line: %s" % line)
                line = mm.readline()
                num_lines += 1
                continue
            cols = [globals.remove_freebase_ns(x) for x in cols]
            value_id = vocabulary[cols[0]]
            relation_id = vocabulary[cols[1]]
            mediator_id = vocabulary[cols[3]]
            if value_id not in entity_postings:
                entity_postings[value_id] = []
            entity_postings[value_id].append((mediator_id, relation_id))
            line = mm.readline()
            num_lines += 1
            if num_lines % 2000000 == 0:
                logger.info("Processed %s lines." % num_lines)
    logger.info("Sorting postings...")
    for k, v in entity_postings.iteritems():
        a = sorted(v)
        # Flatten the tuples so each posting list becomes
        # [mediator_id, relation_id, mediator_id, relation_id, ...].
        a = [x for y in a for x in y]
        entity_postings[k] = np.array(a, dtype=np.uint32)
    # Note: lengths are counted in uint32 entries here, i.e. two per
    # (mediator, relation) posting.
    total_postings = sum([len(x) for _, x in entity_postings.iteritems()])
    logger.info("Number of posting lists: %s " % len(entity_postings))
    logger.info("Avg. posting list length: %s "
                % (total_postings / float(len(entity_postings))))
    logger.info("Writing index.")
    index_handle, offsets, sizes = write_index(index_file_prefix,
                                               vocabulary_words,
                                               entity_postings)
    self.vocabulary_words = vocabulary_words
    self.index = index_handle
    self.offsets = offsets
    self.sizes = sizes
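
# A minimal read-side sketch (not in the original source) showing how a
# flattened posting list built above can be decoded again. It assumes
# only the layout established in build_index: a np.uint32 array of the
# form [mediator_id, relation_id, mediator_id, relation_id, ...], sorted
# by mediator id. The name decode_postings is hypothetical; the actual
# on-disk format written by write_index is not shown here.
def decode_postings(flat_postings):
    """Return the (mediator_id, relation_id) pairs of a flat posting list.

    >>> decode_postings(np.array([7, 2, 9, 4], dtype=np.uint32))
    [(7, 2), (9, 4)]
    """
    # Pair up consecutive entries: even indices hold mediator ids, odd
    # indices hold the relation ids they occur with.
    return [(int(flat_postings[i]), int(flat_postings[i + 1]))
            for i in range(0, len(flat_postings), 2)]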