def get_wikipedia_link_graph_sparse_csc(self):
    filename = FileLocations.get_dropbox_wikipedia_path() + \
        'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
    self.logger.info('Loading %s', filename)
    with open(filename, 'rb') as handle:
        self.wikipedia_link_graph_sparse_csc = pickle.load(handle)
    self.logger.info('Loaded %s', filename)
    return self.wikipedia_link_graph_sparse_csc
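# Usage sketch (hypothetical instance name `ds`; assumes the pickled file
# exists): CSC storage makes column slicing cheap, which corresponds to
# finding a page's in-links:
#
#     csc = ds.get_wikipedia_link_graph_sparse_csc()
#     in_link_ids = csc.getcol(page_id).nonzero()[0]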
def load_wikititle_id_by_id(self, filename=None):
    # Lazily load the WikiTitle-id-by-id lookup; no-op if already loaded.
    if self.wikititle_id_by_id is None:
        if filename is None:
            filename = FileLocations.get_dropbox_wikipedia_path() + \
                'wikititle_id_by_id.case_insensitive.15910478.pickle'
        self.logger.info('Loading %s', filename)
        with open(filename, 'rb') as handle:
            self.wikititle_id_by_id = pickle.load(handle)
        self.logger.info('Loaded %s', filename)
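# Usage sketch (hypothetical): after loading, `wikititle_id_by_id` is
# presumably the lookup behind `get_wikititle_id_from_id()` used below; the
# exact value structure depends on how the pickle was built, e.g.
#
#     ds.load_wikititle_id_by_id()
#     canonical_id = ds.get_wikititle_id_from_id(page_id)[0][0]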
def load_wikipeadia_link_graph(self, link_graph_filename=None):
    # Lazily load the COO-format link graph; no-op if already loaded or if
    # the pickle file is missing.
    if self.wikipedia_link_graph_sparse is None:
        if link_graph_filename is None:
            link_graph_filename = FileLocations.get_dropbox_wikipedia_path() + \
                'wikipedia_link_graph_sparse.deduped.15910478.pickle'
        if os.path.isfile(link_graph_filename):
            self.logger.info('loading wikipedia_link_graph_sparse from %s',
                             link_graph_filename)
            with open(link_graph_filename, 'rb') as handle:
                self.wikipedia_link_graph_sparse = pickle.load(handle)
            self.logger.info('loaded')
def convert_link_graph_to_csr_and_csc(self):
    # Convert the COO link graph to CSR (fast row slicing: out-links) and
    # CSC (fast column slicing: in-links) and pickle both.
    self.load_wikipeadia_link_graph()
    self.logger.info('converting to csr')
    csr = self.wikipedia_link_graph_sparse.tocsr()
    self.logger.info('converting to csc')
    csc = self.wikipedia_link_graph_sparse.tocsc()
    output_filename = FileLocations.get_dropbox_wikipedia_path() + \
        'wikipedia_link_graph_sparse_csr.deduped.15910478.pickle'
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(csr, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
    output_filename = FileLocations.get_dropbox_wikipedia_path() + \
        'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(csc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
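# Sketch of why both formats are kept (hypothetical page ids): CSR slices
# rows cheaply and CSC slices columns cheaply, so out-links come from the
# CSR pickle and in-links from the CSC pickle:
#
#     out_link_ids = csr.getrow(page_id).nonzero()[1]  # pages page_id links to
#     in_link_ids = csc.getcol(page_id).nonzero()[0]   # pages linking to page_id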
def extract_graph_from_compressed(self, wikititle_to_id_filename=None):
    self.logger.warning('running extract_graph_from_compressed().')
    self.logger.warning("[this takes about 2hr 20 min on Dwane's home machine]")
    input_file = gzip.open(FileLocations.get_dropbox_wikipedia_path() +
                           'wikipedia-dump.json.gz', 'rt', encoding='utf-8')
    if wikititle_to_id_filename is not None:
        fn = wikititle_to_id_filename
    else:
        fn = FileLocations.get_dropbox_wikipedia_path() + \
            'wikititle_marisa_trie.case_insensitive.15910478.pickle'
    self.logger.warning('%s needs to be complete for these results to make most sense', fn)
    self.get_wikititle_case_insensitive_marisa_trie()
    count = 0
    line = '{}'
    from_list = []
    to_list = []
    value_list = []
    max_id = 0
    while count < 25000000 and line is not None and line != '':
        count += 1
        early_log = count <= 50000 and count % 10000 == 0
        late_log = count > 50000 and count % 1000000 == 0
        if early_log or late_log:
            self.logger.info('%d lines processed', count)
            # Checkpoint: persist the partial link graph so a crash does not
            # lose hours of work.
            output_filename = FileLocations.get_temp_path() + \
                'wikipedia_link_graph_sparse.deduped.' + str(count) + '.pickle'
            self.logger.info('saving file %s', output_filename)
            row = np.array(from_list)
            col = np.array(to_list)
            data = np.array(value_list)
            mtx = sparse.coo_matrix((data, (row, col)),
                                    shape=(max_id + 1, max_id + 1))
            self.logger.info('About to write %s', output_filename)
            with open(output_filename, 'wb') as handle:
                pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
            self.logger.info('file written = %s', output_filename)
        line = input_file.readline()
        if line != '':
            try:
                data = json.loads(line)
            except json.decoder.JSONDecodeError as e:
                self.logger.warning('error decoding json: json = %s, error = %s',
                                    line, str(e))
                break
            if 'links' in data:
                fid = data['wid']
                # Map the page id to its canonical (deduplicated) id.
                if self.get_wikititle_id_from_id(fid)[0][0] != fid:
                    self.logger.info('%s -> %s', fid,
                                     self.get_wikititle_id_from_id(fid)[0][0])
                    fid = self.get_wikititle_id_from_id(fid)[0][0]
                if fid > max_id:
                    max_id = fid
                for link in data['links']:
                    # link['id'] is not numeric: it is an underscore-separated
                    # string matching a WikiTitle.
                    link_name = link['id']
                    if link_name in self.wikititle_marisa_trie:
                        link_list = self.wikititle_marisa_trie[link_name]
                        link_cid = link_list[0][0]
                        if link_cid > max_id:
                            max_id = link_cid
                        # link['type'] is ignored; we assume link type does
                        # not matter here.
                        from_list.append(fid)
                        to_list.append(link_cid)
                        value_list.append(1)
    self.logger.info('%d lines processed', count)
    output_filename = FileLocations.get_temp_path() + \
        'wikipedia_link_graph_sparse.deduped.' + str(count) + '.pickle'
    self.logger.info('saving file %s', output_filename)
    row = np.array(from_list)
    col = np.array(to_list)
    data = np.array(value_list)
    # Shape must be max_id + 1 so the largest id is a valid index, matching
    # the checkpoint saves above.
    mtx = sparse.coo_matrix((data, (row, col)), shape=(max_id + 1, max_id + 1))
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
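# Expected shape of one line of wikipedia-dump.json.gz, inferred from the
# fields read above ('wid', 'links', and each link's 'id'/'type'); the field
# values here are illustrative only:
#
#     {"wid": 12, "links": [{"id": "Some_Page_Title", "type": "body"}, ...]}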
def get_dexter_dataset(self, path=None, filename='short.json'):
    if path is None:
        path = FileLocations.get_dropbox_wikipedia_path()
    with open(path + filename) as f:
        content = f.readlines()
    return content
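# Usage sketch (hypothetical): each returned line holds one JSON document,
# so callers typically parse line by line:
#
#     docs = [json.loads(line) for line in ds.get_dexter_dataset()]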