Example 1
    def get_wikipedia_link_graph_sparse_csc(self):
        # Load the pre-built CSC sparse matrix of the Wikipedia link graph
        # from its pickle file and cache it on the instance.
        filename = (FileLocations.get_dropbox_wikipedia_path()
                    + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle')
        self.logger.info('Loading %s', filename)
        with open(filename, 'rb') as handle:
            self.wikipedia_link_graph_sparse_csc = pickle.load(handle)
        self.logger.info('Loaded %s', filename)
        return self.wikipedia_link_graph_sparse_csc
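
CSC storage makes column slicing cheap, which is exactly what an in-link lookup needs. A self-contained sketch of that access pattern on a toy matrix (the ids and values are made up; in the real code the matrix comes from get_wikipedia_link_graph_sparse_csc()):

    import numpy as np
    from scipy import sparse

    # toy 3x3 link graph: an entry at (i, j) means page i links to page j
    row = np.array([0, 2])
    col = np.array([1, 1])
    vals = np.array([1, 1])
    csc = sparse.coo_matrix((vals, (row, col)), shape=(3, 3)).tocsc()

    page_id = 1
    in_links = csc[:, page_id].nonzero()[0]  # pages linking to page_id
    print(in_links)  # [0 2]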
Example 2
    def load_wikititle_id_by_id(self, filename=None):
        # Lazily load the wikititle-id-by-id mapping: deserialize the pickle
        # only on the first call, then reuse the cached attribute.
        if self.wikititle_id_by_id is None:
            if filename is None:
                filename = (FileLocations.get_dropbox_wikipedia_path()
                            + 'wikititle_id_by_id.case_insensitive.15910478.pickle')
            self.logger.info('Loading %s', filename)
            with open(filename, 'rb') as handle:
                self.wikititle_id_by_id = pickle.load(handle)
            self.logger.info('Loaded %s', filename)
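
The None check makes the load idempotent: the pickle is deserialized once and then cached on the instance. The same idiom in a self-contained form (the class and attribute names here are illustrative, not from the original):

    import pickle

    class LazyPickle:
        def __init__(self, filename):
            self.filename = filename
            self._obj = None

        def get(self):
            if self._obj is None:  # only touch disk on the first call
                with open(self.filename, 'rb') as handle:
                    self._obj = pickle.load(handle)
            return self._obj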
Example 3
    def load_wikipeadia_link_graph(self, link_graph_filename=None):
        # Lazily load the COO sparse link graph. Note the os.path.isfile()
        # guard: if the pickle is absent the method silently does nothing.
        if self.wikipedia_link_graph_sparse is None:
            if link_graph_filename is None:
                link_graph_filename = (
                    FileLocations.get_dropbox_wikipedia_path()
                    + 'wikipedia_link_graph_sparse.deduped.15910478.pickle')
            if os.path.isfile(link_graph_filename):
                self.logger.info('loading wikipedia_link_graph_sparse from %s',
                                 link_graph_filename)
                with open(link_graph_filename, 'rb') as handle:
                    self.wikipedia_link_graph_sparse = pickle.load(handle)
                self.logger.info('loaded')
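
Unlike Example 2, this loader checks os.path.isfile() first and silently skips the load when the pickle is absent. One way to make that skip visible, sketched as a standalone helper (the function name and logging choice are assumptions, not the original's API):

    import logging
    import os
    import pickle

    def load_pickle_if_present(filename):
        # Return the unpickled object, or None (with a warning) if missing.
        if not os.path.isfile(filename):
            logging.getLogger(__name__).warning('%s not found, skipping load',
                                                filename)
            return None
        with open(filename, 'rb') as handle:
            return pickle.load(handle)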
Example 4
    def convert_link_graph_to_csr_and_csc(self):
        # Convert the cached COO link graph to CSR (fast row slicing) and
        # CSC (fast column slicing), and pickle both alongside the original.
        self.load_wikipeadia_link_graph()
        self.logger.info('converting to csr')
        csr = self.wikipedia_link_graph_sparse.tocsr()
        self.logger.info('converting to csc')
        csc = self.wikipedia_link_graph_sparse.tocsc()

        output_filename = (FileLocations.get_dropbox_wikipedia_path()
                           + 'wikipedia_link_graph_sparse_csr.deduped.15910478.pickle')
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(csr, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)

        output_filename = (FileLocations.get_dropbox_wikipedia_path()
                           + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle')
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(csc, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)
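
Keeping both formats trades disk space for query speed: CSR answers "which pages does X link to" with a cheap row slice, while CSC answers "which pages link to X" with a cheap column slice. A toy sketch of the row-slice side (the matrix is made up):

    import numpy as np
    from scipy import sparse

    row = np.array([0, 0, 1])
    col = np.array([1, 2, 2])
    vals = np.array([1, 1, 1])
    csr = sparse.coo_matrix((vals, (row, col)), shape=(3, 3)).tocsr()

    out_links = csr[0].nonzero()[1]  # pages that page 0 links to
    print(out_links)  # [1 2]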
Example 5
    def extract_graph_from_compressed(self, wikititle_to_id_filename=None):
        # Stream the compressed Wikipedia dump line by line and accumulate
        # the link graph as COO triplets (from_list, to_list, value_list).
        self.logger.warning('running extract_graph_from_compressed().')
        self.logger.warning(
            '[this takes about 2hr 20 min on Dwane\'s home machine]')
        input_file = gzip.open(FileLocations.get_dropbox_wikipedia_path() +
                               'wikipedia-dump.json.gz',
                               'rt',
                               encoding='utf-8')
        if wikititle_to_id_filename is not None:
            fn = wikititle_to_id_filename
        else:
            fn = (FileLocations.get_dropbox_wikipedia_path()
                  + 'wikititle_marisa_trie.case_insensitive.15910478.pickle')
        self.logger.warning(
            '%s needs to be complete for these results to make sense', fn)
        self.get_wikititle_case_insensitive_marisa_trie()

        count = 0
        line = '{}'  # non-empty sentinel so the loop runs at least once
        from_list = []   # COO row indices (source page ids)
        to_list = []     # COO column indices (target page ids)
        value_list = []  # edge weights, all 1
        max_id = 0       # largest page id seen, used to size the matrix

        while count < 25000000 and line != '':  # readline() returns '' at EOF
            count += 1
            # Checkpoint often for the first 50k lines, then every million.
            early_log = count <= 50000 and count % 10000 == 0
            late_log = count > 50000 and count % 1000000 == 0
            if early_log or late_log:
                self.logger.info('%d lines processed', count)
                output_filename = (FileLocations.get_temp_path()
                                   + 'wikipedia_link_graph_sparse.deduped.'
                                   + str(count) + '.pickle')
                self.logger.info('saving file %s', output_filename)
                row = np.array(from_list)
                col = np.array(to_list)
                vals = np.array(value_list)
                mtx = sparse.coo_matrix((vals, (row, col)),
                                        shape=(max_id + 1, max_id + 1))
                self.logger.info('About to write %s', output_filename)
                with open(output_filename, 'wb') as handle:
                    pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
                self.logger.info('file written = %s', output_filename)

            line = input_file.readline()
            if line != '':
                try:
                    page = json.loads(line)
                except json.decoder.JSONDecodeError as e:
                    self.logger.warning(
                        'error decoding json: json = %s, error = %s',
                        line, str(e))
                    break
                if 'links' in page:
                    fid = page['wid']
                    # Canonicalise fid via the wikititle mapping so redirects
                    # and case variants collapse to a single id.
                    canonical_id = self.get_wikititle_id_from_id(fid)[0][0]
                    if canonical_id != fid:
                        self.logger.info('%s -> %s ', fid, canonical_id)
                        fid = canonical_id

                    if fid > max_id:
                        max_id = fid
                    for link in page['links']:
                        # link['id'] is not numeric; it has underscores and
                        # matches the WikiTitle format
                        link_name = link['id']

                        if link_name in self.wikititle_marisa_trie:
                            link_list = self.wikititle_marisa_trie[link_name]
                            link_cid = link_list[0][0]

                            if link_cid > max_id:
                                max_id = link_cid

                            # link['type'] is ignored; only the link's presence matters
                            from_list.append(fid)
                            to_list.append(link_cid)
                            value_list.append(1)

        input_file.close()
        self.logger.info('%d lines processed', count)
        output_filename = (FileLocations.get_temp_path()
                           + 'wikipedia_link_graph_sparse.deduped.'
                           + str(count) + '.pickle')
        self.logger.info('saving file %s', output_filename)
        row = np.array(from_list)
        col = np.array(to_list)
        vals = np.array(value_list)
        # shape must be max_id + 1 on each axis so that index max_id is
        # valid, matching the interim checkpoints above.
        mtx = sparse.coo_matrix((vals, (row, col)),
                                shape=(max_id + 1, max_id + 1))
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)
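
Appending 1 to value_list for every link means duplicate (from, to) pairs can occur. SciPy's COO format sums duplicate entries when the matrix is converted to CSR/CSC, so repeated links simply accumulate as edge weights rather than causing an error. A small demonstration:

    import numpy as np
    from scipy import sparse

    row = np.array([0, 0])
    col = np.array([1, 1])
    vals = np.array([1, 1])
    mtx = sparse.coo_matrix((vals, (row, col)), shape=(2, 2))
    print(mtx.tocsr()[0, 1])  # 2: the duplicate entries were summed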
Example 6
    def get_dexter_dataset(self, path=None, filename='short.json'):
        # Read the Dexter dataset file and return its raw lines;
        # parsing is left to the caller.
        if path is None:
            path = FileLocations.get_dropbox_wikipedia_path()
        with open(path + filename) as f:
            content = f.readlines()
        return content
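
The method returns raw lines and leaves parsing to the caller. A hedged usage sketch, assuming each line of the file holds one JSON document (the path here is illustrative):

    import json

    with open('short.json') as f:
        docs = [json.loads(line) for line in f if line.strip()]
    print(len(docs), 'documents loaded')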