def prepare_k_cross_random_datesets(edges, settings, vx_count, edge_count, base_path):
    """Shuffle the edges and split them into k equally sized cross-validation subsets."""
    ds_name = settings["name"]
    k = settings["k_subset_count"]
    k_frac = 1 / k
    k_size = 0
    random.shuffle(edges)
    # Write each of the k subsets to its own numbered .cross.csv file.
    for i in range(1, k + 1):
        full_path = path.join(base_path, '{:03}_{}.cross.csv'.format(i, ds_name))
        start = int(((i - 1) * k_frac) * edge_count)
        end = int((i * k_frac) * edge_count)
        k_size = end - start
        util.write_edges_to_file(edges[start:end], full_path)
    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": k,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": k_size * (k - 1),
        "test_sets_size": k_size,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)
    print('Data files for "{}" dataset successfully created '.format(ds_name)
          + '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
def prepare_chrono_perc_dataset(ts_edges, settings, vx_count, edge_count, base_path):
    """Split time-stamped edges chronologically, keeping the newest test_perc percent as the test set."""
    ds_name = settings["name"]
    test_frac = settings["test_perc"] / 100
    test_edges_count = int(edge_count * test_frac)
    train_edges_count = edge_count - test_edges_count
    test_path = path.join(base_path, '{:03}_{}.test.csv'.format(1, ds_name))
    train_path = path.join(base_path, '{:03}_{}.train.csv'.format(1, ds_name))
    # Drop the timestamps and write the older edges to training, the newer ones to test.
    edges = util.triples_to_rear_pairs(ts_edges)
    util.write_edges_to_file(edges[:train_edges_count], train_path)
    util.write_edges_to_file(edges[train_edges_count:], test_path)
    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": 1,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": train_edges_count,
        "test_sets_size": test_edges_count,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)
    print('Data files for "{}" dataset successfully created '.format(ds_name)
          + '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
def prepare_chrono_from_dataset(ts_edges, settings, vx_count, edge_count, base_path):
    """Split time-stamped edges chronologically at the test_from date: older edges train, newer edges test."""
    ds_name = settings["name"]
    test_path = path.join(base_path, '{:03}_{}.test.csv'.format(1, ds_name))
    train_path = path.join(base_path, '{:03}_{}.train.csv'.format(1, ds_name))
    # Locate the first edge with a timestamp at or after the configured split date.
    split_ts = util.str_to_utc_ts(settings["test_from"])
    split_index = util.find_utc_edges_split_index(ts_edges, split_ts)
    edges = util.triples_to_rear_pairs(ts_edges)
    util.write_edges_to_file(edges[:split_index], train_path)
    util.write_edges_to_file(edges[split_index:], test_path)
    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": 1,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": split_index,
        "test_sets_size": edge_count - split_index,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)
    print('Data files for "{}" dataset successfully created '.format(ds_name)
          + '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
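# A minimal usage sketch (not part of the original module) showing how the three
# preparation functions above might be dispatched on settings["split_method"].
# The method-name strings ("k-cross-random", "chrono-perc", "chrono-from") and the
# example settings values in the comments are assumptions for illustration only;
# the real configuration keys and values come from the caller's dataset settings.
def _prepare_dataset_example(edges, ts_edges, settings, vx_count, edge_count, base_path):
    method = settings["split_method"]
    if method == "k-cross-random":
        # e.g. settings = {"name": "demo", "split_method": "k-cross-random", "k_subset_count": 5}
        prepare_k_cross_random_datesets(edges, settings, vx_count, edge_count, base_path)
    elif method == "chrono-perc":
        # e.g. settings = {"name": "demo", "split_method": "chrono-perc", "test_perc": 20}
        prepare_chrono_perc_dataset(ts_edges, settings, vx_count, edge_count, base_path)
    elif method == "chrono-from":
        # e.g. settings = {"name": "demo", "split_method": "chrono-from", "test_from": "2015-01-01"}
        prepare_chrono_from_dataset(ts_edges, settings, vx_count, edge_count, base_path)
    else:
        raise ValueError('Unknown split method: {}'.format(method))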
def update_yearly_data(category, year, filepath):
    """Fetch raw article data for an arXiv category and year, and cache it as JSON at filepath."""
    if not is_cached(filepath):
        print('Raw data for {}:{}. Cache not available.'.format(
            category, year))
        total = fetch_article_count(category, year)
        articles = []
        # Download the year's articles in chunks, pausing between requests.
        for offset in range(0, total, ARXIV_DATA_CHUNK_SIZE):
            print_progress_info(category, year, total, offset)
            chunk_size = ARXIV_DATA_CHUNK_SIZE
            if offset + chunk_size > total:
                chunk_size = total - offset
            data = fetch_data_chunk(category, year, offset, chunk_size)
            parse_data_chunk(category, articles, data)
            wait()
        util.write_to_json(articles, filepath)
        print('Raw data for {}:{}. Cache updated ({} of {} articles).'.format(
            category, year, len(articles), total))
    else:
        print('Raw data for {}:{}. Cache present.'.format(category, year))
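# A minimal usage sketch (not part of the original module): refreshing the raw-data
# cache for one arXiv category over a range of years. The cache directory layout and
# file-name pattern below are assumptions for illustration; update_yearly_data itself
# only needs a category, a year, and a target file path.
def _update_category_example(category, first_year, last_year, cache_dir):
    for year in range(first_year, last_year + 1):
        filepath = path.join(cache_dir, '{}_{}.json'.format(category, year))
        update_yearly_data(category, year, filepath)

# Example call (hypothetical category and paths):
# _update_category_example('cs.DS', 2010, 2015, 'cache/raw')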