import zipfile
import networkx as nx
from io import BytesIO
import csv
import graph_info_csv_helpers as utils

__author__ = "Henry Carscadden"
__email__ = '*****@*****.**'

"""
This file downloads networks from Alex Arenas' website, reads each archive into an
in-memory buffer, and, from that buffer, extracts node attributes and builds a graph.
The attributes and graph are written to file.
"""

data_url = "http://deim.urv.cat/~alexandre.arenas/data/"
base_url = "http://deim.urv.cat/~alexandre.arenas/data/welcome.htm"

parsed_html = utils.soupify(base_url)

# Walk every zip link on the index page and pull the Pajek (.net/.paj) file out of
# each downloaded archive.
for link in parsed_html.find_all('a'):
    if 'zip' in link.get('href'):
        url = data_url + link.get('href')
        pajek_lines = []
        graph_zipped = utils.get_zip_fp(url)
        for file in graph_zipped.infolist():
            ext = file.filename[-3:].lower()
            if ext == "net" or ext == "paj":
                pajek_lines = graph_zipped.read(file.filename).decode('utf-8')
                # The jazz network file mixes tabs into its whitespace; clean it
                # up line by line before parsing.
                if 'jazz' in file.filename:
                    pajek_lines = "\n".join(
                        list(
                            map(
                                lambda x: " ".join(
                                    x.strip(' ').replace('\t', '').split(' ')),
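# ---------------------------------------------------------------------------
# The scripts in this section lean on helpers from graph_info_csv_helpers that
# are not shown here. Below is a minimal, hypothetical sketch of what soupify()
# and get_zip_fp() are assumed to do; requests and BeautifulSoup are assumed
# dependencies, not confirmed ones.
import zipfile
from io import BytesIO

import requests
from bs4 import BeautifulSoup


def soupify(url):
    # Assumed behavior: fetch the page and return a parsed BeautifulSoup tree.
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def get_zip_fp(url):
    # Assumed behavior: download a zip archive and return a ZipFile over an
    # in-memory buffer, so members can be read without touching disk.
    response = requests.get(url)
    response.raise_for_status()
    return zipfile.ZipFile(BytesIO(response.content))
# ---------------------------------------------------------------------------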
import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

# Scrape the SNAP group listing on the SuiteSparse Matrix Collection and download
# each dataset, deferring anything larger than bytes_limit.
edge_list_path = '../snap_networks/edge_lists/'
node_id_path = '../snap_networks/node_id_mappings/'
snap_data_url = "https://sparse.tamu.edu/SNAP?per_page=All"
bytes_limit = 20000000

index_page_parsed = utils.soupify(snap_data_url)
rows = index_page_parsed.find_all('table')[1].find_all('tr')

# Skip the header row, then process one dataset per table row.
for i in range(1, len(rows)):
    row = rows[i]
    row_data = [attr for attr in row.find_all('td')]
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        # Too large to pull now; record it for a later download.
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":
            with urllib.request.urlopen(dataset_url) as tarred_mtx:
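# ---------------------------------------------------------------------------
# The SuiteSparse scrapers in this section break off right after opening the
# .tar.gz download. The sketch below is hypothetical (not the original code):
# it assumes, as is typical for sparse.tamu.edu archives, that the tarball
# contains a MatrixMarket .mtx file, and it needs networkx >= 2.8 for
# from_scipy_sparse_array. The helper name read_suitesparse_tarball is made up.
import io
import tarfile
import urllib.request

import networkx as nx
import scipy.io


def read_suitesparse_tarball(dataset_url):
    # Download the archive, find the first .mtx member, and build a networkx
    # graph from its adjacency matrix.
    with urllib.request.urlopen(dataset_url) as tarred_mtx:
        buffer = io.BytesIO(tarred_mtx.read())
    with tarfile.open(fileobj=buffer, mode='r:gz') as archive:
        for member in archive.getmembers():
            if member.name.lower().endswith('.mtx'):
                matrix = scipy.io.mmread(archive.extractfile(member))
                return nx.from_scipy_sparse_array(matrix)
    return None
# ---------------------------------------------------------------------------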
import graph_info_csv_helpers as utils
import urllib.request
import igraph
import os

# Download every UCINET dataset (.dat, in DL format) linked from ucidata.htm and
# save it locally with a .dl extension.
base_url = "http://vlado.fmf.uni-lj.si/pub/networks/data/ucinet/"

ucinet_parsed = utils.soupify(base_url + "ucidata.htm")

for link in ucinet_parsed.find_all('a'):
    link_href = link.get('href')
    if link_href is not None:
        ext = link_href[-3:].lower()
        if ext == 'dat':
            with urllib.request.urlopen(base_url + link_href.split('/')[-1]) as dat_fp:
                file_data = dat_fp.read().decode('utf-8')
            with open('../dl_files/' + link_href.split('/')[-1] + '.dl', 'w', newline='') as tmp_fp:
                tmp_fp.write(file_data)
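# ---------------------------------------------------------------------------
# igraph is imported above but never used in the snippet shown; presumably the
# saved .dl files are parsed in a later step. A hypothetical sketch of that
# step using python-igraph's UCINET DL reader (illustrative, not the original
# code):
import os

import igraph

dl_dir = '../dl_files/'
for dl_name in sorted(os.listdir(dl_dir)):
    if dl_name.endswith('.dl'):
        try:
            # Graph.Read_DL parses UCINET DL files.
            graph = igraph.Graph.Read_DL(os.path.join(dl_dir, dl_name))
            print(dl_name, graph.vcount(), graph.ecount())
        except Exception:
            # Skip files the reader cannot handle.
            continue
# ---------------------------------------------------------------------------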
import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

# Scrape the Gleich group listing on the SuiteSparse Matrix Collection and download
# each dataset, deferring anything larger than bytes_limit.
base_dir = '../gleich_networks/'
edge_list_path = base_dir + 'edge_lists/'
node_id_path = base_dir + 'node_id_mappings/'
base_url = "https://sparse.tamu.edu/Gleich"
bytes_limit = 10000000

index_page_parsed = utils.soupify(base_url)
rows = index_page_parsed.find_all('table')[1].find_all('tr')

# Skip the header row, then process one dataset per table row.
for i in range(1, len(rows)):
    row = rows[i]
    row_data = [attr for attr in row.find_all('td')]
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        # Too large to pull now; record it for a later download.
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":
import csv
import json
import scipy.io
import networkx as nx
import graph_info_csv_helpers as utils

__author__ = "Henry Carscadden"
__email__ = '*****@*****.**'

"""
This file downloads files from a large repository known as Network Repository.
"""

base_site = "http://networkrepository.com/"
base_url = "http://networkrepository.com/networks.php"
edge_list_path = '../network_repo_networks/edge_lists/'
node_id_path = '../network_repo_networks/node_id_mappings/'

parsed_networks_page = utils.soupify(base_url)


def node_id_write(G, url, edge_list_path, node_id_path, name):
    # Relabel the nodes of G with consecutive integers and write the
    # original-label / integer-id mapping to <node_id_path>/<name>.csv.
    old_attributes = list(G.nodes)
    G = nx.convert_node_labels_to_integers(G)
    id_mapping = []
    node_list = list(G.nodes)
    for i in range(len(node_list)):
        id_mapping.append([old_attributes[i], str(node_list[i])])
    mapping_file = open(node_id_path + name + '.csv', 'w', newline='')
    mapping_file_writer = csv.writer(mapping_file)
    mapping_file_writer.writerow(['id', 'name'])
    for tup in id_mapping:
        mapping_file_writer.writerow(list(tup))
    mapping_file.close()
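# ---------------------------------------------------------------------------
# Hypothetical usage sketch of node_id_write() (not part of the original
# script): relabel a small demo graph and write its label/id mapping CSV. The
# demo graph and the 'demo_graph' name are made up for illustration, and the
# node_id_path directory is assumed to exist.
if __name__ == '__main__':
    demo = nx.Graph([('alice', 'bob'), ('bob', 'carol')])
    node_id_write(demo, base_site, edge_list_path, node_id_path, 'demo_graph')
# ---------------------------------------------------------------------------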
import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

# Scrape the vanHeukelum group listing on the SuiteSparse Matrix Collection and
# download each dataset, deferring anything larger than bytes_limit.
base_dir = '../vanheukelum_networks/'
edge_list_path = base_dir + 'edge_lists/'
node_id_path = base_dir + 'node_id_mappings/'
van_heukelum_url = "https://sparse.tamu.edu/vanHeukelum"
bytes_limit = 10000000

index_page_parsed = utils.soupify(van_heukelum_url)
rows = index_page_parsed.find_all('table')[1].find_all('tr')

# Skip the header row, then process one dataset per table row.
for i in range(1, len(rows)):
    row = rows[i]
    row_data = [attr for attr in row.find_all('td')]
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        # Too large to pull now; record it for a later download.
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":