class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class if it
        doesn't already exist, and fetch QIDs for organizations already in the
        wikibase."""

        sys.stderr.write('Initialization...\n')

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB org IDs',  # Commit message
                'description': 'Identifier for an organization in the PeeringDB database'
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch organizations information from PeeringDB and push it to the wikibase."""

        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            sys.exit('Error while fetching organizations data')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_org, organizations)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}')

    def update_org(self, organization):
        """Add the organization to the wikibase if it's not there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('organization')
        ], [
            self.wh.get_pid('name'),
            organization['name'].strip(), self.reference
        ]]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country
        if organization['country'] in iso3166.countries_by_alpha2:
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to the wikibase if it is not yet
        there. Return the organization QID."""

        # Check if the organization is in the wikibase
        if str(organization['id']) not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(organization['id']), [], org_qualifier
            ]]

            # Add this organization to the wikibase
            org_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'],
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[str(organization['id'])] = org_qid

        return self.orgid2qid[str(organization['id'])]
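# The class above relies on module-level names that are defined elsewhere in the
# crawler module. A minimal sketch of that scaffolding, for illustration only:
# the Wikihandy import path and the ORGID_LABEL value are assumptions (the label
# follows the class docstring), while URL_PDB_ORGS points at PeeringDB's public
# API endpoint for organizations.
import json
import sys

import iso3166
import requests

from wikihandy import Wikihandy  # assumed import path for the project's wiki helper

# Label of the wikibase item classifying PeeringDB organization IDs (assumed value)
ORGID_LABEL = 'PeeringDB organization ID'
# PeeringDB API endpoint listing all organizations
URL_PDB_ORGS = 'https://peeringdb.com/api/org'

if __name__ == '__main__':
    crawler = Crawler()
    crawler.run()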
class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB exchange point ID' class if it
        doesn't already exist, and fetch QIDs for exchange points already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB ix IDs',  # Commit message
                'description': 'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for IXs already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)

        # Load the QIDs for PeeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch IX information from PeeringDB and push it to the wikibase."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):
            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in wiki
            self.update_ix(ix_info)

            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the IX to the wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('Internet exchange point')
        ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # link to the corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error: this organization is not in the wikibase: ',
                  ix['org_id'])

        # set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # set traffic webpage
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],  # statement
                self.reference,   # reference
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]  # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # update LANs corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                pfx_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IX LAN data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'], create=True)
                        pfx_stmts = [[
                            self.wh.get_pid('instance of'),
                            self.wh.get_qid('peering LAN'), pfx_ref
                        ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]

                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the IX QID for the given IX. If this IX is not yet registered in
        the wikibase then add it. Return the IX QID."""

        # Check if the IX is in the wikibase
        if str(ix['id']) not in self.ixid2qid:
            # Set properties for this new IX
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'), str(ix['id']), [],
                           ix_qualifiers)]

            # Add this IX to the wikibase
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[str(ix['id'])] = ix_qid

        return self.ixid2qid[str(ix['id'])]
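# As above, a minimal sketch of the module-level scaffolding the IX crawler
# assumes. The label values, the Wikihandy import path, and the exact LAN
# endpoint are assumptions; the IX listing URL is PeeringDB's public API
# endpoint for exchange points.
import json
import sys

import requests

from wikihandy import Wikihandy  # assumed import path

ORGID_LABEL = 'PeeringDB organization ID'        # assumed, matches the org crawler
IXID_LABEL = 'PeeringDB exchange point ID'       # assumed, from the class docstring
URL_PDB_IXS = 'https://peeringdb.com/api/ix'     # IX listing
URL_PDB_LAN = 'https://peeringdb.com/api/ixlan'  # assumed per-LAN endpoint (returns ixpfx_set)

if __name__ == '__main__':
    crawler = Crawler()
    crawler.run()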
class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB network ID' class if it
        doesn't already exist, and fetch QIDs for networks already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description': 'Identifier for a network in the PeeringDB database'  # Description
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)

        # Load the QIDs for PeeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Load the QIDs for PeeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_NETS),
                          (self.wh.get_pid('point in time'), today)]

        # Session object to fetch peeringdb data
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])
        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push it to the wikibase."""

        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_net, networks)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}')

    def update_net(self, network):
        """Add the network to the wikibase if it's not already there and update
        its properties."""

        # set property name
        statements = [[
            self.wh.get_pid('name'),
            network['name'].strip(), self.reference
        ]]

        # link to the corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error: this organization is not in the wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'

        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')
        net_details = json.loads(req.text)['data']
        if len(net_details) > 1:
            # unexpected: the API should return a single record for this ID
            print(net_details)

        net_details = net_details[0]

        # Push membership to the wikibase
        today = self.wh.today()
        netixlan_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                        (self.wh.get_pid('reference URL'), netixlan_url),
                        (self.wh.get_pid('point in time'), today)]

        for ixlan in net_details['netixlan_set']:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            statements.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network. If this network is not yet
        registered in the wikibase then find (or create) the item corresponding
        to the network ASN and register the PeeringDB network ID with this item.
        Return the network QID."""

        # Check if the network is in the wikibase
        if str(network['id']) not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)

            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(network['id']), [], net_qualifiers
            ]]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network',
                                      net_qid,
                                      statements=statements)
            # keep track of this QID
            self.netid2qid[str(network['id'])] = net_qid

        return self.netid2qid[str(network['id'])]
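# Minimal sketch of the scaffolding assumed by the network crawler above. The
# label values and the Wikihandy import path are assumptions; URL_PDB_NETS is
# PeeringDB's public API endpoint for networks. The retry-aware session also
# needs Retry and HTTPAdapter.
import json
import sys

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from wikihandy import Wikihandy  # assumed import path

ORGID_LABEL = 'PeeringDB organization ID'   # assumed, matches the org crawler
IXID_LABEL = 'PeeringDB exchange point ID'  # assumed, matches the IX crawler
NETID_LABEL = 'PeeringDB network ID'        # assumed, from the class docstring
URL_PDB_NETS = 'https://peeringdb.com/api/net'

if __name__ == '__main__':
    crawler = Crawler()
    crawler.run()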
class Crawler(object):
    def __init__(self):
        """Create items representing RIPE Atlas, Atlas probes, Atlas anchors, and
        the 'RIPE Atlas probe ID' class if they don't already exist, and fetch
        QIDs for probes already in the wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases': 'Atlas|atlas',
                'statements': [[self.wh.get_pid('managed by'),
                                self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas probes',  # Commit message
                'description': 'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from the RIPE Atlas API and push it to the wikibase."""

        next_page = URL
        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching RIPE Atlas probes')
            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):
                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write('\n')

    def update_probe(self, probe):
        """Add the probe to the wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])

        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])

        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])

        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])

        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QID for the probe's status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })
            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandoned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create it if the probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the probe QID for the given probe ID. If this probe is not yet
        registered in the wikibase then add it. Return the probe QID."""

        id = str(probe['id'])

        # Check if the probe is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]
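# Minimal sketch of the scaffolding assumed by the Atlas probe crawler above.
# PROBEID_LABEL and the Wikihandy import path are assumptions; URL is the public
# RIPE Atlas v2 endpoint for probes, which is paginated and provides the 'next'
# field consumed in run().
import json
import sys

import requests

from wikihandy import Wikihandy  # assumed import path

PROBEID_LABEL = 'RIPE Atlas probe ID'  # assumed label for the external-ID class
URL = 'https://atlas.ripe.net/api/v2/probes/'

if __name__ == '__main__':
    crawler = Crawler()
    crawler.run()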