class Crawler(object):
    """Crawler pushing AS names and country codes from the RIPE AS name file."""

    def __init__(self):
        """Set up wiki access and the reference attached to pushed statements."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today()),
        ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one.

        Exits the program (``sys.exit``) if the file cannot be downloaded.
        """
        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login()  # Login once for all threads, not needed with OAuth

        # Count from 1 so the message reports how many lines are done.
        lines = req.text.splitlines()
        for done, _ in enumerate(map(self.update_asn, lines), start=1):
            sys.stderr.write(f'\rProcessed {done} ASes')

    def update_asn(self, one_line):
        """Push the name and country found on one line of the AS name file.

        The line is split as ``<asn> <name>, <country code>``. The AS name is
        always pushed; the country is pushed only when a QID is available
        for it. Errors raised while committing are printed and do not stop
        the crawl.

        Returns the QID of the AS item, or None for a blank line.
        """
        # Nothing to parse on an empty line
        if not one_line.strip():
            return None

        # Parse given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, _, cc = name_cc.rpartition(', ')

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True)

        # Set AS name
        statements = [[self.wh.get_pid('name'), name, self.reference]]
        # Set country, but never push a statement without a target item
        if cc_qid is not None:
            statements.append(
                [self.wh.get_pid('country'), cc_qid, self.reference])

        try:
            # Update AS name and country
            self.wh.upsert_statements(
                'updates from RIPE AS names', asn_qid, statements)
        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
class Crawler(object):
    """Crawler pushing PeeringDB organizations to the wikibase."""

    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class
        if it doesn't already exist, and fetch QIDs for organizations already
        in the wikibase."""
        sys.stderr.write('Initialization...\n')

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB org IDs',  # Commit message
                'description':
                    'Identifier for an organization in the PeeringDB database',
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
            (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
            (self.wh.get_pid('point in time'), today),
        ]

    def run(self):
        """Fetch organizations information from PeeringDB and push to wikibase.

        Exits the program (``sys.exit``) if the download fails.
        """
        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            # This crawler fetches organizations, not AS names
            sys.exit('Error while fetching PeeringDB organizations')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        total = len(organizations)
        for done, _ in enumerate(map(self.update_org, organizations), start=1):
            sys.stderr.write(f'\rProcessing... {done}/{total}')

    def update_org(self, organization):
        """Add the organization to wikibase if it's not there and update
        its name, website and country.

        `organization` is one entry of the PeeringDB response; the keys
        read here are 'id', 'name', 'website' and 'country'.
        Returns the QID of the organization item.
        """
        # set property name
        statements = [
            [self.wh.get_pid('instance of'), self.wh.get_qid('organization')],
            [self.wh.get_pid('name'), organization['name'].strip(),
             self.reference],
        ]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country (only for known ISO 3166 alpha-2 codes)
        if organization['country'] in iso3166.countries_by_alpha2:
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to wikibase if it is not yet
        there. Return the organization QID."""
        org_id = str(organization['id'])

        # Check if the organization is in the wikibase
        if org_id not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [
                [self.wh.get_pid('external ID'), org_id, [], org_qualifier]
            ]

            # Add this organization to the wikibase. The label is stripped
            # so it matches the value pushed for the 'name' property.
            new_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'].strip(),
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[org_id] = new_qid

        return self.orgid2qid[org_id]
class Crawler(object):
    """Crawler pushing PeeringDB networks and IX memberships to the wikibase."""

    def __init__(self):
        """Create an item representing the PeeringDB network ID class if it
        doesn't already exist, and fetch QIDs for networks, organizations and
        IXs already in the wikibase."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description':
                    'Identifier for a network in the PeeringDB database',
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)
        # Load the QIDs for peeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
            (self.wh.get_pid('reference URL'), URL_PDB_NETS),
            (self.wh.get_pid('point in time'), today),
        ]

        # Session object to fetch peeringdb data, retrying failed requests
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])
        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase.

        Networks are processed one after the other. Exits the program
        (``sys.exit``) if the network list cannot be downloaded.
        """
        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        total = len(networks)
        for done, _ in enumerate(map(self.update_net, networks), start=1):
            sys.stderr.write(f'\rProcessing... {done}/{total}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update
        its name, organization, website and IX memberships.

        `network` is one entry of the PeeringDB network list; the keys read
        here are 'id', 'name', 'org_id', 'website' and (in net_qid) 'asn'.
        Returns the QID of the network item.
        """
        # set property name
        statements = [[
            self.wh.get_pid('name'), network['name'].strip(), self.reference
        ]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        statements.extend(self._membership_statements(network))

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def _membership_statements(self, network):
        """Fetch the network details and return its 'member of' statements.

        One statement is returned per distinct known IX. Exits the program
        (``sys.exit``) if the details cannot be downloaded.
        """
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'
        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')

        net_details = json.loads(req.text)['data']
        if not net_details:
            # No record returned: nothing to push instead of an IndexError
            print(f'No details for network: id={network["id"]}')
            return []
        if len(net_details) > 1:
            # Unexpected: more than one record for a single network ID
            print(net_details)
        net_details = net_details[0]

        # Push membership to wikidata
        today = self.wh.today()
        netixlan_ref = [
            (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
            (self.wh.get_pid('reference URL'), netixlan_url),
            (self.wh.get_pid('point in time'), today),
        ]

        memberships = []
        seen_ix = set()
        for ixlan in net_details['netixlan_set']:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            # A network may have several entries at the same IX; one
            # membership statement per IX is enough.
            if ix_qid in seen_ix:
                continue
            seen_ix.add(ix_qid)
            memberships.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        return memberships

    def net_qid(self, network):
        """Find the network QID for the given network.

        If this network is not yet registered in the wikibase then find (or
        create) the item corresponding to the network ASN and register the
        peeringDB network ID with this item. Return the network QID.
        """
        net_id = str(network['id'])

        # Check if the network is in the wikibase
        if net_id not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)
            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [
                [self.wh.get_pid('external ID'), net_id, [], net_qualifiers]
            ]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network', net_qid,
                                      statements=statements)
            # keep track of this QID
            self.netid2qid[net_id] = net_qid

        return self.netid2qid[net_id]
class Crawler(object):
    """Crawler pushing CAIDA ASRank names, countries and ranks to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Get the QID for ASRank project
        self.asrank_qid = self.wh.get_qid(
            'CAIDA ASRank',
            create={  # Create it if it doesn't exist
                'summary': 'add CAIDA ASRank',  # Commit message
                # Item description
                'description': "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",
                'statements':
                    [[self.wh.get_pid('managed by'), self.caida_qid]],
            })

        self.reference = [
            (self.wh.get_pid('source'), self.caida_qid),
            (self.wh.get_pid('reference URL'), URL_API),
            (self.wh.get_pid('point in time'), today),
        ]

    def run(self):
        """Fetch networks information from ASRank and push to wikibase.

        Pages are requested with an offset equal to the number of networks
        already processed, until the API reports there is no next page.
        Exits the program (``sys.exit``) if a request fails.
        """
        self.wh.login()  # Login once for all threads

        # The context manager shuts the pool down even when sys.exit is hit
        with ThreadPoolExecutor() as pool:
            has_next = True
            i = 0
            while has_next:
                req = requests.get(URL_API + f'?offset={i}')
                if req.status_code != 200:
                    sys.exit('Error while fetching data from API')

                ranking = json.loads(req.text)['data']['asns']
                has_next = ranking['pageInfo']['hasNextPage']

                for _ in pool.map(self.update_net, ranking['edges']):
                    sys.stderr.write(
                        f'\rProcessing... {i+1}/{ranking["totalCount"]}')
                    i += 1

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update
        its name, country and rank.

        `asn` is one entry of the API's 'edges' list; the network data is
        read from its 'node' key ('asn', 'asnName', 'country', 'rank').
        Returns the QID of the AS item.
        """
        node = asn['node']

        # Properties
        statements = []

        if node['asnName']:
            statements.append(
                [self.wh.get_pid('name'), node['asnName'], self.reference])

        # set countries; tolerate a missing/null country object and never
        # push a statement without a target item
        country = node.get('country') or {}
        cc = country.get('iso')
        if cc:
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'),
            {
                'amount': node['rank'],
                'unit': self.asrank_qid,
            },
            self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(node['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)

        return net_qid
class Crawler(object):
    """Crawler for PeeringDB exchange points (IXs) and their peering LANs."""

    def __init__(self):
        """Prepare wiki access, the ID lookup tables and the shared reference.

        The item representing the PeeringDB exchange point ID class is
        created if it doesn't already exist; QIDs of the IXs and
        organizations already in the wikibase are then loaded.
        """
        # Helper for wiki access
        self.wh = Wikihandy()

        # QID of the item representing PeeringDB IX IDs (created if missing)
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create=dict(
                summary='add PeeringDB ix IDs',
                description='Identifier for an exchange point in the PeeringDB database',
            ))

        # Lookup tables: PeeringDB ID -> QID, for IXs and for organizations
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Reference information attached to the pushed statements
        self.today = self.wh.today()
        source = (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB'))
        ref_url = (self.wh.get_pid('reference URL'), URL_PDB_IXS)
        point_in_time = (self.wh.get_pid('point in time'), self.today)
        self.reference = [source, ref_url, point_in_time]

    def run(self):
        """Download the IX list, then fetch and push every IX one by one.

        Exits the program (``sys.exit``) as soon as a request does not
        return status 200.
        """
        listing = requests.get(URL_PDB_IXS)
        if listing.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(listing.text)['data']

        self.wh.login()  # Login once for all threads

        total = len(ixs)
        for count, ix in enumerate(ixs, start=1):
            # The per-IX endpoint gives more info than the listing
            details = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if details.status_code != 200:
                sys.exit('Error while fetching IXs data')

            # Update info in wiki
            self.update_ix(json.loads(details.text)['data'][0])

            sys.stderr.write(f'\rProcessing... {count}/{total}')

    def update_ix(self, ix):
        """Push the properties of one IX, then those of its peering LANs.

        The IX item is added to the wikibase first if it is not known yet.
        Returns the QID of the IX item.
        """
        ref = self.reference

        # type and name of the item
        statements = [
            [self.wh.get_pid('instance of'),
             self.wh.get_qid('Internet exchange point')],
            [self.wh.get_pid('name'), ix['name'].strip(), ref],
        ]

        # managing organization, when it is already in the wikibase
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is None:
            print('Error this organization is not in wikibase: ', ix['org_id'])
        else:
            statements.append([self.wh.get_pid('managed by'), org_qid, ref])

        # country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, ref])

        # website
        if ix['website']:
            statements.append([self.wh.get_pid('website'), ix['website'], ref])

        # traffic webpage: a website qualified as traffic statistics
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],
                ref,
                [(self.wh.get_pid('instance of'),
                  self.wh.get_qid('traffic statistics'))],
            ])

        # Update name, website, and organization for this IX
        ix_qid = self.ix_qid(ix)
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # Nothing more to do when the IX comes without LAN information
        if 'ixlan_set' not in ix:
            return ix_qid

        # update LAN corresponding to this IX
        for ixlan in ix['ixlan_set']:
            pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
            pfx_ref = [
                (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                (self.wh.get_pid('reference URL'), pfx_url),
                (self.wh.get_pid('point in time'), self.today),
            ]

            response = requests.get(pfx_url)
            if response.status_code != 200:
                sys.exit('Error while fetching IXs data')

            lans = json.loads(response.text)['data']
            prefixes = (pfx for lan in lans for pfx in lan['ixpfx_set'])
            for prefix in prefixes:
                pfx_qid = self.wh.prefix2qid(prefix['prefix'], create=True)
                is_peering_lan = [
                    self.wh.get_pid('instance of'),
                    self.wh.get_qid('peering LAN'),
                    pfx_ref,
                ]
                managed_by_ix = [self.wh.get_pid('managed by'), ix_qid, pfx_ref]
                self.wh.upsert_statements('update peeringDB ixlan', pfx_qid,
                                          [is_peering_lan, managed_by_ix])

        return ix_qid

    def ix_qid(self, ix):
        """Return the QID of the given IX, adding the IX to the wikibase
        (and to the local lookup table) when it is not registered yet."""
        key = str(ix['id'])

        # Already in the wikibase: nothing to create
        if key in self.ixid2qid:
            return self.ixid2qid[key]

        # Properties for this new ix
        id_qualifiers = [
            (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
        ]
        type_statement = (self.wh.get_pid('instance of'),
                          self.wh.get_qid('Internet exchange point'))
        id_statement = (self.wh.get_pid('external ID'), key, [], id_qualifiers)

        # Add this ix to the wikibase and keep track of its QID
        self.ixid2qid[key] = self.wh.add_item(
            'add new peeringDB IX',
            label=ix['name'],
            description=ix['name_long'],
            statements=[type_statement, id_statement])

        return self.ixid2qid[key]
class Crawler(object):
    """Crawler pushing APNIC per-country AS population estimates to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self._today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        self.reference = self._build_reference(self.url)

        self.countries = iso3166.countries_by_alpha2

    def _build_reference(self, url):
        """Return the reference list pointing at the given URL."""
        return [
            (self.wh.get_pid('source'), self.apnic_qid),
            (self.wh.get_pid('reference URL'), url),
            (self.wh.get_pid('point in time'), self._today),
        ]

    def run(self):
        """Fetch data from APNIC and push to wikibase.

        For each country in `self.countries`, the ranking and percentage
        unit items are looked up (created if missing), the country's data
        is downloaded, and every listed AS is pushed through a thread pool.
        Exits the program (``sys.exit``) if a download fails.
        """
        self.wh.login()  # Login once for all threads

        # The context manager shuts the pool down even when sys.exit is hit
        with ThreadPoolExecutor() as pool:
            for cc, country in self.countries.items():
                country_qid = self.wh.country2qid(cc)
                # Shared by the two unit items created below. The explicit
                # space avoids the "estimatesbased" concatenation typo.
                summary = 'add APNIC eyeball estimates for ' + cc
                description = ("APNIC's AS population estimates "
                               "based on advertisement for " + country.name)

                # Get the QID of the selected country / create this country if needed
                self.countryrank_qid = self.wh.get_qid(
                    f'APNIC eyeball estimates ({cc})',
                    create={  # Create it if it doesn't exist
                        'summary': summary,
                        'description': description,
                        'statements': [
                            [self.wh.get_pid('managed by'), self.apnic_qid],
                            [self.wh.get_pid('website'), URL_API],
                            [self.wh.get_pid('country'), country_qid],
                        ]
                    })

                self.countrypercent_qid = self.wh.get_qid(
                    f'% of Internet users in {country.name}',
                    create={  # Create it if it doesn't exist
                        'summary': summary,
                        'description': description,
                        'statements': [
                            [self.wh.get_pid('managed by'), self.apnic_qid],
                            [self.wh.get_pid('website'), URL_API],
                            [self.wh.get_pid('country'), country_qid],
                        ]
                    })

                # The reference must follow the per-country URL; the list
                # built in __init__ only holds the base URL.
                self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
                self.reference = self._build_reference(self.url)

                req = requests.get(self.url)
                if req.status_code != 200:
                    sys.exit('Error while fetching data for ' + cc)

                ranking = json.loads(req.text)
                # Make sure the ranking is sorted and add rank field
                # NOTE(review): ranks start at 0 here - confirm this is wanted
                ranking.sort(key=lambda x: x['percent'], reverse=True)
                for i, asn in enumerate(ranking):
                    asn['rank'] = i

                # Push data to wiki. All results are consumed before the
                # per-country attributes are changed for the next country.
                total = len(ranking)
                for done, _ in enumerate(
                        pool.map(self.update_net, ranking), start=1):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {done}/{total}')

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update
        its name, country, rank and population share.

        `asn` is one entry of the APNIC data; the keys read here are 'as',
        'autnum', 'cc', 'percent' and the 'rank' added by run().
        Returns the QID of the AS item.
        """
        # Properties
        statements = []

        # set name
        if asn['autnum']:
            statements.append(
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country, but never push a statement without a target item
        if asn['cc']:
            cc_qid = self.wh.country2qid(asn['cc'])
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'),
            {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            },
            self.reference
        ])

        # set population
        statements.append([
            self.wh.get_pid('population'),
            {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            },
            self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid,
                                  statements)

        return net_qid