class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and the reference metadata attached to all
        statements pushed by this crawler."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today())
        ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one"""
        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login()  # Login once for all threads, not needed with OAuth

        for i, res in enumerate(map(self.update_asn, req.text.splitlines())):
            sys.stderr.write(f'\rProcessed {i} ASes')

    def update_asn(self, one_line):
        """Parse one line ('ASN NAME, CC') and update the corresponding AS item.

        Returns the QID of the AS item (may be None if asn2qid failed)."""
        # Parse given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, _, cc = name_cc.rpartition(', ')

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True)

        statements = []
        # Set country only when the country item was found/created; a None
        # QID must not be pushed as a statement target
        if cc_qid is not None:
            statements.append(
                [self.wh.get_pid('country'), cc_qid, self.reference])
        # Set AS name (independent of the country lookup)
        statements.append([self.wh.get_pid('name'), name, self.reference])

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid,
                                      statements)
        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
class Crawler(object):
    def __init__(self):
        """Fetch QIDs for MANRS actions (create them if they are not in the
        wikibase)."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Actions defined by MANRS
        self.actions = [{
            'label': 'MANRS Action 1: Filtering',
            'description': 'Prevent propagation of incorrect routing information'
        }, {
            'label': 'MANRS Action 2: Anti-spoofing',
            'description': 'Prevent traffic with spoofed source IP addresses'
        }, {
            'label': 'MANRS Action 3: Coordination',
            'description': 'Facilitate global operational communication and coordination'
        }, {
            'label': 'MANRS Action 4: Global Validation',
            'description': 'Facilitate routing information on a global scale'
        }]

        # Get the QID for the four items representing MANRS actions
        for action in self.actions:
            action['qid'] = self.wh.get_qid(
                action['label'],
                create={  # Create it if it doesn't exist
                    'summary': 'add MANRS actions',  # Commit message
                    'description': action['description']  # Item description
                })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('MANRS')),
                          (self.wh.get_pid('reference URL'), URL_MANRS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from MANRS and push to wikibase."""
        req = requests.get(URL_MANRS)
        if req.status_code != 200:
            sys.exit('Error while fetching MANRS csv file')

        for i, row in enumerate(req.text.splitlines()):
            # Skip the header
            if i == 0:
                continue

            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i} organizations')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update
        its properties.

        Parses one CSV row (organization, areas, ASNs, and the four action
        flags) and pushes the resulting statements for each listed ASN."""
        _, areas, asns, act1, act2, act3, act4 = [
            col.strip() for col in one_line.split(',')
        ]

        # Properties shared by every ASN of this organization
        statements = [
            [
                self.wh.get_pid('member of'),
                self.wh.get_qid('MANRS'),
                self.reference
            ],
        ]

        # set countries; skip areas that cannot be resolved to a country item
        # (country2qid may return None since we do not create items here)
        for cc in areas.split(';'):
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # set actions
        for i, action_bool in enumerate([act1, act2, act3, act4]):
            if action_bool == 'Yes':
                statements.append([
                    self.wh.get_pid('implements'), self.actions[i]['qid'],
                    self.reference
                ])

        # Commit to wikibase
        for asn in asns.split(';'):
            if asn:  # ignore organizations with no ASN
                # Get the AS QID (create if AS is not yet registered) and commit changes
                net_qid = self.wh.asn2qid(asn, create=True)
                self.wh.upsert_statements('update from MANRS membership',
                                          net_qid, statements)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and the retrying HTTP session used to query
        the IHR API."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self.org_qid = self.wh.get_qid(ORG)
        self.countries = iso3166.countries_by_alpha2

        # Session object to fetch peeringdb data
        retries = Retry(total=15,
                        backoff_factor=0.2,
                        status_forcelist=[104, 500, 502, 503, 504])
        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch data from API and push to wikibase."""
        for cc, country in self.countries.items():
            # Query IHR
            self.url = URL_API.format(country=cc)
            req = self.http_session.get(self.url + '&format=json')
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)
            data = json.loads(req.text)
            ranking = data['results']

            # Setup references
            today = self.wh.today()
            self.references = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), today),
            ]

            # Setup qualifiers
            country_qid = self.wh.country2qid(country.name)
            if country_qid is not None:
                self.qualifiers = [(self.wh.get_pid('country'), country_qid)]
            else:
                self.qualifiers = []

            # Find the latest timebin in the data
            last_timebin = '1970-01-01'
            for r in ranking:
                if arrow.get(r['timebin']) > arrow.get(last_timebin):
                    last_timebin = r['timebin']

            # Make ranking and push data
            for metric, weight in [('Total eyeball', 'eyeball'),
                                   ('Total AS', 'as')]:
                # Get the QID of the selected country / create this country if needed
                self.countryrank_qid = self.wh.get_qid(
                    f'IHR country ranking: {metric} ({cc})',
                    create={  # Create it if it doesn't exist
                        'summary': f'add IHR {metric} ranking for ' + cc,
                        'description': f"IHR's ranking of networks ({metric}) for " + country.name,
                        'statements': [[self.wh.get_pid('managed by'), self.org_qid]]
                    })

                # Filter out unnecessary data: keep only non-transit entries
                # of the right weight scheme in the latest timebin
                selected = [
                    r for r in ranking
                    if (r['weightscheme'] == weight
                        and not r['transitonly']
                        and r['hege'] > MIN_HEGE
                        and r['timebin'] == last_timebin)
                ]

                # Make sure the ranking is sorted and add rank field
                selected.sort(key=lambda x: x['hege'], reverse=True)
                for i, asn in enumerate(selected):
                    asn['rank'] = i

                # Push data to wiki
                for i, res in enumerate(map(self.update_entry, selected)):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {i+1}/{len(selected)}'
                    )
                sys.stderr.write('\n')

    def update_entry(self, asn):
        """Add the network to wikibase if it's not already there and update
        its properties (the 'ranking' statement with country qualifier)."""
        # Properties
        statements = []

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.references, self.qualifiers
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from IHR country ranking', net_qid,
                                  statements, asynchronous=False)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.org_qid = self.wh.get_qid('RIPE NCC')
        self.url = URL_API  # url will change for each country
        self.reference = [
            (self.wh.get_pid('source'), self.org_qid),
            (self.wh.get_pid('reference URL'), self.url),
            (self.wh.get_pid('point in time'), today)
        ]

    def get_last_line(self, line):
        """Keep the end of the last given line"""
        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """Keep the end of each given lines"""
        self.all_lines.append(line.rpartition(' ')[2])

    def run(self):
        """Fetch data from RIPE and push to wikibase."""
        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        logging.info('Connecting to the FTP server..')
        # Find latest roa files
        filepaths = []
        ftp = FTP(FTP_URL)
        ftp.login()
        ftp.cwd(FTP_ROOT)
        self.all_lines = []
        self.last_line = ''
        ftp.retrlines('LIST', callback=self.get_all_lines)
        logging.info('Listing directories...')
        logging.info(f'{self.all_lines}')
        for directory in self.all_lines:
            path = FTP_ROOT + '/' + directory
            ftp.cwd(path)
            self.last_line = ''
            # Walk down the directory tree until a roas.csv (or a repo
            # tarball) is listed
            while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                ftp.cwd(self.last_line)
                path += self.last_line + '/'
                ftp.retrlines('LIST', callback=self.get_last_line)

            # Keep only today's ROA file
            if self.last_line == 'roas.csv' and today in path:
                path += 'roas.csv'
                logging.info(f'Found ROA file: {path}')
                filepaths.append(path)

        for filepath in filepaths:
            self.url = URL_API + filepath
            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + filepath)

            # Aggregate data per prefix
            prefix_info = defaultdict(list)
            for line in req.text.splitlines():
                url, asn, prefix, max_length, start, end = line.split(',')

                # Skip header
                if url == 'URI':
                    continue

                prefix_info[prefix].append({
                    'url': url,
                    'asn': asn,
                    'max_length': max_length,
                    'start': start,
                    'end': end
                })

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(
                    f'\rProcessing {filepath}... {i+1} prefixes ({prefix}) ')

    def update(self, prefix, attributes):
        """Add the prefix to wikibase if it's not already there and update
        its properties (one ROA statement per attribute set)."""
        statements = []
        for att in attributes:
            # Qualifiers: the ROA's validity window and optional maxLength
            qualifiers = [
                [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])],
                [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])],
            ]
            if att['max_length']:
                qualifiers.append(
                    [self.wh.get_pid('maxLength'), {'amount': att['max_length']}])

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                # Skip this prefix entirely when the origin AS item cannot be
                # found or created. (The original code printed the undefined
                # name 'line' here, raising a NameError.)
                print('Error: no QID for AS', att['asn'], 'of prefix', prefix)
                return
            statements.append([
                self.wh.get_pid('route origin authorization'), asn_qid,
                self.reference, qualifiers
            ])

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIPE RPKI data', prefix_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB network ID class if doesn't already
        exist. And fetch QIDs for networks already in the wikibase."""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description': 'Identifier for a network in the PeeringDB database'  # Description
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)
        # Load the QIDs for peeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_NETS),
                          (self.wh.get_pid('point in time'), today)]

        # Session object to fetch peeringdb data
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])
        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase.
        Using multiple threads for better performances."""
        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads
        for i, _ in enumerate(map(self.update_net, networks)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update
        its properties (name, organization, website, IX membership).

        Returns the network QID."""
        # set property name
        statements = [[
            self.wh.get_pid('name'),
            network['name'].strip(), self.reference
        ]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'

        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')
        net_details = json.loads(req.text)['data']
        if not net_details:
            # Guard: an empty 'data' list would otherwise raise an
            # IndexError. Skip only the IX membership statements.
            print(f'No network details found (id={network["id"]})')
        else:
            if len(net_details) > 1:
                print(net_details)

            net_details = net_details[0]

            # Push membership to wikidata
            today = self.wh.today()
            netixlan_ref = [(self.wh.get_pid('source'),
                             self.wh.get_qid('PeeringDB')),
                            (self.wh.get_pid('reference URL'), netixlan_url),
                            (self.wh.get_pid('point in time'), today)]

            for ixlan in net_details['netixlan_set']:
                ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
                if ix_qid is None:
                    print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                    continue
                statements.append(
                    [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network.
        If this network is not yet registered in the wikibase then find (or
        create) the item corresponding to the network ASN and register the
        peeringDB network ID with this item.

        Return the network QID."""
        # Check if the network is in the wikibase
        if str(network['id']) not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)

            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(network['id']), [], net_qualifiers
            ]]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network',
                                      net_qid,
                                      statements=statements)
            # keep track of this QID
            self.netid2qid[str(network['id'])] = net_qid

        return self.netid2qid[str(network['id'])]
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Get the QID for ASRank project
        self.asrank_qid = self.wh.get_qid(
            'CAIDA ASRank',
            create={  # Create it if it doesn't exist
                'summary': 'add CAIDA ASRank',  # Commit message
                'description': "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",  # Item description
                'statements': [[self.wh.get_pid('managed by'), self.caida_qid]]
            })
        self.reference = [(self.wh.get_pid('source'), self.caida_qid),
                          (self.wh.get_pid('reference URL'), URL_API),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from ASRank and push to wikibase."""
        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()

        has_next = True
        i = 0
        while has_next:
            # 'i' counts processed ASes and doubles as the pagination offset
            req = requests.get(URL_API + f'?offset={i}')
            if req.status_code != 200:
                sys.exit('Error while fetching data from API')
            ranking = json.loads(req.text)['data']['asns']
            has_next = ranking['pageInfo']['hasNextPage']

            for res in pool.map(self.update_net, ranking['edges']):
                sys.stderr.write(
                    f'\rProcessing... {i+1}/{ranking["totalCount"]}')
                i += 1

        pool.shutdown()

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update
        its properties (name, country, rank)."""
        asn = asn['node']

        # Properties
        statements = []

        # set name
        if asn['asnName']:
            statements.append(
                [self.wh.get_pid('name'), asn['asnName'], self.reference])

        # set countries; skip countries unknown to the wikibase
        # (country2qid may return None since we do not create items here)
        cc = asn['country']['iso']
        if cc:
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.asrank_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and cache the organization QID and snapshot
        date used as reference for all pushed statements."""
        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Routeviews organization
        self.org_qid = self.wh.get_qid('Route Views')
        # Date reused in every reference built by update_entry
        self.today = self.wh.today()

    def run(self):
        """Fetch BGP data from collectors and push to wikibase. """
        # Two-hour window around today's midnight, to catch RIB dumps
        # produced at that time
        today = arrow.now().replace(hour=0, minute=0)
        start = today.shift(hours=-1)
        end = today.shift(hours=1)
        stream = pybgpstream.BGPStream(
            from_time=int(start.timestamp()), until_time=int(end.timestamp()),
            record_type="ribs",
        )

        # Radix tree indexed by prefix; each node maps origin ASN -> set of
        # collectors that reported it
        rtree = radix.Radix()

        sys.stderr.write(f'\nReading BGP data:\n')
        for i, elem in enumerate(stream):
            # Extract the prefix and origin ASN
            msg = elem.fields
            prefix = msg['prefix']
            # Last hop of the AS path is the origin; it may be an AS set
            # written as '{AS1,AS2,...}'
            origin_asn_str = msg['as-path'].split(' ')[-1]
            origin_asns = []
            if '{' in origin_asn_str:
                origin_asns = origin_asn_str[1:-1].split(',')
            else:
                origin_asns = [origin_asn_str]

            # Store origin ASN in radix tree
            rnode = rtree.search_exact(prefix)
            if rnode is None:
                rnode = rtree.add(prefix)
                rnode.data['origin'] = defaultdict(set)

            for asn in origin_asns:
                rnode.data['origin'][asn].add(elem.collector)
            sys.stderr.write(f'\rProcessed {i+1} BGP messages')

        sys.stderr.write(f'\nPushing data to IYP...\n')

        # Push all prefixes data to IYP
        for i, rnode in enumerate(rtree):
            data = rnode.data['origin']
            self.update_entry(rnode.prefix, data)
            sys.stderr.write(f'\rProcessed {i+1} prefixes')

    def update_entry(self, prefix, originasn_collector):
        """Add the prefix to wikibase if it's not already there and update its
        properties.

        originasn_collector maps an origin ASN to the set of collectors that
        observed it for this prefix; one 'originated by' statement is pushed
        per (ASN, collector) pair, each with its own collector-specific
        reference URL."""
        statements = []

        # set origin AS
        for asn, collectors in originasn_collector.items():
            for collector in collectors:
                # Added properties will have this additional information;
                # RIS collector names contain 'rrc', others are Routeviews
                url = URL_RV
                if 'rrc' in collector:
                    url = URL_RIS

                self.reference = [
                    (self.wh.get_pid('source'), self.org_qid),
                    (self.wh.get_pid('reference URL'), url.format(collector)),
                    (self.wh.get_pid('point in time'), self.today)
                ]

                as_qid = self.wh.asn2qid(asn, create=True)
                statements.append(
                    [self.wh.get_pid('originated by'), as_qid, self.reference])

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIS/Routeviews RIBs',
                                  prefix_qid, statements)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        self.reference = [(self.wh.get_pid('source'), self.apnic_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

        self.countries = iso3166.countries_by_alpha2

    def run(self):
        """Fetch data from APNIC and push to wikibase."""
        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()

        for cc, country in self.countries.items():
            # Get the QID of the selected country / create this country if needed
            # Note: the description previously read 'estimatesbased' (missing
            # space in the concatenation); fixed here.
            self.countryrank_qid = self.wh.get_qid(
                f'APNIC eyeball estimates ({cc})',
                create={  # Create it if it doesn't exist
                    'summary': 'add APNIC eyeball estimates for ' + cc,
                    'description': "APNIC's AS population estimates "
                    + "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],
                        [self.wh.get_pid('country'), self.wh.country2qid(cc)],
                    ]
                })
            self.countrypercent_qid = self.wh.get_qid(
                f'% of Internet users in {country.name}',
                create={  # Create it if it doesn't exist
                    'summary': 'add APNIC eyeball estimates for ' + cc,
                    'description': "APNIC's AS population estimates "
                    + "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],
                        [self.wh.get_pid('country'), self.wh.country2qid(cc)],
                    ]
                })

            self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)
            ranking = json.loads(req.text)

            # Make sure the ranking is sorted and add rank field
            ranking.sort(key=lambda x: x['percent'], reverse=True)
            for i, asn in enumerate(ranking):
                asn['rank'] = i

            # Push data to wiki
            for i, res in enumerate(pool.map(self.update_net, ranking)):
                sys.stderr.write(
                    f'\rProcessing {country.name}... {i+1}/{len(ranking)}')

        pool.shutdown()

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update
        its properties (name, country, rank, eyeball population)."""
        # Properties
        statements = []

        # set name
        if asn['autnum']:
            statements.append(
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country
        if asn['cc']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(asn['cc']), self.reference
            ])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.reference
        ])

        # set population
        statements.append([
            self.wh.get_pid('population'), {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking',
                                  net_qid, statements)
class Crawler(object):
    def __init__(self):
        """Create the RIPE Atlas platform/probe/anchor items and the probe ID
        class if they don't already exist, and load QIDs for probes already
        in the wikibase."""
        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases': 'Atlas|atlas',
                'statements': [[self.wh.get_pid('managed by'), self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing PeeringDB IX IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas probes',  # Commit message
                'description': 'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]
        # Qualifiers distinguishing IPv4 and IPv6 'part of' statements
        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]
        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase.

        Follows the API's paginated results via the 'next' link."""
        next_page = URL

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the blocklist')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):
                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write(f'\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties (anchor flag, ASNs, prefixes, country, start time, status,
        tags)."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        # Anchors are recorded as instances of both probe and anchor
        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])

        # Link to the probe's IPv4/IPv6 ASNs; skipped when the AS is not
        # already in the wikibase (asn2qid may return a falsy value)
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])

        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])

        # Link to the probe's IPv4/IPv6 prefixes, when already registered
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])

        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandonned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'], create={
                    'summary': 'Add RIPE Atlas tag',
                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the QID for the given probe ID.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""
        # NOTE(review): 'id' shadows the builtin of the same name
        id = str(probe['id'])

        # Check if the probe is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]
class Crawler(object):
    def __init__(self):
        """Set up wikihandy, create the Spamhaus items if they are missing,
        and prepare the reference attached to all pushed statements."""
        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Spamhaus organization
        spamhaus_item = {
            'summary': 'add Spamhaus organization',  # Commit message
            'description': 'The Spamhaus Project is an international organisation to track email spammers and spam-related activity',  # Item description
            'aliases': 'The Spamhaus Project|the spamhaus project',
            'statements': [[
                self.wh.get_pid('instance of'),
                self.wh.get_qid('organization')
            ]]
        }
        self.spamhaus_qid = self.wh.get_qid('Spamhaus', create=spamhaus_item)

        # Get the QID for Spamhaus DROP project
        drop_item = {
            'summary': 'add Spamhaus block list',  # Commit message
            'description': "The Spamhaus Don't Route Or Peer Lists",  # Item description
            'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
        }
        self.drop_qid = self.wh.get_qid('Spamhaus DROP lists',
                                        create=drop_item)

        # Get the QID for Spamhaus ASN-DROP list
        asn_drop_item = {
            'summary': 'add Spamhaus block list',  # Commit message
            'description': 'ASN-DROP contains a list of Autonomous System Numbers controlled by spammers or cyber criminals, as well as "hijacked" ASNs. ',  # Item description
            'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                           [self.wh.get_pid('part of'), self.drop_qid]]
        }
        self.asn_drop_qid = self.wh.get_qid('Spamhaus ASN-DROP list',
                                            create=asn_drop_item)

        # Added properties will have this additional information
        self.reference = [
            (self.wh.get_pid('source'), self.spamhaus_qid),
            (self.wh.get_pid('reference URL'), URL),
            (self.wh.get_pid('point in time'), self.wh.today()),
        ]

    def run(self):
        """Fetch blocklist from Spamhaus and push to wikibase."""
        req = requests.get(URL)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        for i, row in enumerate(req.text.splitlines()):
            if row.startswith(';'):
                # Skip the header
                continue
            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i+1} ASes')
        sys.stderr.write('\n')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update
        its properties."""
        raw_asn, _, remainder = one_line.partition(';')
        as_number = int(raw_asn[2:])  # drop the leading 'AS' prefix
        country_code, net_name = (part.strip()
                                  for part in remainder.split('|'))

        # Properties for this AS
        statements = [
            [self.wh.get_pid('reported in'), self.asn_drop_qid,
             self.reference],
            [self.wh.get_pid('name'), net_name, self.reference],
        ]

        # set countries (only well-formed two-letter codes)
        if len(country_code) == 2:
            country_qid = self.wh.country2qid(country_code)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(as_number, create=True)
        self.wh.upsert_statements('update from Spamhaus ASN DROP list',
                                  net_qid, statements)