示例#1
0
class Crawler(object):
    def __init__(self):
        """Initialize the wikihandy helper and the reference metadata that is
        attached to every statement pushed to the wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today())
            ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one"""

        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login() # Login once for all threads, not needed with OAuth

        for i, res in enumerate(map(self.update_asn, req.text.splitlines())):
            sys.stderr.write(f'\rProcessed {i} ASes')

    def update_asn(self, one_line):
        """Parse one line of the AS name file ('ASN name, CC') and upsert the
        AS name and country in the wikibase.

        Return the QID of the AS item."""

        # Parse given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, sep, cc = name_cc.rpartition(', ')
        if not sep:
            # No ', CC' suffix: the whole remainder is the AS name
            # (previously this left name empty and put everything in cc)
            name, cc = name_cc, ''

        asn_qid = self.wh.asn2qid(asn, create=True)
        # Only resolve (and possibly create) a country item when a country
        # code is actually present
        cc_qid = self.wh.country2qid(cc, create=True) if cc else None

        statements = []
        # BUGFIX: the None-guard used to protect the *name* statement while
        # the country statement (the one that actually uses cc_qid) was
        # appended unconditionally. Guard the country statement instead and
        # always push the name.
        if cc_qid is not None:
            statements.append( [self.wh.get_pid('country'), cc_qid, self.reference] )  # Set country
        statements.append( [self.wh.get_pid('name'), name, self.reference] )       # Set AS name

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid, statements)

        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
示例#2
0
class Crawler(object):
    def __init__(self):
        """Fetch QIDs for MANRS actions (create them if they are not in the 
        wikibase)."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # The four actions defined by the MANRS initiative
        self.actions = [
            {
                'label': 'MANRS Action 1: Filtering',
                'description': 'Prevent propagation of incorrect routing information'
            },
            {
                'label': 'MANRS Action 2: Anti-spoofing',
                'description': 'Prevent traffic with spoofed source IP addresses'
            },
            {
                'label': 'MANRS Action 3: Coordination',
                'description': 'Facilitate global operational communication and coordination'
            },
            {
                'label': 'MANRS Action 4: Global Validation',
                'description': 'Facilitate routing information on a global scale'
            },
        ]

        # Resolve (or create) the wikibase item of each action, remember its QID
        for action in self.actions:
            action['qid'] = self.wh.get_qid(
                action['label'],
                create={  # Create it if it doesn't exist
                    'summary': 'add MANRS actions',  # Commit message
                    'description': action['description']  # Item description
                })

        # Reference metadata attached to every statement pushed below
        today = self.wh.today()
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('MANRS')),
            (self.wh.get_pid('reference URL'), URL_MANRS),
            (self.wh.get_pid('point in time'), today),
        ]

    def run(self):
        """Fetch networks information from MANRS and push to wikibase. """

        req = requests.get(URL_MANRS)
        if req.status_code != 200:
            sys.exit('Error while fetching MANRS csv file')

        rows = req.text.splitlines()
        # First row is the CSV header; numbering therefore starts at 1
        for row_number, row in enumerate(rows[1:], start=1):
            self.update_net(row)
            sys.stderr.write(f'\rProcessed {row_number} organizations')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        _, areas, asns, act1, act2, act3, act4 = [
            col.strip() for col in one_line.split(',')
        ]

        # MANRS membership statement comes first
        statements = [[
            self.wh.get_pid('member of'),
            self.wh.get_qid('MANRS'),
            self.reference,
        ]]

        # One country statement per listed area
        for cc in areas.split(';'):
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(cc),
                self.reference,
            ])

        # One 'implements' statement per action marked 'Yes'
        for action, answer in zip(self.actions, (act1, act2, act3, act4)):
            if answer == 'Yes':
                statements.append([
                    self.wh.get_pid('implements'),
                    action['qid'],
                    self.reference,
                ])

        # Commit the same statements on every ASN of the organization
        for asn in asns.split(';'):
            if not asn:  # ignore organizations with no ASN
                continue
            # Get the AS QID (create if AS is not yet registered) and commit changes
            net_qid = self.wh.asn2qid(asn, create=True)
            self.wh.upsert_statements('update from MANRS membership',
                                      net_qid, statements)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy, the organization QID, the country list, and
        an HTTP session with retries."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self.org_qid = self.wh.get_qid(ORG)
        # ISO-3166 alpha-2 code -> country record
        self.countries = iso3166.countries_by_alpha2

        # Session object to fetch peeringdb data
        retries = Retry(total=15,
                        backoff_factor=0.2,
                        status_forcelist=[104, 500, 502, 503, 504])

        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch data from API and push to wikibase.

        For each country: download IHR's ranking, keep only the latest
        timebin, and push one ranking per weighting scheme
        ('Total eyeball' and 'Total AS')."""

        for cc, country in self.countries.items():
            # Query IHR
            self.url = URL_API.format(country=cc)
            req = self.http_session.get(self.url + '&format=json')
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)
            data = json.loads(req.text)
            ranking = data['results']

            # Setup references
            today = self.wh.today()
            self.references = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), today),
            ]

            # Setup qualifiers
            country_qid = self.wh.country2qid(country.name)
            if country_qid is not None:
                self.qualifiers = [(self.wh.get_pid('country'), country_qid)]
            else:
                self.qualifiers = []

            # Find the latest timebin in the data
            last_timebin = '1970-01-01'
            for r in ranking:
                if arrow.get(r['timebin']) > arrow.get(last_timebin):
                    last_timebin = r['timebin']

            # Make ranking and push data
            for metric, weight in [('Total eyeball', 'eyeball'),
                                   ('Total AS', 'as')]:

                # Get the QID of the selected country / create this country if needed
                self.countryrank_qid = self.wh.get_qid(
                    f'IHR country ranking: {metric} ({cc})',
                    create={  # Create it if it doesn't exist
                        'summary':
                        f'add IHR {metric} ranking for ' + cc,
                        'description':
                        f"IHR's ranking of networks ({metric}) for " +
                        country.name,
                        'statements':
                        [[self.wh.get_pid('managed by'), self.org_qid]]
                    })

                # Keep only entries with the right weighting scheme, not
                # transit-only, significant hegemony, and the latest timebin
                selected = [
                    r for r in ranking
                    if (r['weightscheme'] == weight
                        and not r['transitonly']  # idiomatic (was '== False')
                        and r['hege'] > MIN_HEGE
                        and r['timebin'] == last_timebin)
                ]

                # Make sure the ranking is sorted and add rank field
                selected.sort(key=lambda x: x['hege'], reverse=True)
                for i, asn in enumerate(selected):
                    asn['rank'] = i

                # Push data to wiki (map result is unused, only the side
                # effect of update_entry matters)
                for i, _ in enumerate(map(self.update_entry, selected)):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {i+1}/{len(selected)}'
                    )

                sys.stderr.write('\n')

    def update_entry(self, asn):
        """Add the network to wikibase if it's not already there and update its
        ranking property (rank expressed in the per-country ranking unit)."""

        # Properties
        statements = []

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.references, self.qualifiers
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from IHR country ranking',
                                  net_qid,
                                  statements,
                                  asynchronous=False)
示例#4
0
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.org_qid = self.wh.get_qid('RIPE NCC')
        self.url = URL_API  # url will change for each country
        self.reference = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), today)
                ]

    def get_last_line(self, line):
        """FTP LIST callback: keep only the last token (file or directory
        name) of the most recent line."""

        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """FTP LIST callback: collect the last token (file or directory name)
        of every line."""

        self.all_lines.append(line.rpartition(' ')[2])

    def run(self):
        """Fetch data from RIPE and push to wikibase.

        Walk the RIPE FTP tree to find today's 'roas.csv' files, download
        them over HTTP, aggregate the ROA entries per prefix, and push
        them to the wikibase."""

        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        logging.info('Connecting to the FTP server..')
        # Find latest roa files
        filepaths = []
        ftp = FTP(FTP_URL)
        ftp.login()
        ftp.cwd(FTP_ROOT)

        self.all_lines = []
        self.last_line = ''
        ftp.retrlines('LIST', callback=self.get_all_lines)

        logging.info('Listing directories...')
        logging.info(f'{self.all_lines}')
        for directory in self.all_lines:  # renamed from 'dir' (shadowed a builtin)
            path = FTP_ROOT + '/' + directory
            ftp.cwd(path)
            # Descend the date-directory chain until reaching a leaf file
            self.last_line = ''
            while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                ftp.cwd(self.last_line)
                path += self.last_line + '/'
                ftp.retrlines('LIST', callback=self.get_last_line)

            if self.last_line == 'roas.csv' and today in path:
                path += 'roas.csv'
                logging.info(f'Found ROA file: {path}')
                filepaths.append(path)

        for filepath in filepaths:
            self.url = URL_API + filepath
            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + filepath)

            # Aggregate data per prefix
            prefix_info = defaultdict(list)
            for line in req.text.splitlines():
                url, asn, prefix, max_length, start, end = line.split(',')

                # Skip header
                if url == 'URI':
                    continue

                prefix_info[prefix].append({
                    'url': url,
                    'asn': asn,
                    'max_length': max_length,
                    'start': start,
                    'end': end})

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(f'\rProcessing {filepath}... {i+1} prefixes ({prefix})     ')

    def update(self, prefix, attributes):
        """Add the prefix to wikibase if it's not already there and update its
        properties (one ROA statement per attribute set)."""

        statements = []
        for att in attributes:

            # Validity window of the ROA
            qualifiers = [
                    [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])],
                    [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])],
                    ]

            if att['max_length']:
                qualifiers.append( [self.wh.get_pid('maxLength'), {'amount': att['max_length']} ] )

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                # BUGFIX: this used to print the undefined name 'line'
                # (NameError on the error path); report the attributes instead.
                print('Error: ', att)
                return

            statements.append(
                        [ self.wh.get_pid('route origin authorization'),
                            asn_qid,
                            self.reference,
                            qualifiers
                        ]
                    )

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIPE RPKI data', prefix_qid, statements )
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB network ID class if 
        doesn't already exist. And fetch QIDs for networks already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # QID of the item that classifies PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description':
                'Identifier for a network in the PeeringDB database'  # Description
            })

        # Mappings from PeeringDB identifiers to wikibase QIDs
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)       # networks
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)   # organizations
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)     # IXPs

        # Reference metadata attached to every pushed statement
        today = self.wh.today()
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
            (self.wh.get_pid('reference URL'), URL_PDB_NETS),
            (self.wh.get_pid('point in time'), today),
        ]

        # HTTP session with retries for transient PeeringDB failures
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])
        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        total = len(networks)
        for count, network in enumerate(networks, start=1):
            self.update_net(network)
            sys.stderr.write(f'\rProcessing... {count}/{total}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        # The network name is always pushed
        statements = [[
            self.wh.get_pid('name'), network['name'].strip(), self.reference
        ]]

        # Link the network to its PeeringDB organization, when known
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is None:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])
        else:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])

        # Website, when provided
        if network['website']:
            statements.append(
                [self.wh.get_pid('website'), network['website'], self.reference])

        # Fetch IX membership details for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'
        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')

        details = json.loads(req.text)['data']
        if len(details) > 1:
            print(details)
        details = details[0]

        # Membership statements carry their own reference (per-network URL)
        today = self.wh.today()
        netixlan_ref = [
            (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
            (self.wh.get_pid('reference URL'), netixlan_url),
            (self.wh.get_pid('point in time'), today),
        ]

        for ixlan in details['netixlan_set']:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            statements.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Upsert name, website, organization, and memberships on the item
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network.
        If this network is not yet registered in the wikibase then find (or 
        create) the item corresponding to the network ASN and register 
        the peeringDB network ID with this item.

        Return the network QID."""

        key = str(network['id'])
        if key not in self.netid2qid:
            # Unknown network: attach the PeeringDB ID to the item of the
            # network's ASN (created on the fly if needed)
            qid = self.wh.asn2qid(network['asn'], create=True)
            qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'), key, [], qualifiers
            ]]

            self.wh.upsert_statements('add new peeringDB network',
                                      qid,
                                      statements=statements)
            # Cache the QID for later lookups
            self.netid2qid[key] = qid

        return self.netid2qid[key]
示例#6
0
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy, resolve the CAIDA/ASRank items, and build the
        reference metadata attached to every pushed statement."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Get the QID for ASRank project
        self.asrank_qid = self.wh.get_qid(
            'CAIDA ASRank',
            create={  # Create it if it doesn't exist
                'summary': 'add CAIDA ASRank',  # Commit message
                'description':
                "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",  # Item description
                'statements': [[self.wh.get_pid('managed by'), self.caida_qid]]
            })

        self.reference = [(self.wh.get_pid('source'), self.caida_qid),
                          (self.wh.get_pid('reference URL'), URL_API),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from ASRank and push to wikibase.

        Pages through the API (the offset is the number of networks already
        processed) and updates networks concurrently with a thread pool."""

        self.wh.login()  # Login once for all threads
        # BUGFIX: the pool is now a context manager so worker threads are
        # joined even when sys.exit() aborts the loop on an HTTP error
        # (previously pool.shutdown() was skipped in that case).
        with ThreadPoolExecutor() as pool:
            has_next = True
            i = 0
            while has_next:
                req = requests.get(URL_API + f'?offset={i}')
                if req.status_code != 200:
                    sys.exit('Error while fetching data from API')

                ranking = json.loads(req.text)['data']['asns']
                has_next = ranking['pageInfo']['hasNextPage']

                # Only the side effect of update_net matters; iterating the
                # map also surfaces worker exceptions
                for _ in pool.map(self.update_net, ranking['edges']):
                    sys.stderr.write(
                        f'\rProcessing... {i+1}/{ranking["totalCount"]}')
                    i += 1

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties (name, country, rank)."""

        asn = asn['node']

        # Properties
        statements = []

        # Set the AS name when ASRank provides one
        if asn['asnName']:
            statements.append(
                [self.wh.get_pid('name'), asn['asnName'], self.reference])

        # Set the registration country when known
        cc = asn['country']['iso']
        if cc:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(cc), self.reference
            ])

        # Set the rank, expressed in the ASRank unit
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.asrank_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Set up wikihandy and the metadata reused for every pushed
        statement."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Routeviews organization
        self.org_qid = self.wh.get_qid('Route Views')
        self.today = self.wh.today()

    def run(self):
        """Fetch BGP data from collectors and push to wikibase. """

        midnight = arrow.now().replace(hour=0, minute=0)
        stream = pybgpstream.BGPStream(
            from_time=int(midnight.shift(hours=-1).timestamp()),
            until_time=int(midnight.shift(hours=1).timestamp()),
            record_type="ribs",
        )

        rtree = radix.Radix()

        sys.stderr.write('\nReading BGP data:\n')
        for i, elem in enumerate(stream):
            # Prefix and origin ASN of the announcement
            fields = elem.fields
            prefix = fields['prefix']
            last_hop = fields['as-path'].split(' ')[-1]
            # An AS set '{a,b,c}' yields several candidate origins
            if '{' in last_hop:
                origin_asns = last_hop[1:-1].split(',')
            else:
                origin_asns = [last_hop]

            # Record, per prefix, which collectors saw which origin ASN
            rnode = rtree.search_exact(prefix)
            if rnode is None:
                rnode = rtree.add(prefix)
                rnode.data['origin'] = defaultdict(set)

            for asn in origin_asns:
                rnode.data['origin'][asn].add(elem.collector)
                sys.stderr.write(f'\rProcessed {i+1} BGP messages')

        sys.stderr.write('\nPushing data to IYP...\n')

        # Push all prefixes data to IYP
        for i, rnode in enumerate(rtree):
            self.update_entry(rnode.prefix, rnode.data['origin'])
            sys.stderr.write(f'\rProcessed {i+1} prefixes')

    def update_entry(self, prefix, originasn_collector):
        """Add the prefix to wikibase if it's not already there and update its properties."""

        statements = []

        # One 'originated by' statement per (origin ASN, collector) pair
        for asn, collectors in originasn_collector.items():
            for collector in collectors:
                # RIS collectors are named 'rrcXX'; everything else is
                # a Routeviews collector
                url = URL_RIS if 'rrc' in collector else URL_RV

                self.reference = [
                    (self.wh.get_pid('source'), self.org_qid),
                    (self.wh.get_pid('reference URL'), url.format(collector)),
                    (self.wh.get_pid('point in time'), self.today)
                ]

                as_qid = self.wh.asn2qid(asn, create=True)
                statements.append(
                    [self.wh.get_pid('originated by'), as_qid, self.reference])

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIS/Routeviews RIBs',
                                  prefix_qid, statements)
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        self.reference = [(self.wh.get_pid('source'), self.apnic_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

        # ISO-3166 alpha-2 code -> country record
        self.countries = iso3166.countries_by_alpha2

    def run(self):
        """Fetch data from APNIC and push to wikibase.

        For each country: create (if needed) the per-country ranking and
        population items, download APNIC's estimates, rank ASes by eyeball
        percentage, and push them with a thread pool."""

        self.wh.login()  # Login once for all threads
        # BUGFIX: the pool is now a context manager so worker threads are
        # joined even when sys.exit() aborts the loop on an HTTP error
        # (previously pool.shutdown() was skipped in that case).
        with ThreadPoolExecutor() as pool:
            for cc, country in self.countries.items():

                # Get the QID of the selected country / create this country if needed
                self.countryrank_qid = self.wh.get_qid(
                    f'APNIC eyeball estimates ({cc})',
                    create={  # Create it if it doesn't exist
                        'summary':
                        'add APNIC eyeball estimates for ' + cc,
                        'description':
                        # BUGFIX: a space was missing between 'estimates'
                        # and 'based'
                        "APNIC's AS population estimates " +
                        "based on advertisement for " + country.name,
                        'statements': [
                            [self.wh.get_pid('managed by'), self.apnic_qid],
                            [self.wh.get_pid('website'), URL_API],
                            [self.wh.get_pid('country'),
                             self.wh.country2qid(cc)],
                        ]
                    })

                self.countrypercent_qid = self.wh.get_qid(
                    f'% of Internet users in {country.name}',
                    create={  # Create it if it doesn't exist
                        'summary':
                        'add APNIC eyeball estimates for ' + cc,
                        'description':
                        "APNIC's AS population estimates " +
                        "based on advertisement for " + country.name,
                        'statements': [
                            [self.wh.get_pid('managed by'), self.apnic_qid],
                            [self.wh.get_pid('website'), URL_API],
                            [self.wh.get_pid('country'),
                             self.wh.country2qid(cc)],
                        ]
                    })

                self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
                req = requests.get(self.url)
                if req.status_code != 200:
                    sys.exit('Error while fetching data for ' + cc)

                ranking = json.loads(req.text)
                # Make sure the ranking is sorted and add rank field
                ranking.sort(key=lambda x: x['percent'], reverse=True)
                for i, asn in enumerate(ranking):
                    asn['rank'] = i

                # Push data to wiki
                for i, _ in enumerate(pool.map(self.update_net, ranking)):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {i+1}/{len(ranking)}')

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties (name, country, rank, eyeball population)."""

        # Properties
        statements = []

        # set name
        if asn['autnum']:
            statements.append(
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country
        if asn['cc']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(asn['cc']), self.reference
            ])

        # set rank (unit: per-country ranking item)
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.reference
        ])

        # set population (unit: per-country percentage item)
        statements.append([
            self.wh.get_pid('population'), {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Create (or fetch) the wikibase items needed to register RIPE
        Atlas probes: the Atlas platform item, the probe and anchor class
        items, and the probe-ID external identifier property."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases':
                'Atlas|atlas',
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        # Qualifiers marking whether a statement concerns IPv4 or IPv6
        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase. """

        next_page = URL

        # Follow the API's pagination until there is no next page
        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the probe list')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):

                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write('\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            # NOTE(review): only anchors get 'instance of' statements here;
            # plain probes get theirs only when the item is first created in
            # probe_qid(). Confirm this asymmetry is intended.
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])
        # Link the probe to its IPv4/IPv6 ASes (only if the AS is already
        # registered: asn2qid is called without create=True)
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        # Link the probe to its IPv4/IPv6 prefixes, if registered
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QID for the probe status (e.g. Connected, Abandoned)
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandoned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the QID for the given probe ID.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        # 'probe_id' instead of 'id' to avoid shadowing the builtin
        probe_id = str(probe['id'])

        # Check if the probe is in the wikibase
        if probe_id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), probe_id, [],
                 probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{probe_id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[probe_id] = probe_qid

        return self.probeid2qid[probe_id]
示例#10
0
class Crawler(object):
    def __init__(self):
        """Create (or fetch) the wikibase items for the Spamhaus
        organization and its DROP / ASN-DROP lists, and prepare the
        reference metadata attached to every pushed statement."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Spamhaus organization
        self.spamhaus_qid = self.wh.get_qid(
            'Spamhaus',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus organization',  # Commit message
                'description':
                'The Spamhaus Project is an international organisation to track email spammers and spam-related activity',  # Item description
                'aliases':
                'The Spamhaus Project|the spamhaus project',
                'statements': [[
                    self.wh.get_pid('instance of'),
                    self.wh.get_qid('organization')
                ]]
            })

        # Get the QID for Spamhaus DROP project
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP lists',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description':
                "The Spamhaus Don't Route Or Peer Lists",  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
            })

        # Get the QID for Spamhaus ASN-DROP list
        self.asn_drop_qid = self.wh.get_qid(
            'Spamhaus ASN-DROP list',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus block list',  # Commit message
                'description':
                'ASN-DROP contains a list of Autonomous System Numbers controlled by spammers or cyber criminals, as well as "hijacked" ASNs. ',  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                 [self.wh.get_pid('part of'), self.drop_qid]]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch blocklist from Spamhaus and push to wikibase. """

        req = requests.get(URL)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        # Count only data rows, so the progress counter does not include
        # skipped header lines
        processed = 0
        for row in req.text.splitlines():
            # Skip the header (comment lines start with ';')
            if row.startswith(';'):
                continue

            self.update_net(row)
            processed += 1
            sys.stderr.write(f'\rProcessed {processed} ASes')
        sys.stderr.write('\n')

    @staticmethod
    def _parse_line(one_line):
        """Parse one ASN-DROP entry, e.g. 'AS12345 ; CC | name'.

        Return an (asn, cc, name) tuple with asn as an int."""
        asn, _, cc_name = one_line.partition(';')
        # Drop the 'AS' prefix; int() tolerates surrounding whitespace
        asn = int(asn[2:])
        cc, name = (word.strip() for word in cc_name.split('|'))
        return asn, cc, name

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        asn, cc, name = self._parse_line(one_line)

        # Properties for this AS
        statements = [
            [
                self.wh.get_pid('reported in'), self.asn_drop_qid,
                self.reference
            ],
            [self.wh.get_pid('name'), name, self.reference],
        ]

        # Set the country only if the field looks like an ISO-3166 alpha-2 code
        if len(cc) == 2:
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn, create=True)
        self.wh.upsert_statements('update from Spamhaus ASN DROP list',
                                  net_qid, statements)