示例#1
0
class Crawler(object):
    """Push the AS names and country codes listed in the RIPE AS name file
    to the wikibase."""

    def __init__(self):
        """Create the wiki helper and the reference attached to every pushed
        statement."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today())
            ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one.

        Exits the program if the file cannot be downloaded."""

        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login() # Login once for all threads, not needed with OAuth

        # Blank lines carry no ASN, so they are not handed to update_asn
        lines = (line for line in req.text.splitlines() if line.strip())

        # Count from 1 so the message reports the number of ASes already done
        for done, line in enumerate(lines, start=1):
            self.update_asn(line)
            sys.stderr.write(f'\rProcessed {done} ASes')

    def update_asn(self, one_line):
        """Push the name and country found in one line of the AS name file.

        one_line: text parsed as '<ASN> <name>, <country code>'.

        Errors raised while pushing the statements are printed and do not
        stop the crawler.

        Return the QID of the AS."""

        # Parse given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, _, cc = name_cc.rpartition(', ')

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True)

        # The AS name is always set
        statements = [[self.wh.get_pid('name'), name, self.reference]]
        # The country is set only when it could be resolved to a QID
        if cc_qid is not None:
            statements.append(
                [self.wh.get_pid('country'), cc_qid, self.reference])

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid, statements)

        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
示例#2
0
class Crawler(object):
    """Push MANRS membership and the MANRS actions implemented by each
    network to the wikibase."""

    def __init__(self):
        """Resolve the QIDs of the four MANRS actions (the items are created
        when they are missing) and prepare the reference of pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # (label, description) of the actions defined by MANRS
        action_specs = (
            ('MANRS Action 1: Filtering',
             'Prevent propagation of incorrect routing information'),
            ('MANRS Action 2: Anti-spoofing',
             'Prevent traffic with spoofed source IP addresses'),
            ('MANRS Action 3: Coordination',
             'Facilitate global operational communication and coordination'),
            ('MANRS Action 4: Global Validation',
             'Facilitate routing information on a global scale'),
        )
        self.actions = [{
            'label': label,
            'description': description
        } for label, description in action_specs]

        # Attach to each action the QID of the item representing it
        for action in self.actions:
            creation = {
                'summary': 'add MANRS actions',  # Commit message
                'description': action['description'],  # Item description
            }
            action['qid'] = self.wh.get_qid(action['label'], create=creation)

        # Reference attached to every statement pushed by this crawler
        now = self.wh.today()
        source = (self.wh.get_pid('source'), self.wh.get_qid('MANRS'))
        ref_url = (self.wh.get_pid('reference URL'), URL_MANRS)
        point_in_time = (self.wh.get_pid('point in time'), now)
        self.reference = [source, ref_url, point_in_time]

    def run(self):
        """Download the MANRS csv file and push every row except the header.

        Exits the program if the file cannot be downloaded."""

        response = requests.get(URL_MANRS)
        if response.status_code != 200:
            sys.exit('Error while fetching MANRS csv file')

        rows = response.text.splitlines()
        # rows[0] is the header, numbering of organizations starts at 1
        for number, row in enumerate(rows[1:], start=1):
            self.update_net(row)
            sys.stderr.write(f'\rProcessed {number} organizations')

    def update_net(self, one_line):
        """Push the statements described by one csv row to every AS listed in
        that row (ASes are created when they are not yet registered).

        one_line: comma-separated row with seven columns; the second holds
        ';'-separated countries, the third ';'-separated ASNs and the last
        four hold 'Yes' for the implemented actions."""

        fields = tuple(column.strip() for column in one_line.split(','))
        _, areas, asns, act1, act2, act3, act4 = fields

        # Membership
        statements = [[
            self.wh.get_pid('member of'),
            self.wh.get_qid('MANRS'),
            self.reference,
        ]]

        # Countries
        statements.extend([
            self.wh.get_pid('country'),
            self.wh.country2qid(country_code),
            self.reference,
        ] for country_code in areas.split(';'))

        # Implemented actions
        for position, flag in enumerate((act1, act2, act3, act4)):
            if flag != 'Yes':
                continue
            implemented = self.actions[position]['qid']
            statements.append(
                [self.wh.get_pid('implements'), implemented, self.reference])

        # Commit to wikibase, organizations with no ASN are ignored
        for asn in filter(None, asns.split(';')):
            # Get the AS QID (create if AS is not yet registered)
            net_qid = self.wh.asn2qid(asn, create=True)
            self.wh.upsert_statements('update from MANRS membership', net_qid,
                                      statements)
class Crawler(object):
    """Push per-country IHR network rankings to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and the HTTP session used to query the API."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Organization referenced as source and manager of the rankings
        self.org_qid = self.wh.get_qid(ORG)
        self.countries = iso3166.countries_by_alpha2

        # HTTP session that retries failed requests
        retry_policy = Retry(total=15,
                             backoff_factor=0.2,
                             status_forcelist=[104, 500, 502, 503, 504])
        session = requests.Session()
        session.mount('https://', HTTPAdapter(max_retries=retry_policy))
        self.http_session = session

    def run(self):
        """Fetch the ranking of every country from the API and push the
        entries of the latest timebin to the wikibase.

        Exits the program if the data of a country cannot be fetched."""

        for cc, country in self.countries.items():
            # Query IHR
            self.url = URL_API.format(country=cc)
            response = self.http_session.get(self.url + '&format=json')
            if response.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)
            ranking = json.loads(response.text)['results']

            # References for this country
            now = self.wh.today()
            self.references = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), now),
            ]

            # Qualifiers for this country (none if the country is unknown)
            country_qid = self.wh.country2qid(country.name)
            if country_qid is None:
                self.qualifiers = []
            else:
                self.qualifiers = [(self.wh.get_pid('country'), country_qid)]

            # Find the latest timebin in the data
            latest = '1970-01-01'
            for entry in ranking:
                if arrow.get(entry['timebin']) > arrow.get(latest):
                    latest = entry['timebin']

            # One ranking per weighting scheme
            schemes = (('Total eyeball', 'eyeball'), ('Total AS', 'as'))
            for metric, weight in schemes:

                # Item representing this ranking, created if needed
                creation = {
                    'summary':
                    f'add IHR {metric} ranking for ' + cc,
                    'description':
                    f"IHR's ranking of networks ({metric}) for " +
                    country.name,
                    'statements':
                    [[self.wh.get_pid('managed by'), self.org_qid]],
                }
                self.countryrank_qid = self.wh.get_qid(
                    f'IHR country ranking: {metric} ({cc})', create=creation)

                # Keep only the relevant entries, highest hegemony first
                selected = sorted(
                    (entry for entry in ranking
                     if entry['weightscheme'] == weight
                     and entry['transitonly'] == False
                     and entry['hege'] > MIN_HEGE
                     and entry['timebin'] == latest),
                    key=lambda entry: entry['hege'],
                    reverse=True)
                for position, entry in enumerate(selected):
                    entry['rank'] = position

                # Push data to wiki
                total = len(selected)
                for done, entry in enumerate(selected, start=1):
                    self.update_entry(entry)
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {done}/{total}')

                sys.stderr.write('\n')

    def update_entry(self, asn):
        """Push the rank of one network (the AS is created when it is not yet
        registered).

        asn: ranking entry, its 'asn' and 'rank' values are read."""

        # Rank expressed in the unit of the ranking being processed
        rank_value = {
            'amount': asn['rank'],
            'unit': self.countryrank_qid,
        }
        statements = [[
            self.wh.get_pid('ranking'), rank_value, self.references,
            self.qualifiers
        ]]

        # Commit to wikibase
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from IHR country ranking',
                                  net_qid,
                                  statements,
                                  asynchronous=False)
示例#4
0
class Crawler(object):
    """Push names, countries and ranks of networks from CAIDA ASRank to the
    wikibase."""

    def __init__(self):
        """Initialize wikihandy, the ASRank item and the reference of pushed
        data."""

        # Helper for wiki access
        self.wh = Wikihandy()

        now = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Item representing the ASRank project, created if it doesn't exist
        creation = {
            'summary': 'add CAIDA ASRank',  # Commit message
            'description':
            "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",  # Item description
            'statements': [[self.wh.get_pid('managed by'), self.caida_qid]]
        }
        self.asrank_qid = self.wh.get_qid('CAIDA ASRank', create=creation)

        # Added properties will have this additional information
        self.reference = [
            (self.wh.get_pid('source'), self.caida_qid),
            (self.wh.get_pid('reference URL'), URL_API),
            (self.wh.get_pid('point in time'), now),
        ]

    def run(self):
        """Walk through the pages of the ASRank API and push every network.

        Exits the program if a page cannot be fetched."""

        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()

        offset = 0  # number of networks processed so far
        more_pages = True
        while more_pages:
            response = requests.get(URL_API + f'?offset={offset}')
            if response.status_code != 200:
                sys.exit('Error while fetching data from API')

            page = json.loads(response.text)['data']['asns']
            more_pages = page['pageInfo']['hasNextPage']

            for _ in pool.map(self.update_net, page['edges']):
                offset += 1
                sys.stderr.write(
                    f'\rProcessing... {offset}/{page["totalCount"]}')

        pool.shutdown()

    def update_net(self, asn):
        """Push name, country and rank of one network (the AS is created when
        it is not yet registered).

        asn: API edge, the network data is read from its 'node' value."""

        node = asn['node']
        statements = []

        # Name, when there is one
        as_name = node['asnName']
        if as_name:
            statements.append(
                [self.wh.get_pid('name'), as_name, self.reference])

        # Country, when there is one
        country_code = node['country']['iso']
        if country_code:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(country_code),
                self.reference,
            ])

        # Rank, always set
        rank_value = {'amount': node['rank'], 'unit': self.asrank_qid}
        statements.append(
            [self.wh.get_pid('ranking'), rank_value, self.reference])

        # Commit to wikibase
        net_qid = self.wh.asn2qid(node['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)
class Crawler(object):
    """Push PeeringDB exchange points and their peering LANs to the
    wikibase."""

    def __init__(self):
        """Get the item representing PeeringDB exchange point IDs (created if
        missing), load the QIDs of known exchange points and organizations,
        and prepare the reference of pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Item representing PeeringDB IX IDs, created if it doesn't exist
        creation = {
            'summary':
            'add PeeringDB ix IDs',  # Commit message
            'description':
            'Identifier for an exchange point in the PeeringDB database'  # Description
        }
        ixid_qid = self.wh.get_qid(IXID_LABEL, create=creation)

        # QIDs of the ix already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # QIDs of the peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        source = (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB'))
        ref_url = (self.wh.get_pid('reference URL'), URL_PDB_IXS)
        point_in_time = (self.wh.get_pid('point in time'), self.today)
        self.reference = [source, ref_url, point_in_time]

    def run(self):
        """Fetch the list of ixs from PeeringDB, then the details of each ix,
        and push them to the wikibase.

        Exits the program if a request fails."""

        listing = requests.get(URL_PDB_IXS)
        if listing.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(listing.text)['data']

        self.wh.login()  # Login once for all threads

        total = len(ixs)
        for done, ix in enumerate(ixs, start=1):

            # Get more info for this IX
            detail = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if detail.status_code != 200:
                sys.exit('Error while fetching IXs data')

            # Update info in wiki
            self.update_ix(json.loads(detail.text)['data'][0])

            sys.stderr.write(f'\rProcessing... {done}/{total}')

    def update_ix(self, ix):
        """Push the properties of one ix and of the prefixes of its LANs (the
        ix and the prefixes are created when they are not yet registered).

        Exits the program if the data of a LAN cannot be fetched.

        Return the QID of the ix."""

        # Type and name
        statements = [
            [
                self.wh.get_pid('instance of'),
                self.wh.get_qid('Internet exchange point')
            ],
            [self.wh.get_pid('name'), ix['name'].strip(), self.reference],
        ]

        # Managing organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is None:
            print('Error this organization is not in wikibase: ', ix['org_id'])
        else:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])

        # Country, only if it is known to the wikibase
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # Traffic webpage, qualified as traffic statistics
        if ix['url_stats']:
            stats_qualifiers = [
                (self.wh.get_pid('instance of'),
                 self.wh.get_qid('traffic statistics')),
            ]
            statements.append([
                self.wh.get_pid('website'), ix['url_stats'], self.reference,
                stats_qualifiers
            ])

        # Update name, website, and organization for this IX
        ix_qid = self.ix_qid(ix)
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # Update the LANs of this IX, if any is listed
        for ixlan in ix.get('ixlan_set', ()):
            pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
            pfx_ref = [
                (self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                (self.wh.get_pid('reference URL'), pfx_url),
                (self.wh.get_pid('point in time'), self.today),
            ]

            response = requests.get(pfx_url)
            if response.status_code != 200:
                sys.exit('Error while fetching IXs data')

            for lan in json.loads(response.text)['data']:
                for prefix in lan['ixpfx_set']:
                    pfx_qid = self.wh.prefix2qid(prefix['prefix'],
                                                 create=True)

                    is_peering_lan = [
                        self.wh.get_pid('instance of'),
                        self.wh.get_qid('peering LAN'), pfx_ref
                    ]
                    managed_by_ix = [
                        self.wh.get_pid('managed by'), ix_qid, pfx_ref
                    ]

                    self.wh.upsert_statements('update peeringDB ixlan',
                                              pfx_qid,
                                              [is_peering_lan, managed_by_ix])

        return ix_qid

    def ix_qid(self, ix):
        """Find the QID of the given ix.
        If this ix is not yet registered in the wikibase then add it.

        Return the ix QID."""

        ix_id = str(ix['id'])

        # Nothing to add if the IX is already known
        if ix_id in self.ixid2qid:
            return self.ixid2qid[ix_id]

        # Properties of the new ix
        id_qualifiers = [
            (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
        ]
        new_statements = [
            (self.wh.get_pid('instance of'),
             self.wh.get_qid('Internet exchange point')),
            (self.wh.get_pid('external ID'), ix_id, [], id_qualifiers),
        ]

        # Add this ix to the wikibase and keep track of its QID
        self.ixid2qid[ix_id] = self.wh.add_item('add new peeringDB IX',
                                                label=ix['name'],
                                                description=ix['name_long'],
                                                statements=new_statements)

        return self.ixid2qid[ix_id]
class Crawler(object):
    """Push APNIC per-country AS population estimates (rank and percentage
    of users) to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        # NOTE(review): the reference is built once with URL_API; later
        # changes of self.url in run() are not reflected here. Confirm this
        # is intended.
        self.reference = [(self.wh.get_pid('source'), self.apnic_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

        self.countries = iso3166.countries_by_alpha2

    @staticmethod
    def _estimates_description(country_name):
        """Return the description of the items created for a country."""

        # The two parts used to be concatenated without a separating space
        return ("APNIC's AS population estimates "
                "based on advertisement for " + country_name)

    def _country_item_qid(self, label, cc, country_name):
        """Return the QID of the item named label, creating the item with the
        APNIC statements for country cc if it doesn't exist."""

        return self.wh.get_qid(
            label,
            create={  # Create it if it doesn't exist
                'summary':
                'add APNIC eyeball estimates for ' + cc,
                'description':
                self._estimates_description(country_name),
                'statements': [
                    [self.wh.get_pid('managed by'), self.apnic_qid],
                    [self.wh.get_pid('website'), URL_API],
                    [self.wh.get_pid('country'),
                     self.wh.country2qid(cc)],
                ]
            })

    def run(self):
        """Fetch data from APNIC and push to wikibase.

        Exits the program if the data of a country cannot be fetched."""

        self.wh.login()  # Login once for all threads

        # The with block releases the worker threads on every exit path,
        # including sys.exit below
        with ThreadPoolExecutor() as pool:

            for cc, country in self.countries.items():

                # Items used as units for the rank and percentage of this
                # country. Worker threads read these attributes; all results
                # of pool.map are consumed before the next country starts.
                self.countryrank_qid = self._country_item_qid(
                    f'APNIC eyeball estimates ({cc})', cc, country.name)
                self.countrypercent_qid = self._country_item_qid(
                    f'% of Internet users in {country.name}', cc,
                    country.name)

                self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
                req = requests.get(self.url)
                if req.status_code != 200:
                    sys.exit('Error while fetching data for ' + cc)

                ranking = json.loads(req.text)
                # Make sure the ranking is sorted and add rank field
                ranking.sort(key=lambda x: x['percent'], reverse=True)
                for i, asn in enumerate(ranking):
                    asn['rank'] = i

                # Push data to wiki
                for i, res in enumerate(pool.map(self.update_net, ranking)):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {i+1}/{len(ranking)}')

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties.

        asn: ranking entry, its 'autnum', 'cc', 'rank', 'percent' and 'as'
        values are read."""

        # Properties
        statements = []

        # set name
        if asn['autnum']:
            statements.append(
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country
        if asn['cc']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(asn['cc']), self.reference
            ])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.reference
        ])

        # set population
        statements.append([
            self.wh.get_pid('population'), {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid,
                                  statements)
class Crawler(object):
    """Push RIPE Atlas probes and their properties to the wikibase."""

    def __init__(self):
        """Get the items used to describe probes (created if missing), load
        the QIDs of probes already in the wikibase, and prepare the reference
        and qualifiers of pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases':
                'Atlas|atlas',
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase.

        The API is read page by page, following the 'next' link of each page
        until it is None. Exits the program if a page cannot be fetched."""

        next_page = URL
        processed = 0  # probes pushed so far, over all pages

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching RIPE Atlas probes')

            info = json.loads(req.text)
            next_page = info['next']

            for probe in info['results']:
                self.update_probe(probe)
                processed += 1
                sys.stderr.write(f'\rProcessed {processed} probes')

        sys.stderr.write('\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties.

        probe: description of one probe as found in the 'results' of the
        API."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])

        # ASes and prefixes are linked only when asn2qid/prefix2qid find them
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandoned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the probe QID for the given probe.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        # Named probe_id so the builtin id() is not shadowed
        probe_id = str(probe['id'])

        # Check if the probe is in the wikibase
        if probe_id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), probe_id, [],
                 probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item(
                'add new RIPE Atlas probe',
                label=f'RIPE Atlas probe #{probe_id}',
                description=probe['description'],
                statements=statements)
            # keep track of this QID
            self.probeid2qid[probe_id] = probe_qid

        return self.probeid2qid[probe_id]
示例#8
0
class Crawler(object):
    """Push the ASes reported in the Spamhaus ASN-DROP list to the
    wikibase."""

    def __init__(self):
        """Get the items representing Spamhaus and its DROP lists (created if
        missing) and prepare the reference of pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Spamhaus organization
        self.spamhaus_qid = self.wh.get_qid(
            'Spamhaus',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus organization',  # Commit message
                'description':
                'The Spamhaus Project is an international organisation to track email spammers and spam-related activity',  # Item description
                'aliases':
                'The Spamhaus Project|the spamhaus project',
                'statements': [[
                    self.wh.get_pid('instance of'),
                    self.wh.get_qid('organization')
                ]]
            })

        # Get the QID for Spamhaus DROP project
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP lists',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description':
                "The Spamhaus Don't Route Or Peer Lists",  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
            })

        # Get the QID for Spamhaus ASN-DROP list
        self.asn_drop_qid = self.wh.get_qid(
            'Spamhaus ASN-DROP list',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus block list',  # Commit message
                'description':
                'ASN-DROP contains a list of Autonomous System Numbers controlled by spammers or cyber criminals, as well as "hijacked" ASNs. ',  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                 [self.wh.get_pid('part of'), self.drop_qid]]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch blocklist from Spamhaus and push to wikibase.

        Exits the program if the list cannot be downloaded."""

        req = requests.get(URL)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        processed = 0  # counts pushed ASes only, not skipped lines
        for row in req.text.splitlines():
            # Skip the header/comment lines and blank lines
            if row.startswith(';') or not row.strip():
                continue

            self.update_net(row)
            processed += 1
            sys.stderr.write(f'\rProcessed {processed} ASes')
        sys.stderr.write('\n')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update its
        properties.

        one_line: text parsed as 'AS<number> ; <country code> | <name>'.

        Raise ValueError if the AS number is not an integer or if the line
        has no '|' separator."""

        asn, _, cc_name = one_line.partition(';')
        asn = int(asn[2:])  # drop the two-character prefix before the number
        # Split on the first '|' only: the name itself may contain '|'
        cc, name = [word.strip() for word in cc_name.split('|', 1)]

        # Properties for this AS
        statements = [
            [
                self.wh.get_pid('reported in'), self.asn_drop_qid,
                self.reference
            ],
            [self.wh.get_pid('name'), name, self.reference],
        ]

        # set countries
        if len(cc) == 2:
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn, create=True)
        self.wh.upsert_statements('update from Spamhaus ASN DROP list',
                                  net_qid, statements)