Пример #1
0
    def parse(self, response):
        """Assemble one ``requests_for_permissions`` row for a page and store it.

        Revision data is requested from ``api.revisions`` for the page name
        obtained by stripping the ``https://www.wikidata.org/wiki/`` prefix
        from ``response.url``. The remaining fields come from the ``get_*``
        helpers on this object, from ``self.xpath`` and from the selectors in
        ``self.XPATH['symbols']``. The finished row is passed to ``db.insert``.

        Args:
            response: The fetched page; must expose ``url``, ``css()`` and
                ``xpath()`` (presumably a Scrapy response -- confirm against
                the spider base class).
        """
        page_name = response.url.replace('https://www.wikidata.org/wiki/', '')
        revisions = api.revisions([page_name])

        bot_link = self.get_bot_url(response)
        operator_link = self.get_operator_url(response)

        row = {
            'url': unquote(unescape(response.url.replace('https://', ''))),
            'bot_url': bot_link,
            'bot_name': self.get_bot_name(response),
            'bot_has_red_link': int(
                bool(striper.RED_LINK_RE.match(bot_link))),
            'operator_url': operator_link,
            'operator_name': self.get_operator_name(response),
            'operator_has_red_link': int(
                bool(striper.RED_LINK_RE.match(operator_link))),
            'is_successful': int(
                response.url in self.json_data['successful_requests']),
            'first_edit': self.get_first_edit(revisions),
            'last_edit': self.get_last_edit(revisions),
            'closed_at': self.get_closed_at(response),
            'revision_count': self.get_revision_count(revisions),
            'editor_count': self.get_editor_count(revisions),
            'html': response.css('div#bodyContent').extract_first(),
            # Placeholders reserve the position of these keys in the row;
            # the real values are filled in right after the literal.
            'task': None,
            'code': None,
            'function': None,
            'archive_comment': self.get_archive_comment(response.url),
            'summary': None,
            'retrieved_at': datetime.datetime.utcnow().strftime(
                "%Y-%m-%d %H:%M:%S"),
        }

        # Fields looked up by name through self.xpath.
        for field in ('task', 'code', 'function', 'summary'):
            row[field] = self.xpath(field, response)

        # One "<name>_symbol_count" entry per configured symbol selector.
        row.update({
            name + '_symbol_count': len(response.xpath(selector))
            for name, selector in self.XPATH['symbols'].items()
        })

        db.insert('requests_for_permissions', row)
Пример #2
0
    def create(cls):
        """Fill the ``bots`` table from the bot-name CSV files.

        Bot names are taken from the first row of every file in ``cls.FILES``,
        de-duplicated, and looked up through ``api.users`` in batches. Every
        returned user that is not yet present in the ``bots`` table (checked
        by name via ``db.exists``) is inserted with:

        * ``retrieved_at`` -- current UTC time, ``YYYY-MM-DD HH:MM:SS``;
        * ``has_botflag`` -- 1 if the name is listed in
          ``bots_with_botflag.csv``, 0 if listed in
          ``bots_without_botflag.csv``, otherwise ``None``;
        * ``is_extension_bot`` -- 1 if listed in ``extension_bots.csv``,
          else 0.

        Users flagged ``invalid`` or ``missing`` by the API are stored with
        only those fields plus ``name``. For all other users the list-valued
        fields are comma-joined, ``blockinfo`` is dropped, and absent block
        fields are stored as ``None``.

        Raises:
            OSError: If one of the CSV files cannot be opened.
            KeyError: If an API user entry lacks ``groups``,
                ``implicitgroups`` or ``rights``.
        """

        def read_first_row(path):
            # Only the first CSV row is used. An empty file yields no names
            # rather than raising IndexError.
            with open(path, newline='') as f:
                return next(csv.reader(f), [])

        names = []
        for path in cls.FILES:
            names += read_first_row(path)

        # Sets give constant-time membership tests in the per-bot loop below.
        bots_with_botflag = set(
            read_first_row('data/spiders/bots_with_botflag.csv'))
        bots_without_botflag = set(
            read_first_row('data/spiders/bots_without_botflag.csv'))
        extension_bots = set(
            read_first_row('data/spiders/extension_bots.csv'))

        # De-duplicate once, keeping first-seen order so batches are stable.
        names = list(dict.fromkeys(names))

        # NOTE(review): 50 is presumably the API's per-request user limit --
        # confirm. Stepping by the batch size never produces an empty batch,
        # so no request is issued for zero names.
        batch_size = 50
        batches = [
            names[start:start + batch_size]
            for start in range(0, len(names), batch_size)
        ]

        block_fields = ('blockid', 'blockedby', 'blockedbyid',
                        'blockedtimestamp', 'blockreason', 'blockexpiry')

        for batch in batches:

            for bot in api.users(batch)['query']['users']:

                name = bot['name']

                if db.exists('bots', 'name', name):
                    continue

                bot['retrieved_at'] = datetime.datetime.utcnow().strftime(
                    "%Y-%m-%d %H:%M:%S")

                if name in bots_with_botflag:
                    bot['has_botflag'] = 1
                elif name in bots_without_botflag:
                    bot['has_botflag'] = 0
                else:
                    bot['has_botflag'] = None

                bot['is_extension_bot'] = 1 if name in extension_bots else 0

                if 'invalid' in bot or 'missing' in bot:
                    # Unknown accounts carry no profile data; keep only the
                    # locally derived fields.
                    db.insert('bots', {
                        'name': name,
                        'retrieved_at': bot['retrieved_at'],
                        'has_botflag': bot['has_botflag'],
                        'is_extension_bot': bot['is_extension_bot']
                    })
                    continue

                bot.pop('blockinfo', None)

                # List-valued fields are stored as comma-separated strings.
                for list_field in ('groups', 'implicitgroups', 'rights'):
                    bot[list_field] = ','.join(bot[list_field])

                # Make sure every block field is present, None when absent.
                for block_field in block_fields:
                    bot.setdefault(block_field, None)

                db.insert('bots', bot)


# BotsTableCreator.create()