def parse(self, response):
    """Build one ``requests_for_permissions`` row from a page and store it.

    Revision data for the page is fetched through ``api.revisions``; the
    remaining fields come from the ``get_*`` helper methods, from
    ``self.xpath`` and from the symbol expressions in
    ``self.XPATH['symbols']``.  The finished row is handed to
    ``db.insert``; nothing is returned.

    :param response: fetched page; must provide ``url``, ``css()`` and
        ``xpath()`` (presumably a Scrapy response -- confirm).
    """
    page_title = response.url.replace('https://www.wikidata.org/wiki/', '')
    revisions = api.revisions([page_title])

    bot_link = self.get_bot_url(response)
    operator_link = self.get_operator_url(response)

    record = {
        'url': unquote(unescape(response.url.replace('https://', ''))),
        'bot_url': bot_link,
        'bot_name': self.get_bot_name(response),
        'bot_has_red_link': int(bool(striper.RED_LINK_RE.match(bot_link))),
        'operator_url': operator_link,
        'operator_name': self.get_operator_name(response),
        'operator_has_red_link': int(
            bool(striper.RED_LINK_RE.match(operator_link))),
        'is_successful': int(
            response.url in self.json_data['successful_requests']),
        'first_edit': self.get_first_edit(revisions),
        'last_edit': self.get_last_edit(revisions),
        'closed_at': self.get_closed_at(response),
        'revision_count': self.get_revision_count(revisions),
        'editor_count': self.get_editor_count(revisions),
        'html': response.css('div#bodyContent').extract_first(),
        # Placeholders keep these keys at this position in the row; the
        # real values are filled in right after the literal.
        'task': None,
        'code': None,
        'function': None,
        'archive_comment': self.get_archive_comment(response.url),
        'summary': None,
        'retrieved_at': datetime.datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S"),
    }

    # Replace the placeholders with the values extracted via self.xpath.
    record.update({
        field: self.xpath(field, response)
        for field in ('task', 'code', 'function', 'summary')
    })

    # One "<name>_symbol_count" column per configured symbol expression.
    record.update({
        name + '_symbol_count': len(response.xpath(expression))
        for name, expression in self.XPATH['symbols'].items()
    })

    db.insert('requests_for_permissions', record)
def create(cls):
    """Fill the ``bots`` table from the CSV bot lists and ``api.users``.

    Bot names are taken from the first row of every file in ``cls.FILES``.
    Three further single-row CSV files say which names have a bot flag,
    which explicitly do not, and which are extension bots.  The names are
    de-duplicated, looked up through ``api.users`` in batches, and every
    user not already present in ``bots`` (per ``db.exists``) is inserted
    with ``db.insert``.  Nothing is returned.

    Users reported as ``invalid`` or ``missing`` are stored with only
    their name, retrieval time and the two flag columns.

    :raises OSError: if one of the CSV files cannot be opened.
    """

    def read_first_row(path):
        """Return the first CSV row of *path*, or ``[]`` if it is empty."""
        # newline='' is what the csv module expects of its input files.
        with open(path, newline='') as handle:
            return next(csv.reader(handle), [])

    bots = []
    for path in cls.FILES:
        bots += read_first_row(path)

    # Sets make the per-bot membership tests below O(1).
    bots_with_botflag = set(
        read_first_row('data/spiders/bots_with_botflag.csv'))
    bots_without_botflag = set(
        read_first_row('data/spiders/bots_without_botflag.csv'))
    extension_bots = set(
        read_first_row('data/spiders/extension_bots.csv'))

    # De-duplicate once, keeping first-seen order so runs are reproducible.
    unique_bots = list(dict.fromkeys(bots))

    # NOTE(review): 50 is presumably the API's per-request user limit --
    # confirm before changing.
    batch_size = 50

    # Stepping by batch_size never yields an empty batch, so no request is
    # made when there are no names or the count is a multiple of 50.
    for start in range(0, len(unique_bots), batch_size):
        batch = unique_bots[start:start + batch_size]
        for bot in api.users(batch)['query']['users']:
            name = bot['name']
            if db.exists('bots', 'name', name):
                continue

            bot['retrieved_at'] = datetime.datetime.utcnow().strftime(
                "%Y-%m-%d %H:%M:%S")

            # Tri-state: 1 = flagged, 0 = known unflagged, None = unknown.
            if name in bots_with_botflag:
                bot['has_botflag'] = 1
            elif name in bots_without_botflag:
                bot['has_botflag'] = 0
            else:
                bot['has_botflag'] = None
            bot['is_extension_bot'] = 1 if name in extension_bots else 0

            if 'invalid' in bot or 'missing' in bot:
                # No account details to store: keep only our own columns.
                db.insert('bots', {
                    'name': name,
                    'retrieved_at': bot['retrieved_at'],
                    'has_botflag': bot['has_botflag'],
                    'is_extension_bot': bot['is_extension_bot'],
                })
                continue

            bot.pop('blockinfo', None)

            # List-valued fields are stored as comma-separated strings.
            for key in ('groups', 'implicitgroups', 'rights'):
                bot[key] = ','.join(bot[key])

            # Block fields only appear for blocked users; default the rest
            # to None so every inserted row has the same keys.
            for key in ('blockid', 'blockedby', 'blockedbyid',
                        'blockedtimestamp', 'blockreason', 'blockexpiry'):
                bot.setdefault(key, None)

            db.insert('bots', bot)


# BotsTableCreator.create()