class GetCurrentTalkContent(GetCurrentContent):
    """
    The simple prefix behavior means this operation will only work on
    namespace 0 pages. I wouldn't rely on this operation being around
    for long.
    """
    input_field = MultiParam('titles', val_prefix='Talk:', key_prefix=False)
    examples = [OperationExample('This page does not exist'),
                OperationExample('Coffee')]


class GetPageHTML(Operation):
    input_field = SingleParam('title')
    examples = [OperationExample('Africa', limit=1)]
    output_type = Operation

    _limit = 1

    def __init__(self, *a, **kw):
        super(GetPageHTML, self).__init__(*a, **kw)
        self.web_client = getattr(self.client, 'web_client',
                                  DEFAULT_WEB_CLIENT)
        self.raise_exc = kw.pop('raise_exc', True)
        source_info = getattr(self.client, 'source_info', None)
        if source_info:
            # derive the wiki's base URL by stripping the main page
            # title from the main page's URL
            main_title = source_info.mainpage
            main_url = source_info.base
            self.base_url = main_url[:-len(main_title)]
        else:
            self.base_url = DEFAULT_BASE_URL
        self.url = self.base_url + self.input_param
        self.results = {}

    def process(self):
        try:
            resp = self.web_client.get(self.url)
        except Exception as e:
            self.exception = e
            if self.raise_exc:
                raise
            return self
        self.results[self.url] = resp.text
        raise NoMoreResults()


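# A minimal usage sketch for GetPageHTML. This is hedged: it relies only on
# names defined in this class (process(), results, url, NoMoreResults) and
# assumes the operation can be driven directly rather than through a client
# helper, which may not match the intended public API.
#
#   op = GetPageHTML('Africa')
#   try:
#       op.process()          # fetches base_url + title via the web client
#   except NoMoreResults:
#       pass                  # raised once the single page has been fetched
#   html = op.results.get(op.url)

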
class GetAllImageInfos(GetImageInfos):
    field_prefix = 'gai'
    input_field = None
    fields = [StaticParam('generator', 'allimages'),
              StaticParam('prop', 'imageinfo'),
              StaticParam('gaiprop', DEFAULT_IMAGE_PROPS)]
    examples = [OperationExample()]


class GetPageRevisionInfos(QueryOperation):
    """
    Fetch revisions for pages.
    """
    field_prefix = 'rv'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Coffee', 10)]

    def extract_results(self, query_resp):
        ret = []
        pages = [p for p in query_resp.get('pages', {}).values()
                 if 'missing' not in p]
        for pid_dict in pages:
            for rev in pid_dict.get('revisions', []):
                rev_dict = dict(pid_dict)
                rev_dict.update(rev)
                rev_info = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
                ret.append(rev_info)
        return ret


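# For reference, extract_results() above walks a 'pages' mapping and flattens
# each revision into a copy of its page's dict. A hedged sketch of the
# response shape it expects (trimmed to the keys actually read; the IDs are
# hypothetical):
#
#   {'pages': {'604727': {'pageid': 604727,
#                         'title': 'Coffee',
#                         'revisions': [{'revid': 539916351, ...},
#                                       {'revid': 538903663, ...}]}}}

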
class GetExternalLinks(QueryOperation):
    """
    Fetch a page's outgoing links to URLs outside of the source wiki.
    """
    field_prefix = 'el'
    input_field = SingleParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'extlinks')]
    output_type = [ExternalLink]
    examples = [OperationExample('Croatian War of Independence')]

    def extract_results(self, query_resp):
        ret = []
        for pid_dict in query_resp.get('pages', {}).values():
            for el in pid_dict.get('extlinks', []):
                cur_dict = dict(pid_dict)
                cur_dict['source'] = self.source
                cur_dict['url'] = el.get('*')
                link = ExternalLink.from_query(cur_dict)
                ret.append(link)
        return ret

    def prepare_params(self, **kw):
        params = super(GetExternalLinks, self).prepare_params(**kw)
        # the extlinks module continues with 'eloffset', not 'elcontinue'
        if params.get('elcontinue'):
            params['eloffset'] = params.pop('elcontinue')
        return params


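# For reference, a hedged sketch of the payload extract_results() consumes
# (only the keys it reads are shown; the page ID and URLs are hypothetical):
#
#   {'pages': {'5573170': {'pageid': 5573170,
#                          'title': 'Croatian War of Independence',
#                          'extlinks': [{'*': 'http://example.com/a'},
#                                       {'*': 'http://example.com/b'}]}}}

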
class GetProtections(QueryOperation):
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'info'),
              StaticParam('inprop', 'protection')]
    output_type = ProtectionInfo
    examples = [OperationExample('Coffee'),
                OperationExample('Coffee|House'),
                OperationExample(['Coffee', 'House'])]

    def extract_results(self, query_resp):
        ret = []
        for page_id, page in query_resp['pages'].iteritems():
            ret.append(ProtectionInfo(page['protection']))
        return ret


class GetFlattenedCategory(Operation):
    """
    Fetch all of a category's sub-categories, recursively.
    """
    subop_chain = [Tune(Recursive(GetSubcategoryInfos),
                        priority='subcat_count')]

    examples = [OperationExample('Africa', 100)]


class GetCategoryRecursive(Operation):
    """
    Fetch all the members of a category and its sub-categories. A
    Wikipedia category tree can have a large number of shallow
    categories, so this operation will prioritize the larger
    categories by default.
    """
    subop_chain = (GetFlattenedCategory,
                   Tune(GetCategory, priority='total_count'))
    examples = [OperationExample('Africa', 100),
                OperationExample('Lists of slang', 10)]

    def __init__(self, input_param, *a, **kw):
        super(GetCategoryRecursive, self).__init__(input_param, *a, **kw)
        # seed the subop queue with the root category itself, at top priority
        root_cat_op = GetCategory(input_param, client=self.client)
        self.subop_queues[-1].op_queue.add(root_cat_op, 10 ** 6)


class GetRevisionContent(GetCurrentContent):
    input_field = SingleParam('revids', key_prefix=False, attr='rev_id')
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS + '|content'),
              SingleParam('parse', False)]
    examples = [OperationExample('539916351')]


class GetCategoryArticlesRecursive(Operation):
    """
    Fetch all pages (namespaces 0 and 1) in a category and its
    sub-categories.
    """
    subop_chain = (GetFlattenedCategory,
                   Tune(GetCategoryArticles, priority='page_count'))
    examples = [OperationExample('Africa', 100),
                OperationExample('Lists of slang', 10)]

    def __init__(self, input_param, *a, **kw):
        cls = GetCategoryArticlesRecursive
        super(cls, self).__init__(input_param, *a, **kw)
        root_cat_op = GetCategoryArticles(input_param, client=self.client)
        self.subop_queues[-1].op_queue.add(root_cat_op, 10 ** 6)


class GetAllCategoryInfos(GetSubcategoryInfos):
    """
    Fetch all categories on the source wiki.
    """
    field_prefix = 'gac'
    input_field = None
    fields = [StaticParam('generator', 'allcategories'),
              StaticParam('prop', 'categoryinfo')]
    examples = [OperationExample(doc='basic allcats')]


class GetLinks(QueryOperation):
    """
    Fetch a page's outgoing links to other pages on the source wiki.
    """
    field_prefix = 'gpl'
    input_field = SingleParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'links'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              MultiParam('namespace')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee'),
                OperationExample('Aabach')]

    def extract_results(self, query_resp):
        ret = []
        for pid, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret


class GetRevisionInfos(GetPageRevisionInfos):
    """
    Fetch information about specific revisions by ID.
    """
    input_field = MultiParam('revids', attr='rev_id', key_prefix=False)
    output_type = RevisionInfo
    examples = [OperationExample(['538903663', '539916351', '531458383'])]

    def prepare_params(self, *a, **kw):
        ret = super(GetRevisionInfos, self).prepare_params()
        # a limit parameter isn't valid alongside explicit revids
        ret.pop(self.field_prefix + 'limit', None)
        return ret


class GetParsedTranscludes(Operation):
    """
    Template names may redirect, but this operation doesn't handle
    that yet.
    """
    subop_chain = [GetTranscludes,
                   GetCurrentContent,
                   GetParsedTemplates]
    examples = [OperationExample('ArticleHistory', 10)]

    def _update_results(self, results):
        _, _, tmpl_name = self.input_param.rpartition(':')
        filt_res = [res for res in results
                    if res.name.lower() == tmpl_name.lower()]
        return super(GetParsedTranscludes, self)._update_results(filt_res)


class GetCurrentContent(QueryOperation):
    """
    Fetch full content for the current (top) revision.
    """
    input_field = MultiParam('titles', key_prefix=False, attr='title')
    field_prefix = 'rv'
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS + '|content'),
              SingleParam('parse', False),
              SingleParam('redirects', True, key_prefix=False)]

    examples = [OperationExample('This page does not exist'),
                OperationExample('Coffee')]

    output_type = Revision

    def extract_results(self, query_resp):
        ret = []
        #redirect_list = query_resp.get('redirects', [])  # TODO
        #redirects = dict([(r['from'], r['to']) for r in redirect_list])
        requested_title = self.input_param
        is_parsed = self.kwargs.get('rvparse', False)
        pages = query_resp.get('pages', {})
        for page_id, pid_dict in pages.iteritems():
            if int(page_id) < 0:  # negative page IDs mark missing pages
                continue
            rev_dict = dict(pid_dict)
            rev_dict.update(pid_dict['revisions'][0])
            revision = Revision.from_query(rev_dict,
                                           source=self.source,
                                           is_parsed=is_parsed)
            revision.req_title = requested_title
            ret.append(revision)
        return ret


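# For reference, a hedged sketch of the payload extract_results() consumes
# (trimmed to the keys it reads; IDs and content are hypothetical). Missing
# titles appear under negative keys like '-1' and are skipped above:
#
#   {'pages': {'-1': {'title': 'This page does not exist', 'missing': ''},
#              '604727': {'pageid': 604727,
#                         'title': 'Coffee',
#                         'revisions': [{'revid': 539916351,
#                                        '*': '{{pp-semi|small=yes}} ...'}]}}}

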
class GetPageInfo(QueryOperation):
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'info'),
              MultiParam('prop', 'subjectid|talkid|protection')]
    output_type = PageInfo
    examples = [OperationExample(['Coffee', 'Category:Africa'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret


class GetTemplates(QueryOperation):
    field_prefix = 'gtl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'templates'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetTranscludes(QueryOperation):
    input_field = SingleParam('title', val_prefix='Template:')
    field_prefix = 'gei'
    fields = [StaticParam('generator', 'embeddedin'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Template:ArticleHistory')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp.get('pages', {}).items():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetUserContribs(QueryOperation):
    field_prefix = 'uc'
    input_field = SingleParam('user')
    fields = [StaticParam('list', 'usercontribs'),
              StaticParam('ucprop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Jimbo Wales')]

    def extract_results(self, query_resp):
        ret = []
        for rev_dict in query_resp.get('usercontribs', []):
            user_contrib = RevisionInfo.from_query(rev_dict,
                                                   source=self.source)
            ret.append(user_contrib)
        return ret


class GetParsedTemplates(Operation):
    input_field = PassthroughParam('content')
    output_type = [TemplateReference]
    examples = [OperationExample(_BASIC_CITE_TEST, limit=1)]

    @property
    def remaining(self):
        if self.results:
            return 0
        return 1  # TODO: fix

    def process(self):
        if None in self.results:
            raise NoMoreResults()
        content = getattr(self.input_param, 'content', self.input_param)
        res = get_page_templates(content, raise_exc=False)
        self.results[None] = res
        return list(res)


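# A minimal sketch of using GetParsedTemplates directly on raw wikitext. The
# input string below is hypothetical; the only assumption beyond this class
# is that it can be constructed without a client, since process() just calls
# get_page_templates() on the passed-through content:
#
#   op = GetParsedTemplates("{{Infobox coffee|name=Espresso}} some prose")
#   template_refs = op.process()   # -> list of TemplateReference objects

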
class GetRecentChanges(QueryOperation):
    field_prefix = 'grc'
    input_field = None
    fields = [StaticParam('generator', 'recentchanges'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample()]

    def extract_results(self, query_resp):
        ret = []
        for pid, pid_dict in query_resp['pages'].iteritems():
            if pid.startswith('-'):  # negative IDs mark missing/invalid pages
                continue
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetBacklinks(QueryOperation):
    """
    Fetch a page's incoming links from other pages on the source wiki.
    """
    field_prefix = 'gbl'
    input_field = SingleParam('title')
    fields = [StaticParam('generator', 'backlinks'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        ret = []
        for pid, pid_dict in query_resp.get('pages', {}).iteritems():
            page_info = PageInfo.from_query(pid_dict,
                                            source=self.source)
            ret.append(page_info)
        return ret


class GetCoordinates(QueryOperation):
    field_prefix = 'co'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'coordinates'),
              SingleParam('primary', 'all'),  # primary, secondary, all
              MultiParam('prop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(['White House', 'Mount Everest'])]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_ident = PageIdentifier.from_query(pid_dict,
                                                   source=self.source)
            for coord in pid_dict['coordinates']:
                coord_ident = CoordinateIdentifier(coord, page_ident)
                ret.append(coord_ident)
        return ret


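# Hedged sketch of the 'coordinates' payload the loop above reads; the exact
# keys depend on DEFAULT_COORD_PROPS, so the lat/lon/globe names here are an
# assumption based on the GeoData extension's usual output, and the values
# are illustrative:
#
#   {'pages': {'30860': {'pageid': 30860,
#                        'title': 'White House',
#                        'coordinates': [{'lat': 38.8977,
#                                         'lon': -77.0365,
#                                         'globe': 'earth'}]}}}

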
class GetImages(QueryOperation):
    """
    Fetch the images embedded on pages.
    """
    field_prefix = 'gim'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'images'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        ret = []
        for pid, pid_dict in query_resp['pages'].iteritems():
            if pid.startswith('-'):
                pid_dict['pageid'] = None  # TODO: breaks consistency :/
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetInterwikiLinks(QueryOperation):
    """
    Fetch pages' interwiki links.
    """
    field_prefix = 'iw'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'iwlinks'),
              SingleParam('url', True)]
    output_type = [InterwikiLink]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        ret = []
        for pid_dict in query_resp.get('pages', {}).values():
            for iwd in pid_dict.get('iwlinks', []):
                cur_dict = dict(pid_dict)
                cur_dict['source'] = self.source
                cur_dict['url'] = iwd.get('url')
                cur_dict['prefix'] = iwd.get('prefix')
                link = InterwikiLink.from_query(cur_dict)
                ret.append(link)
        return ret


class GetCategory(QueryOperation):
    """
    Fetch the members of a category.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [StaticParam('generator', 'categorymembers'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection'),
              MultiParam('namespace')]
    output_type = [PageInfo]
    examples = [OperationExample('Featured_articles')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            page_ident = PageInfo.from_query(pid_dict,
                                             source=self.source)
            ret.append(page_ident)
        return ret


class GetCategoryList(QueryOperation):
    """
    Fetch the categories that contain the given pages.
    """
    field_prefix = 'gcl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'categories'),
              StaticParam('prop', 'categoryinfo'),
              SingleParam('gclshow', '')]  # hidden, !hidden
    output_type = [CategoryInfo]
    examples = [OperationExample('Physics')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            cat_info = CategoryInfo.from_query(pid_dict,
                                               source=self.source)
            if cat_info.page_id < 0:
                continue
            ret.append(cat_info)
        return ret


class GeoSearch(QueryOperation):
    field_prefix = 'gs'
    input_field = MultiParam('coord')
    fields = [StaticParam('list', 'geosearch'),
              SingleParam('radius', 10000),  # must be between 10 and 10000
              #SingleParam('maxdim', 1000),  # does not work?
              SingleParam('globe', 'earth'),  # which planet? dunno...
              SingleParam('namespace'),
              StaticParam('gsprop', DEFAULT_COORD_PROPS)]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(('37.8197', '-122.479'), 1)]

    def extract_results(self, query_resp):
        ret = []
        for pid_dict in query_resp['geosearch']:
            page_ident = PageIdentifier.from_query(pid_dict,
                                                   source=self.source)
            coord_ident = CoordinateIdentifier(pid_dict, page_ident)
            ret.append(coord_ident)
        return ret


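# A hedged usage sketch for GeoSearch, mirroring the OperationExample above.
# The coordinate pair is the input param; driving the operation with
# process() until NoMoreResults follows the pattern used elsewhere in this
# module and is an assumption, not a documented client call:
#
#   op = GeoSearch(('37.8197', '-122.479'), 10)
#   nearby = []
#   try:
#       while True:
#           nearby.extend(op.process())
#   except NoMoreResults:
#       pass

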
class GetSubcategoryInfos(QueryOperation):
    """
    Fetch `CategoryInfo` for a category's sub-categories, used to count
    the members of each sub-category.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [StaticParam('generator', 'categorymembers'),
              StaticParam('prop', 'categoryinfo'),
              StaticParam('gcmtype', 'subcat')]
    output_type = [CategoryInfo]
    examples = [OperationExample('FA-Class_articles')]

    def extract_results(self, query_resp):
        ret = []
        for k, pid_dict in query_resp['pages'].iteritems():
            pid_dict.update(pid_dict.get('categoryinfo', {}))
            cat_info = CategoryInfo.from_query(pid_dict,
                                               source=self.source)
            if cat_info.page_id < 0:
                continue
            ret.append(cat_info)
        return ret


class GetFeedbackV5(QueryOperation):
    """
    Article Feedback v5 breaks standards in a couple of ways:

      * the various v5 APIs use different prefixes (af/afvf)
      * it doesn't put its results under 'query', requiring a custom
        post_process_response()
    """
    field_prefix = 'afvf'
    input_field = SingleParam('pageid')
    fields = [StaticParam('list', 'articlefeedbackv5-view-feedback'),
              SingleParam('filter', default='featured')]
    output_type = list
    examples = [OperationExample('604727')]

    def post_process_response(self, response):
        if not response.results:
            return {}
        return dict(response.results)

    def extract_results(self, query_resp):
        count = query_resp['articlefeedbackv5-view-feedback']['count']
        return ['TODO'] * int(count)
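
# For reference, a hedged sketch of the non-standard response this operation
# handles: results arrive at the top level rather than under 'query', which
# is why post_process_response() passes them through as a dict. Only the key
# that extract_results() reads is shown; the count value is hypothetical:
#
#   {'articlefeedbackv5-view-feedback': {'count': '42', ...}}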