Пример #1
0
class GetCurrentTalkContent(GetCurrentContent):
    """
    Variant of GetCurrentContent whose input titles are given a
    'Talk:' value prefix.

    The simple prefix behavior means this operation will only work on
    namespace 0 pages. I wouldn't rely on this operation being around
    for long.
    """
    # Each input title is prefixed with the literal 'Talk:'.
    # NOTE(review): key_prefix=False presumably keeps the request key as
    # plain 'titles' (no field prefix) -- confirm against MultiParam.
    input_field = MultiParam('titles', val_prefix='Talk:', key_prefix=False)
    examples = [
        OperationExample('This page does not exist'),
        OperationExample('Coffee')
    ]
Пример #2
0
class GetPageHTML(Operation):
    """
    Fetch a page through the web client (rather than the API) and store
    the response text in ``self.results``, keyed by the requested URL.
    """
    input_field = SingleParam('title')
    examples = [OperationExample('Africa', limit=1)]
    output_type = Operation
    _limit = 1

    def __init__(self, *a, **kw):
        """
        Build the target URL for the requested title.

        Keyword arguments are passed through to the base ``Operation``.
        ``raise_exc`` (default ``True``) controls whether a failed fetch
        re-raises from ``process()``.
        """
        super(GetPageHTML, self).__init__(*a, **kw)
        self.web_client = getattr(self.client,
                                  'web_client',
                                  DEFAULT_WEB_CLIENT)
        # ``kw`` has already been handed to the base class, so popping
        # from it here had no effect beyond reading the value.
        self.raise_exc = kw.get('raise_exc', True)
        source_info = getattr(self.client, 'source_info', None)
        if source_info:
            main_title = source_info.mainpage
            main_url = source_info.base
            # Strip the main page title off the end of its URL to get the
            # prefix shared by all page URLs. Guard the empty-title case:
            # ``main_url[:-0]`` would evaluate to '' and lose the prefix.
            if main_title:
                self.base_url = main_url[:-len(main_title)]
            else:
                self.base_url = main_url
        else:
            self.base_url = DEFAULT_BASE_URL
        self.url = self.base_url + self.input_param
        self.results = {}

    def process(self):
        """
        Perform the single HTTP GET for ``self.url``.

        On failure the exception is stored on ``self.exception``; it is
        re-raised when ``raise_exc`` is true, otherwise ``self`` is
        returned. On success the response text is stored and
        ``NoMoreResults`` is raised, since there is only one request to
        make.
        """
        try:
            resp = self.web_client.get(self.url)
        except Exception as e:
            self.exception = e
            if self.raise_exc:
                raise
            return self
        self.results[self.url] = resp.text
        raise NoMoreResults()
Пример #3
0
class GetAllImageInfos(GetImageInfos):
    """
    GetImageInfos variant that uses the 'allimages' generator and
    therefore takes no input parameter.
    """
    field_prefix = 'gai'
    input_field = None  # no input; the generator supplies the images
    fields = [StaticParam('generator', 'allimages'),
              StaticParam('prop', 'imageinfo'),
              StaticParam('gaiprop', DEFAULT_IMAGE_PROPS)]
    examples = [OperationExample()]
Пример #4
0
class GetPageRevisionInfos(QueryOperation):
    """
    Fetch revision metadata for one or more pages.
    """
    field_prefix = 'rv'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'revisions'),
              MultiParam('prop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Coffee', 10)]

    def extract_results(self, query_resp):
        """
        Build one ``RevisionInfo`` per revision of every non-missing page.

        Page-level keys are merged with the revision's own keys, with the
        revision's values winning on conflict.
        """
        rev_infos = []
        for page in query_resp.get('pages', {}).values():
            if 'missing' in page:
                # the page does not exist, so it has no revisions
                continue
            for revision in page.get('revisions', []):
                merged = dict(page)
                merged.update(revision)
                rev_infos.append(
                    RevisionInfo.from_query(merged, source=self.source))
        return rev_infos
Пример #5
0
class GetExternalLinks(QueryOperation):
    """
    Fetch a page's outgoing links to URLs outside of the source wiki.
    """
    field_prefix = 'el'
    input_field = SingleParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'extlinks')]
    output_type = [ExternalLink]
    examples = [OperationExample('Croatian War of Independence')]

    def extract_results(self, query_resp):
        """
        Build one ``ExternalLink`` per entry in each page's 'extlinks'.

        Each link dict is a copy of the page dict extended with 'source'
        and with 'url' taken from the entry's '*' key.
        """
        links = []
        for page in query_resp.get('pages', {}).values():
            for ext in page.get('extlinks', []):
                link_dict = dict(page)
                link_dict['source'] = self.source
                link_dict['url'] = ext.get('*')
                links.append(ExternalLink.from_query(link_dict))
        return links

    def prepare_params(self, **kw):
        """
        Prepare request params, renaming a truthy 'elcontinue' value to
        'eloffset'.
        """
        params = super(GetExternalLinks, self).prepare_params(**kw)
        cont_value = params.get('elcontinue')
        if cont_value:
            del params['elcontinue']
            params['eloffset'] = cont_value
        return params
Пример #6
0
class GetProtections(QueryOperation):
    """
    Fetch protection information for one or more pages.
    """
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('prop', 'info'), StaticParam('inprop', 'protection')]
    output_type = ProtectionInfo
    examples = [
        OperationExample('Coffee'),
        OperationExample('Coffee|House'),
        OperationExample(['Coffee', 'House'])
    ]

    def extract_results(self, query_resp):
        """
        Return a ``ProtectionInfo`` for each page in the response.

        A response without a 'pages' key yields an empty list instead of
        raising ``KeyError``.
        """
        # .values() works on both Python 2 and 3; the page id key is unused.
        return [ProtectionInfo(page['protection'])
                for page in query_resp.get('pages', {}).values()]
Пример #7
0
class GetFlattenedCategory(Operation):
    """
    Fetch all of a category's sub-categories.
    """
    # A single sub-operation: GetSubcategoryInfos applied recursively,
    # tuned with the 'subcat_count' priority.
    # NOTE(review): presumably categories with more sub-categories are
    # expanded first -- confirm against Tune/Recursive.
    subop_chain = [
        Tune(Recursive(GetSubcategoryInfos), priority='subcat_count')
    ]
    examples = [OperationExample('Africa', 100)]
Пример #8
0
class GetCategoryRecursive(Operation):
    """
    Fetch every member of a category and of all its sub-categories.
    Wikipedia category trees can contain a large number of shallow
    categories, so by default the larger categories are prioritized.
    """
    subop_chain = (
        GetFlattenedCategory,
        Tune(GetCategory, priority='total_count'),
    )
    examples = [OperationExample('Africa', 100),
                OperationExample('Lists of slang', 10)]

    def __init__(self, input_param, *a, **kw):
        """
        Set up the chain and additionally enqueue a ``GetCategory`` for
        the root category itself on the last sub-operation queue, with a
        priority of 10**6.
        """
        super(GetCategoryRecursive, self).__init__(input_param, *a, **kw)
        seed_op = GetCategory(input_param, client=self.client)
        final_queue = self.subop_queues[-1]
        final_queue.op_queue.add(seed_op, 10**6)
Пример #9
0
class GetRevisionContent(GetCurrentContent):
    """
    GetCurrentContent variant that takes a single revision id (sent as
    'revids') instead of page titles.
    """
    # The input value is read from the 'rev_id' attribute when present
    # (NOTE(review): inferred from attr='rev_id' -- confirm in SingleParam).
    input_field = SingleParam('revids', key_prefix=False, attr='rev_id')
    fields = [
        StaticParam('prop', 'revisions'),
        MultiParam('prop', DEFAULT_PROPS + '|content'),
        SingleParam('parse', False)
    ]
    examples = [OperationExample('539916351')]
Пример #10
0
class GetCategoryArticlesRecursive(Operation):
    """
    Fetch all pages (namespace 0 and 1) in a category and in its
    sub-categories.
    """
    subop_chain = (
        GetFlattenedCategory,
        Tune(GetCategoryArticles, priority='page_count'),
    )
    examples = [OperationExample('Africa', 100),
                OperationExample('Lists of slang', 10)]

    def __init__(self, input_param, *a, **kw):
        """
        Set up the chain and additionally enqueue a
        ``GetCategoryArticles`` for the root category itself on the last
        sub-operation queue, with a priority of 10**6.
        """
        super(GetCategoryArticlesRecursive, self).__init__(input_param,
                                                           *a, **kw)
        seed_op = GetCategoryArticles(input_param, client=self.client)
        final_queue = self.subop_queues[-1]
        final_queue.op_queue.add(seed_op, 10**6)
Пример #11
0
class GetAllCategoryInfos(GetSubcategoryInfos):
    """
    Fetch all categories on the source wiki.

    Uses the 'allcategories' generator, so no input parameter is taken.
    """
    field_prefix = 'gac'
    input_field = None  # no input; the generator enumerates categories
    fields = [
        StaticParam('generator', 'allcategories'),
        StaticParam('prop', 'categoryinfo')
    ]
    examples = [OperationExample(doc='basic allcats')]
Пример #12
0
class GetLinks(QueryOperation):
    """
    Fetch a page's outgoing links to other pages on the source wiki.
    """
    field_prefix = 'gpl'
    input_field = SingleParam('titles', key_prefix=False)
    fields = [
        StaticParam('generator', 'links'),
        StaticParam('prop', 'info'),
        StaticParam('inprop', 'subjectid|talkid|protection'),
        MultiParam('namespace')
    ]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee'), OperationExample('Aabach')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each linked page in the response.

        A response without a 'pages' key (e.g. a page with no links)
        yields an empty list instead of raising ``KeyError``.
        """
        # .values() works on both Python 2 and 3; the page id key is unused.
        return [PageInfo.from_query(pid_dict, source=self.source)
                for pid_dict in query_resp.get('pages', {}).values()]
Пример #13
0
class GetRevisionInfos(GetPageRevisionInfos):
    """
    Fetch information about specific revisions, identified by revision id.
    """
    input_field = MultiParam('revids', attr='rev_id', key_prefix=False)
    output_type = RevisionInfo
    examples = [OperationExample(['538903663', '539916351', '531458383'])]

    def prepare_params(self, *a, **kw):
        """
        Return the parent's prepared params with the '<prefix>limit'
        entry removed.
        """
        # NOTE(review): the received *a / **kw are not forwarded to the
        # parent implementation -- confirm this is intentional.
        params = super(GetRevisionInfos, self).prepare_params()
        limit_key = self.field_prefix + 'limit'
        params.pop(limit_key, None)
        return params
Пример #14
0
class GetParsedTranscludes(Operation):
    """
    Chain GetTranscludes -> GetCurrentContent -> GetParsedTemplates and
    keep only the template references matching the requested template.

    Template names may redirect, but this operation doesn't handle that
    yet.
    """
    subop_chain = [GetTranscludes, GetCurrentContent, GetParsedTemplates]
    examples = [OperationExample('ArticleHistory', 10)]

    def _update_results(self, results):
        """
        Filter ``results`` down to entries whose ``name`` equals the
        input template name (text after the last ':'), compared
        case-insensitively, then defer to the parent implementation.
        """
        wanted = self.input_param.rpartition(':')[2].lower()
        matching = []
        for candidate in results:
            if candidate.name.lower() == wanted:
                matching.append(candidate)
        return super(GetParsedTranscludes, self)._update_results(matching)
Пример #15
0
class GetCurrentContent(QueryOperation):
    """
    Fetch full content for current (top) revision.
    """
    input_field = MultiParam('titles', key_prefix=False, attr='title')
    field_prefix = 'rv'
    fields = [
        StaticParam('prop', 'revisions'),
        MultiParam('prop', DEFAULT_PROPS + '|content'),
        SingleParam('parse', False),
        SingleParam('redirects', True, key_prefix=False)
    ]
    examples = [
        OperationExample('This page does not exist'),
        OperationExample('Coffee')
    ]
    output_type = Revision

    def extract_results(self, query_resp):
        """
        Return a ``Revision`` for the first revision of each page in the
        response.

        Entries with a negative page id are skipped, as are entries that
        carry no revisions. Each returned revision gets ``req_title`` set
        to this operation's input parameter.
        """
        # TODO: use query_resp['redirects'] to map requested titles to
        # their redirect targets.
        ret = []
        requested_title = self.input_param
        is_parsed = self.kwargs.get('rvparse', False)

        # .items() works on both Python 2 and 3.
        for page_id, pid_dict in query_resp.get('pages', {}).items():
            if int(page_id) < 0:
                continue
            revisions = pid_dict.get('revisions')
            if not revisions:
                # nothing to build a Revision from; avoid KeyError/IndexError
                continue
            rev_dict = dict(pid_dict)
            rev_dict.update(revisions[0])
            revision = Revision.from_query(rev_dict,
                                           source=self.source,
                                           is_parsed=is_parsed)
            revision.req_title = requested_title
            ret.append(revision)
        return ret
Пример #16
0
class GetPageInfo(QueryOperation):
    """
    Fetch page info (including subject/talk ids and protection) for one
    or more pages.
    """
    field_prefix = 'in'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [
        StaticParam('prop', 'info'),
        MultiParam('prop', 'subjectid|talkid|protection')
    ]
    output_type = PageInfo
    examples = [OperationExample(['Coffee', 'Category:Africa'])]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each page in the response.

        A response without a 'pages' key yields an empty list instead of
        raising ``KeyError``.
        """
        # .values() works on both Python 2 and 3; the page id key is unused.
        return [PageInfo.from_query(pid_dict, source=self.source)
                for pid_dict in query_resp.get('pages', {}).values()]
Пример #17
0
class GetTemplates(QueryOperation):
    """
    Fetch the templates used on one or more pages.
    """
    field_prefix = 'gtl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [
        StaticParam('generator', 'templates'),
        StaticParam('prop', 'info'),
        StaticParam('inprop', 'subjectid|talkid|protection')
    ]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each template page in the response.

        A response without a 'pages' key (e.g. a page using no
        templates) yields an empty list instead of raising ``KeyError``.
        """
        # .values() works on both Python 2 and 3; the page id key is unused.
        return [PageInfo.from_query(pid_dict, source=self.source)
                for pid_dict in query_resp.get('pages', {}).values()]
Пример #18
0
class GetTranscludes(QueryOperation):
    """
    Fetch the pages that embed (transclude) a given template.
    """
    input_field = SingleParam('title', val_prefix='Template:')
    field_prefix = 'gei'
    fields = [StaticParam('generator', 'embeddedin'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Template:ArticleHistory')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each page in the response; an absent
        'pages' key gives an empty list.
        """
        embedding_pages = query_resp.get('pages', {})
        return [PageInfo.from_query(page, source=self.source)
                for page in embedding_pages.values()]
Пример #19
0
class GetUserContribs(QueryOperation):
    """
    Fetch the contributions (revisions) made by a user.
    """
    field_prefix = 'uc'
    input_field = SingleParam('user')
    fields = [StaticParam('list', 'usercontribs'),
              StaticParam('ucprop', DEFAULT_PROPS)]
    output_type = [RevisionInfo]
    examples = [OperationExample('Jimbo Wales')]

    def extract_results(self, query_resp):
        """
        Return a ``RevisionInfo`` for each entry under 'usercontribs';
        an absent key gives an empty list.
        """
        contribs = query_resp.get('usercontribs', [])
        return [RevisionInfo.from_query(contrib, source=self.source)
                for contrib in contribs]
Пример #20
0
class GetParsedTemplates(Operation):
    """
    Parse template references out of page content locally (no request
    is made by this operation itself).
    """
    input_field = PassthroughParam('content')
    output_type = [TemplateReference]
    examples = [OperationExample(_BASIC_CITE_TEST, limit=1)]

    @property
    def remaining(self):
        """Return 0 once any results are stored, otherwise 1."""
        return 0 if self.results else 1  # TODO: fix

    def process(self):
        """
        Parse the input once and return the templates found as a list.

        The input may be an object with a ``content`` attribute or the
        content itself. The parsed result is stored under the ``None``
        key of ``self.results``; a later call raises ``NoMoreResults``.
        """
        if None in self.results:
            raise NoMoreResults()
        source = self.input_param
        content = getattr(source, 'content', source)
        templates = get_page_templates(content, raise_exc=False)
        self.results[None] = templates
        return list(templates)
Пример #21
0
class GetRecentChanges(QueryOperation):
    """
    Fetch the pages that were recently changed on the source wiki.
    """
    field_prefix = 'grc'
    input_field = None
    fields = [
        StaticParam('generator', 'recentchanges'),
        StaticParam('prop', 'info'),
        StaticParam('inprop', 'subjectid|talkid|protection')
    ]
    output_type = [PageInfo]
    examples = [OperationExample()]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each page in the response, skipping
        entries whose page id key starts with '-'.

        A response without a 'pages' key yields an empty list instead of
        raising ``KeyError``.
        """
        ret = []
        # .items() works on both Python 2 and 3.
        for pid, pid_dict in query_resp.get('pages', {}).items():
            if pid.startswith('-'):
                continue
            ret.append(PageInfo.from_query(pid_dict, source=self.source))
        return ret
Пример #22
0
class GetBacklinks(QueryOperation):
    """
    Fetch a page's incoming links from other pages on the source wiki.
    """
    field_prefix = 'gbl'
    input_field = SingleParam('title')
    fields = [
        StaticParam('generator', 'backlinks'),
        StaticParam('prop', 'info'),
        StaticParam('inprop', 'subjectid|talkid|protection')
    ]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each linking page in the response; an
        absent 'pages' key gives an empty list.
        """
        # .values() replaces the Python 2-only iteritems(); the page id
        # key was never used.
        return [PageInfo.from_query(pid_dict, source=self.source)
                for pid_dict in query_resp.get('pages', {}).values()]
Пример #23
0
class GetCoordinates(QueryOperation):
    """
    Fetch the geographic coordinates recorded for one or more pages.
    """
    field_prefix = 'co'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [
        StaticParam('prop', 'coordinates'),
        SingleParam('primary', 'all'),  # primary, secondary, all
        MultiParam('prop', DEFAULT_COORD_PROPS)
    ]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(['White House', 'Mount Everest'])]

    def extract_results(self, query_resp):
        """
        Return one ``CoordinateIdentifier`` per coordinate of every page.

        Pages without a 'coordinates' entry contribute nothing, and a
        response without a 'pages' key yields an empty list.
        """
        ret = []
        # .values() works on both Python 2 and 3; the page id key is unused.
        for pid_dict in query_resp.get('pages', {}).values():
            page_ident = PageIdentifier.from_query(pid_dict,
                                                   source=self.source)
            for coord in pid_dict.get('coordinates', []):
                # Append inside the loop so every coordinate is kept, not
                # only the last one, and nothing stale is reused for a
                # page that has none.
                ret.append(CoordinateIdentifier(coord, page_ident))
        return ret
Пример #24
0
class GetImages(QueryOperation):
    """
    Fetch the images embedded on pages.
    """
    field_prefix = 'gim'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [StaticParam('generator', 'images'),
              StaticParam('prop', 'info'),
              StaticParam('inprop', 'subjectid|talkid|protection')]
    output_type = [PageInfo]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each image page in the response.

        Entries whose page id key starts with '-' are kept, but their
        'pageid' is set to ``None`` (the response dict is modified in
        place). A response without a 'pages' key yields an empty list
        instead of raising ``KeyError``.
        """
        ret = []
        # .items() works on both Python 2 and 3.
        for pid, pid_dict in query_resp.get('pages', {}).items():
            if pid.startswith('-'):
                pid_dict['pageid'] = None  # TODO: breaks consistency :/
            ret.append(PageInfo.from_query(pid_dict, source=self.source))
        return ret
Пример #25
0
class GetInterwikiLinks(QueryOperation):
    """
    Fetch the interwiki links found on pages.
    """
    field_prefix = 'iw'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [
        StaticParam('prop', 'iwlinks'),
        SingleParam('url', True),
    ]
    output_type = [InterwikiLink]
    examples = [OperationExample('Coffee')]

    def extract_results(self, query_resp):
        """
        Build one ``InterwikiLink`` per entry in each page's 'iwlinks'.

        Each link dict is a copy of the page dict extended with
        'source', plus 'url' and 'prefix' taken from the entry.
        """
        links = []
        all_pages = query_resp.get('pages', {})
        for page in all_pages.values():
            for entry in page.get('iwlinks', []):
                link_dict = dict(page)
                link_dict['source'] = self.source
                link_dict['url'] = entry.get('url')
                link_dict['prefix'] = entry.get('prefix')
                links.append(InterwikiLink.from_query(link_dict))
        return links
Пример #26
0
class GetCategory(QueryOperation):
    """
    Fetch the members of a category.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [
        StaticParam('generator', 'categorymembers'),
        StaticParam('prop', 'info'),
        StaticParam('inprop', 'subjectid|talkid|protection'),
        MultiParam('namespace')
    ]
    output_type = [PageInfo]
    examples = [OperationExample('Featured_articles')]

    def extract_results(self, query_resp):
        """
        Return a ``PageInfo`` for each member page in the response.

        A response without a 'pages' key (e.g. an empty category)
        yields an empty list instead of raising ``KeyError``.
        """
        # .values() works on both Python 2 and 3; the page id key is unused.
        return [PageInfo.from_query(pid_dict, source=self.source)
                for pid_dict in query_resp.get('pages', {}).values()]
Пример #27
0
class GetCategoryList(QueryOperation):
    """
    Fetch the categories containing pages.
    """
    field_prefix = 'gcl'
    input_field = MultiParam('titles', key_prefix=False)
    fields = [
        StaticParam('generator', 'categories'),
        StaticParam('prop', 'categoryinfo'),
        SingleParam('gclshow', '')  # hidden, !hidden
    ]
    output_type = [CategoryInfo]
    examples = [OperationExample('Physics')]

    def extract_results(self, query_resp):
        """
        Return a ``CategoryInfo`` for each category in the response,
        skipping those whose ``page_id`` is negative.

        A response without a 'pages' key (e.g. an uncategorized page)
        yields an empty list instead of raising ``KeyError``.
        """
        ret = []
        # .values() works on both Python 2 and 3; the page id key is unused.
        for pid_dict in query_resp.get('pages', {}).values():
            cat_info = CategoryInfo.from_query(pid_dict, source=self.source)
            if cat_info.page_id < 0:
                continue
            ret.append(cat_info)
        return ret
Пример #28
0
class GeoSearch(QueryOperation):
    """
    Search for pages located near a given coordinate.
    """
    field_prefix = 'gs'
    input_field = MultiParam('coord')
    fields = [
        StaticParam('list', 'geosearch'),
        SingleParam('radius', 10000),  # must be within 10 and 10000
        #SingleParam('maxdim', 1000),  # does not work?
        SingleParam('globe', 'earth'),  # which planet? donno...
        SingleParam('namespace'),
        StaticParam('gsprop', DEFAULT_COORD_PROPS),
    ]
    output_type = [CoordinateIdentifier]
    examples = [OperationExample(('37.8197', '-122.479'), 1)]

    def extract_results(self, query_resp):
        """
        Return a ``CoordinateIdentifier`` for each 'geosearch' entry.

        Each entry dict serves both as the page description and as the
        coordinate data.
        """
        found = []
        for hit in query_resp['geosearch']:
            page = PageIdentifier.from_query(hit, source=self.source)
            found.append(CoordinateIdentifier(hit, page))
        return found
Пример #29
0
class GetSubcategoryInfos(QueryOperation):
    """
    Fetch `CategoryInfo` for category, used to count the members of
    sub-categories.
    """
    field_prefix = 'gcm'
    input_field = SingleParam('title', val_prefix='Category:')
    fields = [
        StaticParam('generator', 'categorymembers'),
        StaticParam('prop', 'categoryinfo'),
        StaticParam('gcmtype', 'subcat')
    ]
    output_type = [CategoryInfo]
    examples = [OperationExample('FA-Class_articles')]

    def extract_results(self, query_resp):
        """
        Return a ``CategoryInfo`` for each sub-category in the response,
        skipping those whose ``page_id`` is negative.

        Each page dict is flattened in place by merging its nested
        'categoryinfo' dict into it. A response without a 'pages' key
        (e.g. a category with no sub-categories) yields an empty list
        instead of raising ``KeyError``.
        """
        ret = []
        # .values() works on both Python 2 and 3; the page id key is unused.
        for pid_dict in query_resp.get('pages', {}).values():
            pid_dict.update(pid_dict.get('categoryinfo', {}))
            cat_info = CategoryInfo.from_query(pid_dict, source=self.source)
            if cat_info.page_id < 0:
                continue
            ret.append(cat_info)
        return ret
Пример #30
0
class GetFeedbackV5(QueryOperation):
    """
    Fetch Article Feedback v5 entries for a page.

    Article feedback v5 departs from the usual conventions in two ways:
      * its various APIs use different prefixes (af/afvf)
      * its results are not placed under 'query', which is why
        post_process_response() is overridden here
    """
    field_prefix = 'afvf'
    input_field = SingleParam('pageid')
    fields = [StaticParam('list', 'articlefeedbackv5-view-feedback'),
              SingleParam('filter', default='featured')]
    output_type = list
    examples = [OperationExample('604727')]

    def post_process_response(self, response):
        """
        Return a dict copy of ``response.results``, or an empty dict
        when there are no results.
        """
        raw_results = response.results
        return dict(raw_results) if raw_results else {}

    def extract_results(self, query_resp):
        """
        Return a placeholder list holding one 'TODO' per feedback entry
        counted in the response.
        """
        feedback = query_resp['articlefeedbackv5-view-feedback']
        return ['TODO'] * int(feedback['count'])