Example #1
    def get_random_page_ids_for_lang(self, lang_code):

        # Call the API to get a random sample of page ids
        site = self.sites[lang_code]

        # API allows random page_ids to be retrieved max 500 at a time
        page_ids = []
        params = {
            'action': 'query',
            'generator': 'random',
            'grnnamespace': '0',
            'grnlimit': 500,
            'prop': 'info'
        }
        for _ in range(self.pages_per_lang // 500):
            request = api.APIRequest(site, params)
            result = request.query()
            page_ids.extend(result['query']['pages'].keys())

        if self.pages_per_lang % 500 > 0:
            params['grnlimit'] = self.pages_per_lang % 500
            request = api.APIRequest(site, params)
            result = request.query()
            page_ids.extend(result['query']['pages'].keys())

        return page_ids
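
A standalone variant of the same random-page query, as a minimal sketch: the endpoint URL and the target of 1000 ids are assumptions, and queryGen() is used so wikitools follows the API continuation instead of batching by hand.

from wikitools import wiki, api

site = wiki.Wiki('https://en.wikipedia.org/w/api.php')  # example endpoint
params = {
    'action': 'query',
    'generator': 'random',
    'grnnamespace': '0',
    'grnlimit': 500,
    'prop': 'info'
}
page_ids = []
# queryGen() yields one result per continuation step until we stop it.
for result in api.APIRequest(site, params).queryGen():
    page_ids.extend(result['query']['pages'].keys())
    if len(page_ids) >= 1000:  # assumed sample size
        break
print(len(page_ids))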
Example #2
    def find_outer_section(title, text, id):
        # Check whether the closing comment is in here; if not, look for the outer section.
        # If there is an outer section, choose it only if it has a closing statement.
        if len(title) > 1:
            section_title = title[1].encode('ascii', 'ignore')
            params = {
                'action': 'query',
                'titles': title[0],
                'prop': 'revisions',
                'rvprop': 'content',
                'format': 'json',
                'redirects': 'yes'
            }
            result = api.APIRequest(site, params).query()
            whole_text = _clean_wiki_text(
                result['query']['pages'][id]['revisions'][0]['*'])

            import wikichatter as wc
            parsed_whole_text = wc.parse(whole_text.encode('ascii', 'ignore'))
            sections = parsed_whole_text['sections']

            for outer_section in sections:
                found_subsection = get_section(outer_section['subsections'],
                                               section_title)
                if found_subsection:
                    outer_comments = outer_section['comments']
                    for comment in outer_comments:
                        comment_text = '\n'.join(comment['text_blocks'])
                        if re.search(_CLOSE_COMMENT_RE, comment_text):
                            params = {
                                'action': 'parse',
                                'prop': 'sections',
                                'page': title[0],
                                'redirects': 'yes'
                            }
                            result = api.APIRequest(site, params).query()
                            for s in result['parse']['sections']:
                                if s['line'] == outer_section.get(
                                        'heading').strip():
                                    section_index = s['index']
                                    params = {
                                        'action': 'query',
                                        'titles': title[0],
                                        'prop': 'revisions',
                                        'rvprop': 'content',
                                        'rvsection': section_index,
                                        'format': 'json',
                                        'redirects': 'yes'
                                    }
                                    result = api.APIRequest(site,
                                                            params).query()
                                    final_section_text = result['query'][
                                        'pages'][id]['revisions'][0]['*']
                                    return final_section_text
        return text
Example #3
def fileHook(parser_env, namespace, body):
    (file_name, pipe, size) = body.partition('|')

    site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
    params = {
        'action': 'query',
        'titles': 'File:' + file_name,
        'prop': 'imageinfo',
        'iiprop': 'url|thumbmime',
        'iiurlwidth': size
    }
    request = api.APIRequest(site, params)
    result = request.query()
    try:
        # 'pages' is keyed by page id; this query returns a single page.
        imageinfo = list(result['query']['pages'].values())[0]['imageinfo'][0]
        url = imageinfo['thumburl']
        desc_url = imageinfo['descriptionurl']
        width = imageinfo['thumbwidth']
        height = imageinfo['thumbheight']
    except (KeyError, IndexError):
        return file_name
    text = '<a href="%s" class="image">' % desc_url
    text += '<img alt="%s" src="%s" width="%s" height="%s"></a>' % (
        file_name, url, width, height)
    return text
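
A hypothetical call, assuming the hook body is the inner text of a [[File:...]] tag with an optional pixel width after the pipe (parser_env and namespace are unused by the hook above, and File:Example.jpg is only a placeholder title):

# Hypothetical usage: render a 220px-wide thumbnail link for an image page.
html = fileHook(None, 'File', 'Example.jpg|220')
print(html)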
Example #4
def parse_text(content):
    links = {}
    rough_links = re.findall(linkP, content)

    #~ if len(links) > 50:

    title_lists = list2params(rough_links)
    for title_list in title_lists:

        params = {
            'action': 'query',
            'titles': title_list,
            'redirects': 1
        }  #, 'pllimit': 500, 'redirects':1}
        request = api.APIRequest(site, params)
        if very_verbose: print('   ' + 'query: ' + str(params))
        result = request.query()

        for page in result['query']['pages']:
            if page != '-1' and 'ns' in result['query']['pages'][page]:
                if result['query']['pages'][page]['ns'] == 0:
                    link = result['query']['pages'][page][
                        'title']  #.replace(' ', '_')
                    links[link] = 1

    return links

Example #5
    def extlinks_extraction(self, lang, title):
        links = []
        linklist = []

        site = wiki.Wiki("https://" + lang + ".wikipedia.org/w/api.php")

        #urllib2.quote(title.encode("utf8"))
        #title = title.encode("utf-8")
        params = {
            'action': 'query',
            'titles': title,
            'prop': 'extlinks',
            'ellimit': 500
        }
        req = api.APIRequest(site, params)

        for res in req.queryGen():
            #pprint.pprint(res)
            for pidkey in res['query']['pages']:
                #print res['query']['pages']
                if 'extlinks' in res['query']['pages'][pidkey]:
                    linklist = res['query']['pages'][pidkey][
                        'extlinks'] + linklist
            links = links + linklist
            linklist = []

    #    print links
        return links
Example #6
def queries():
    if args.queryfile:
        for l in open(args.queryfile):
            yield l.strip()
    elif args.query:
        yield args.query
    elif args.category and not HAS_WIKITOOLS:
        sys.exit(
            "-cat option given, but wikitools package is not present, see < https://github.com/alexz-enwp/wikitools >"
        )
    elif args.category and HAS_WIKITOOLS:
        site = wiki.Wiki("https://commons.wikimedia.org/w/api.php")
        query = []
        params = {
            'action': 'query',
            'prop': 'imageinfo',
            'iiprop': 'url',
            'generator': 'categorymembers',
            'gcmtitle': 'Category:' + args.category,
            'gcmnamespace': '6',
            'gcmprop': 'title'
        }

        req = api.APIRequest(site, params)

        for data in req.queryGen():
            keys = data['query']['pages'].keys()

            for key in keys:
                url = data['query']['pages'][key]['imageinfo'][0]['url']
                yield re.sub("https://upload.wikimedia.org", "", url)
    else:
        sys.exit("No query given")
Example #7
    def query_page_ids_for_lang(self, query, lang_code):

        # Call the API to query a sample of page ids
        site = self.sites[lang_code]

        # API allows page_ids to be retrieved max 500 at a time
        page_ids = []
        params = {
            'action': 'query',
            'generator': 'search',
            'gsrwhat': 'text',
            'gsrsearch': query,
            'gsrlimit': 10,  # search generator parameters take the gsr prefix
            'prop': 'info'
        }

        request = api.APIRequest(site, params)
        result = request.query()
        if 'query' in result and 'pages' in result['query']:
            page_ids = list(result['query']['pages'].keys())
        print('page_ids: ' + str(page_ids))

        return page_ids
Example #8
    def fetch_log(self, ignore_callback=False):
        query_params = {
            'action': 'query',
            'list': 'abuselog',
            'aflprop': 'ids|user|title|action|result|filter'
        }
        request = api.APIRequest(self.wiki, query_params)

        try:
            result = request.query(querycontinue=False)
        except api.APIError as e:
            self.log.debug('Caught APIError in fetch_log: %s, isLoggedIn(%s)=%s', e, self.username, self.wiki.isLoggedIn(self.username))
            return

        items = result['query']['abuselog']
        items.reverse()
        if self.last_log_id:
            items = [item for item in items if item['id'] > self.last_log_id]

        if items:
            self.last_log_id = items[-1]['id']

        # Exclude ignored filters
        items = [item for item in items if item['filter_id'] not in self.ignored_filters]

        if not ignore_callback:
            for item in items:
                self.callback(self.wiki_name, item)

Example #9
def wikipedia_query(query_params):
    """An extremely basic wrapper for the wikitools api."""
    site = wiki.Wiki()  # This defaults to en.wikipedia.org
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result[query_params['action']]
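
A hypothetical call through this wrapper; because the action is 'query', the function returns result['query'] directly (the title below is only an example):

info = wikipedia_query({'action': 'query',
                        'titles': 'Python (programming language)',
                        'prop': 'info'})
# 'pages' is keyed by page id; print a couple of the info fields.
for page_id, page in info['pages'].items():
    print(page_id, page.get('touched'), page.get('length'))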
Example #10
    def test_parseJSON_bad_data_nonwrite(self):
        self.site.apibase = "http://localhost/w/index.php"
        self.site.maxwaittime = 10
        params = {}
        with self.assertWarns(UserWarning):
            with self.assertRaises(wikitools.exceptions.APIFailure):
                req = api.APIRequest(self.site, params)
                req.query(False)
Example #11
    def test_getRaw_HTTP_error_nonwrite(self):
        self.site.apibase = "http://httpbin.org/status/500"
        self.site.maxwaittime = 10
        params = {}
        with self.assertWarns(UserWarning):
            with self.assertRaises(requests.exceptions.HTTPError):
                req = api.APIRequest(self.site, params)
                req.query(False)
Example #12
def get_article(url, source_id, rfc_DB):
    cmd = 'select id, disqus_id, section_index, title from website_article where url = %s'
    article_result = rfc_DB.fetch_one(cmd, (urllib2.unquote(url), ))

    if article_result is not None:
        article_id, disqus_id, section_index, title = article_result
        return article_id, disqus_id, section_index, title
    else:
        if 'wikipedia.org/wiki/' in url:
            url_parts = url.split('/wiki/')
            wiki_sub = url_parts[1].split(':')
            wiki_parts = ':'.join(wiki_sub[1:]).split('#')
            wiki_page = wiki_parts[0]
            section = None
            if len(wiki_parts) > 1:
                section = wiki_parts[1]

            from wikitools import wiki, api
            site = wiki.Wiki(_DOMAIN + '/w/api.php')
            page = urllib2.unquote(
                str(wiki_sub[0]) + ':' + wiki_page.encode('ascii', 'ignore'))
            params = {
                'action': 'parse',
                'prop': 'sections',
                'page': page,
                'redirects': 'yes'
            }
            try:
                request = api.APIRequest(site, params)

                result = request.query()

                disqus_id = str(result['parse']['pageid'])
                section_title = None
                section_index = None

                if section:
                    for s in result['parse']['sections']:
                        if s['anchor'] == section:
                            disqus_id = str(disqus_id) + '#' + str(s['index'])
                            section_title = s['line']
                            section_index = s['index']
                title = result['parse']['title']
                if section_title is not None:
                    title = title + ' - ' + section_title

                link = urllib2.unquote(url)
                article_insert_command = " insert into website_article (disqus_id, title, url, source_id, section_index)\
                                            values (%s, %s, %s, %s, %s)"

                article_id = rfc_DB.insert(
                    article_insert_command,
                    (disqus_id, title, link, source_id, section_index))
                return article_id, disqus_id, section_index, title

            except api.APIError as e:
                print(e)
Example #13
def import_wiki_authors(authors, rfc_DB):
    found_authors = set()
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.add(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)

    from wikitools import wiki, api
    site = wiki.Wiki(_DOMAIN + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json'
    }

    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        comment_author_id = None
        try:
            author_id = user['userid']
            # first check if the author exists using the username
            command = "select id from website_commentauthor where username = %s"
            (comment_author_id, ) = rfc_DB.fetch_one(command, (user['name'], ))
            # if no author exists with the same username
            if comment_author_id is None:
                author_insert_command = " insert into website_commentauthor (username, disqus_id, joined_at, edit_count, gender, groups, is_wikipedia)\
                        values (%s, %s, %s, %s, %s, %s, %s)"

                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                params = (user['name'], author_id, joined_at,
                          user['editcount'], user['gender'],
                          ','.join(user['groups']), 1)
                comment_author_id = rfc_DB.insert(author_insert_command,
                                                  params)

        except Exception:
            command = " insert into website_commentauthor (username, is_wikipedia)\
                        values (%s, %s)"

            comment_author_id = rfc_DB.insert(command, (user['name'], 1))

        if comment_author_id is not None:
            comment_authors.append(comment_author_id)

    if anonymous_exist:
        anonymous_id = rfc_DB.get_anonymous_id()
        comment_authors.append(anonymous_id)

    return comment_authors
Example #14
def wiki_get_edittoken(w_site):

    params = {'action': 'tokens'}
    req = api.APIRequest(w_site, params)
    res = req.query(querycontinue=False)
    w_site_edittoken = res['tokens']['edittoken']

    print('token: ' + w_site_edittoken)

    return w_site_edittoken
Example #15
def getlist(wikia, wkfrom=1, wkto=1000):
    params = {
        'action': 'query',
        'list': 'wkdomains',
        'wkactive': '1',
        'wkfrom': wkfrom,
        'wkto': wkto,
    }
    request = api.APIRequest(wikia, params)
    return request.query()['query']['wkdomains']
Example #16
    def test_listFromQuery(self):
        params = {"action": "query", "list": "allpages"}
        req = api.APIRequest(self.site, params)
        res = req.query(False)
        api.logging = True
        pages = pagelist.listFromQuery(self.site, res["query"]["allpages"])
        for item in pages:
            self.assertIsInstance(item, page.Page)
            self.assertTrue(item.exists)
        self.assertEqual(len(api.querylog), 0)
Example #17
    def getBacklinks(self):
        '''Find all pages that link to this page.'''
        params = {
            'action': 'query',
            'list': 'backlinks',
            'bltitle': self.title,
        }
        apiresponse = api.APIRequest(self.site, params).query()
        return [x['title'] for x in apiresponse['query']['backlinks']]
Example #18
    def test_getRaw_HTTP_error_write(self):
        self.site.apibase = "http://httpbin.org/status/500"
        params = {}
        warnings.filterwarnings("error",
                                category=UserWarning,
                                module="wikitools.api")
        with self.assertRaises(requests.exceptions.HTTPError):
            req = api.APIRequest(self.site, params, write=True)
            req.query(False)
        warnings.filterwarnings("default",
                                category=UserWarning,
                                module="wikitools.api")
Example #19
    def test_parseJSON_bad_data_write(self):
        self.site.apibase = "http://localhost/w/index.php"
        params = {}
        warnings.filterwarnings("error",
                                category=UserWarning,
                                module="wikitools.api")
        with self.assertRaises(wikitools.exceptions.APIFailure):
            req = api.APIRequest(self.site, params, write=True)
            req.query(False)
        warnings.filterwarnings("default",
                                category=UserWarning,
                                module="wikitools.api")
Example #20
def import_wiki_authors(authors, article):
    found_authors = []
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.append(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)

    from wikitools import wiki, api
    domain = article.url.split('/wiki/')[0]
    site = wiki.Wiki(domain + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json'
    }

    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        try:
            author_id = user['userid']
            comment_author = CommentAuthor.objects.filter(disqus_id=author_id)
            if comment_author.count() > 0:
                comment_author = comment_author[0]
            else:
                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                comment_author = CommentAuthor.objects.create(
                    username=user['name'],
                    disqus_id=author_id,
                    joined_at=joined_at,
                    edit_count=user['editcount'],
                    gender=user['gender'],
                    groups=','.join(user['groups']),
                    is_wikipedia=True)
        except Exception:
            comment_author = CommentAuthor.objects.create(
                username=user['name'], is_wikipedia=True)
        comment_authors.append(comment_author)

    if anonymous_exist:
        comment_authors.append(
            CommentAuthor.objects.get(disqus_id='anonymous',
                                      is_wikipedia=True))

    return comment_authors
Example #21
def crawl(url_param):

    # Fix eventual full URL
    url_param = unquote_plus(basename(url_param))

    # Generate query
    params = {
            'action'        : 'query',
            'prop'          : 'imageinfo|revisions',
            'iiprop'        : 'url|sha1|size',
            'rvprop'        : 'content',
            'rawcontinue'   : '' }

    url_type = get_url_type(url_param)

    if url_type == 'category':
        params['generator'] = 'categorymembers'
        params['gcmtitle']  = url_param
        params['gcmlimit']  = 'max'
    elif url_type == 'file':
        params['titles']    = url_param
    else:
        params['generator'] = 'images'
        params['titles']    = url_param
        params['gimlimit']  = 'max'


    # Call API
    site = wiki.Wiki(API_URL)
    request = api.APIRequest(site, params)

    print_verbose("Site: %s" % str(site), 2)
    print_verbose("Query: ", 2)
    pprint_verbose(params, 2)

    result = request.query(querycontinue=True)
    print_verbose("Result: ", 4)
    pprint_verbose(result, 4)

    # Check result
    if 'error' in result:
        raise Error(result['error'])

    if 'warnings' in result:
        # 'warnings' is a dict in the API response; stringify it for stderr
        sys.stderr.write(str(result['warnings']) + '\n')
        return None

    if '-1' in result['query']['pages']:
        # page id '-1' marks a missing title; its value is a dict, not a string
        sys.stderr.write(str(result['query']['pages']['-1']) + '\n')
        return None

    return result['query']['pages']
Example #22
    def test_parseJSON_maxlag(self):
        site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
        params = {"action": "query"}
        req = api.APIRequest(site, params)
        req.changeParam("maxlag", "-1")
        warnings.filterwarnings("error",
                                category=UserWarning,
                                module="wikitools.api")
        with self.assertRaises(UserWarning):
            req.query(False)
        warnings.filterwarnings("default",
                                category=UserWarning,
                                module="wikitools.api")
Example #23
    def toolbar_icon_clicked(self, widget, movie):
        import pprint  # Used for formatting the output for viewing, not necessary for most code
        from wikitools import wiki, api
        site = wiki.Wiki("http://de.wikipedia.org/w/api.php")
        params = {
            'action': 'query',
            'list': 'search',
            'srsearch': 'rocky',
            'srprop': '',
            'srlimit': '50'
        }
        req = api.APIRequest(site, params)
        res = req.query(querycontinue=False)
        pprint.pprint(res)
Example #24
def get_usercontribs(lang, site, params, stop_date):
    user_contrib = []
    req = api.APIRequest(site, params)
    for resp in req.queryGen():
        pages = resp['query']['usercontribs']
        before_end = [
            p for p in pages if
            datetime.strptime(p['timestamp'], "%Y-%m-%dT%H:%M:%SZ") > stop_date
        ]
        user_contrib.extend(before_end)
        if len(pages) != len(before_end):
            # Has hit stop date if some pages are filtered away
            break
    return lang, user_contrib
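
A hypothetical call, with a usercontribs query sketched along the lines of the standard MediaWiki list=usercontribs parameters (the endpoint, username, and cutoff date are placeholders):

from datetime import datetime
from wikitools import wiki, api

site = wiki.Wiki('https://en.wikipedia.org/w/api.php')  # example endpoint
params = {
    'action': 'query',
    'list': 'usercontribs',
    'ucuser': 'ExampleUser',       # placeholder username
    'ucprop': 'title|timestamp',   # the helper filters on 'timestamp'
    'uclimit': 500
}
stop_date = datetime(2020, 1, 1)   # keep only edits newer than this
lang, contribs = get_usercontribs('en', site, params, stop_date)
print(lang, len(contribs))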
Example #25
    def get_linked_here(self, page):
        args = {'action': 'query', 'prop': 'linkshere', 'titles': page}
        logger.debug("processing Links Here list")
        request = api.APIRequest(self.site, args)
        results = request.queryGen()
        titles = []
        for r in results:
            pages = r['query']['pages']
            key, values = pages.popitem()
            for value in values['linkshere']:
                # logger.debug(value)
                titles.append(value['title'])

        self.logger.info("Returning {:d} titles".format(len(titles)))
        return titles
Example #26
    def search_disambiguation_page(self, title):
        search_title = title + self.search_disambig
        args = {
            'action': 'query',
            'list': 'search',
            'srsearch': search_title,
            'srprop': 'size|wordcount'
        }
        request = api.APIRequest(self.site, args)
        results = request.queryGen()
        titles = []
        for r in results:
            for values in r['query']['search']:
                titles.append(values['title'])
        return titles
Example #27
def wiki_page_set_cnt(w_site, w_site_edittoken, p_title, p_content, p_summary):

    print("Upload ...")
    params = {
        'action': 'edit',
        'title': p_title,
        'summary': p_summary,
        'text': p_content,
        'token': w_site_edittoken
    }
    req = api.APIRequest(w_site, params)
    res = req.query(querycontinue=False)
    print("... done.")

    pprint.pprint(res)
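
A hypothetical end-to-end use, combining this with wiki_get_edittoken from Example #14 (the endpoint and page title are placeholders; note that action=tokens, used there, is deprecated on newer MediaWiki in favour of meta=tokens):

from wikitools import wiki

w_site = wiki.Wiki('https://en.wikipedia.org/w/api.php')  # example endpoint
token = wiki_get_edittoken(w_site)
wiki_page_set_cnt(w_site, token,
                  'User:ExampleUser/sandbox',  # placeholder page title
                  'Hello from wikitools.',     # page text
                  'test edit via the API')     # edit summary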
Example #28
def get_cords(titles):
    """Get coordinates for titles via mediawiki API."""

    par = {'action': 'query', 'titles': titles,
           'prop': 'coordinates', 'colimit': 500, 'format': 'json'}
    p_req = api.APIRequest(site, par)
    for resp in p_req.queryGen():
        for t, page in resp['query']['pages'].items():
            if 'coordinates' in page:
                print(f"{page['title']} --- N {page['coordinates'][0]['lat']}"
                      f", E {page['coordinates'][0]['lon']}")
                yield [page['title'], page['coordinates'][0]['lat'],
                       page['coordinates'][0]['lon']]
            else:
                print(f"{page['title']} --- missing cords: {titles}")
                yield [page['title'], '', '']
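
A hypothetical call, assuming site is already defined as in the earlier examples; multiple titles go into one request as a pipe-separated string:

# Hypothetical usage: coordinates for a small batch of titles.
for title, lat, lon in get_cords('Berlin|Paris|Tokyo'):
    print(title, lat, lon)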
Example #29
    def getChanges(self):
        print('getChanges from:', self.dataTS)
        with wiki.Wiki(url=self.siteurl) as mywiki:
            try:
                # Get recent edits and new titles.
                #print('get')
                urldata = {
                    'action': 'query',
                    'list': 'recentchanges',
                    'rcprop': 'title|timestamp|loginfo',
                    'rctype': 'log|edit|new',
                    'rcdir': 'newer',
                    'rcstart': self.dataTS,
                    #'continue': self.dummycontinue,
                    'rclimit': MAX_LE_OR_RC,
                    'maxlag': MAX_LAG
                }
                therequest = api.APIRequest(mywiki, urldata)
                #thedict = {}
                #print(therequest)
                for req in therequest.queryGen():
                    #if self.fnrefresh:
                        #myprint('got some data', str(len(self.changes)))
                        #self.fnrefresh('rcpages:' + str(len(self.changes)))
                    #print(req)
                    for title in req['query']['recentchanges']:
                        #print('a query')
                        if title['type'] == 'new' or title['type'] == 'edit':
                            self.addifnewer(title['title'], title['timestamp'])
                        if title['type'] == 'log':
                            if title['logaction'] == 'delete':
                                self.addifnewer(title['title'], title['timestamp'], True)
                            if title['logaction'] == 'move':
                                self.addifnewer(title['logparams']['target_title'], title['timestamp'])
                                if 'suppressredirect' in title['logparams']:
                                    self.addifnewer(title['logparams']['target_title'], title['timestamp'], True)
                                else:
                                    self.addifnewer(title['title'], title['timestamp'])
                    myprintstay('rcpages:', str(len(self.changes)))
                    self.jsondump()
                #print(thedict)
                #for b in sorted(self.changes, key=lambda x: thedict[x]['timestamp']):
                    #print(thedict[b]['timestamp'], b)
            except Exception as e:
                print(e)
                raise e
        print('Exiting getChanges')
        return True
Example #30
def loadPages(site, pages, outDir, wait=3):
    count = 0
    for page in pages:
        print >> sys.stderr, "Loading page", page, "(" + str(count+1) +"/"+str(len(pages)) + ")"
        # define the params for the query
        params = {'action':'query', 'titles':page, 'export':None}
        # create the request object
        request = api.APIRequest(site, params)
        # query the API
        result = request.query()
        
        print >> sys.stderr, "Writing result"
        f = codecs.open(os.path.join(outDir, page+".xml"), "wt", "utf-8")
        f.write( result["query"]["export"]["*"] )
        f.close()
        
        print >> sys.stderr, "Sleeping"
        time.sleep(wait)
        count += 1
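
A hypothetical call (the excerpt itself is Python 2, per its print >> syntax; the endpoint, titles, and output directory below are placeholders):

from wikitools import wiki

site = wiki.Wiki('https://en.wikipedia.org/w/api.php')  # example endpoint
loadPages(site, ['Earth', 'Moon'], '/tmp/wiki-exports', wait=3)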