import argparse
import json
import logging

from mwclient import Site

# Page and memory_usage_psutil are this bot's own helpers, defined elsewhere in the script.


def main():
    # "Datofeilfikser" is Norwegian for "date error fixer"
    parser = argparse.ArgumentParser(description='Datofeilfikser')
    parser.add_argument('--page', required=False, help='Name of a single page to check')
    args = parser.parse_args()

    cnt = {'pagesChecked': 0, 'datesChecked': 0, 'datesModified': 0, 'datesUnresolved': 0}
    pagesWithNoKnownErrors = []
    unresolved = []

    with open('config.json', 'r') as f:
        config = json.load(f)

    site = Site('no.wikipedia.org')
    site.login(config['username'], config['password'])
    # Category: "Pages with citation templates containing date errors"
    cat = site.categories['Sider med kildemaler som inneholder datofeil']

    if args.page:
        page = site.pages[args.page]
        p = Page(page)
    else:
        n = 0
        for page in cat.members():
            n += 1
            logging.info('%02d %s - %.1f MB', n, page.name, memory_usage_psutil())
            # print("-----------[ %s ]-----------" % page.name)
            p = Page(page)
            cnt['pagesChecked'] += 1
            cnt['datesChecked'] += p.checked
            cnt['datesModified'] += len(p.modified)
            cnt['datesUnresolved'] += len(p.unresolved)
            if len(p.modified) == 0 and len(p.unresolved) == 0:
                pagesWithNoKnownErrors.append(page.name)
            unresolved.extend(p.unresolved)
            # if cnt['pagesChecked'] > 100:
            #     break

    # print("Pages with no known templates with date errors:")
    # for p in pagesWithNoKnownErrors:
    #     print(' - %s' % p)

    cnt['datesOk'] = cnt['datesChecked'] - cnt['datesModified'] - cnt['datesUnresolved']

    unresolvedTxt = u"Pages checked: %(pagesChecked)d, dates checked: %(datesChecked)d, of which<br>\n" % cnt
    unresolvedTxt += " OK: %(datesOk)d, modified: %(datesModified)d, unresolved errors: %(datesUnresolved)d\n\n" % cnt
    # Table headers: Artikkel / Felt / Verdi = article / field / value
    unresolvedTxt += u'Unresolved errors:\n\n{|class="wikitable sortable"\n! Artikkel !! Felt !! Verdi\n|-\n'
    for p in unresolved:
        unresolvedTxt += u'| [[%(page)s]] || %(key)s || <nowiki>%(value)s</nowiki>\n|-\n' % p
    page = site.pages[u'Bruker:DanmicholoBot/Datofiks/Uløst']
    page.save(unresolvedTxt, summary='Oppdaterer')  # "Oppdaterer" = "Updating"
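
# Hypothetical entry point (not shown in the original snippet): the usual
# __main__ guard, with basic logging configured so the per-page progress
# lines emitted above are visible.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    main()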
import sys

from mwclient import Site

import scrapContent as scrap

url = "sindhipedia.org"
user_name = 'Administrator'
password = '******'
page_name = sys.argv[1]

site = Site(('http', url), path='/')
site.login(user_name, password)
page = site.pages[page_name]

if len(sys.argv) > 2 and sys.argv[2] == '-d':
    print('Deleting Page !', page_name)
    page.delete()
    sys.exit()

if page.exists:
    print('Page', page_name, 'Already exists')
    sys.exit()
else:
    print("Creating Page", page_name)
    print(page.can('edit'))
    # The result comes in sections, so you have to define the text spread ratio.
    text = scrap.scrapDynamic(page_name, 5)
    # print("Generator Output: ", text)
    page.save(text, 'Edit Summary')
    print('Created Page', page_name, '!!')
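
# Example invocations (the script name is hypothetical; the page title comes from argv[1]):
#   python sindhipedia_pages.py "Some Page"       # scrape content and create the page
#   python sindhipedia_pages.py "Some Page" -d    # delete the page instead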
import asyncio
from datetime import datetime
from time import mktime

from mwclient import Site

# get_ratings, get_users, get_kind, get_comment, get_text and Change are this
# module's own helpers, defined elsewhere.


async def compile_edits(title, count_skipped):
    # Load the article
    site = Site("en.wikipedia.org")
    page = site.pages[title]
    talk = site.pages["Talk:" + title]
    ratings = get_ratings(talk)

    # Collect metadata information
    metadata = [rev for rev in page.revisions()]
    users = get_users(metadata)
    kind = get_kind(metadata)
    comments = get_comment(metadata)

    revids = []
    history = []

    # Collect list of revision ids using the metadata pull
    for i in range(len(metadata)):
        revids.append(metadata[i]["revid"])

    # Container for the revision texts
    texts = []

    # Gather body content of all revisions (asynchronously), in batches of `sema`
    sema = 100
    for i in range(0, len(metadata), sema):
        texts += await asyncio.gather(*(get_text(revid, 0) for revid in revids[i:i + sema]))

    # Initialize counter for the number of skipped pages
    j = 0

    # Iterate backwards through our metadata and put together the list of change items
    for i in range(len(metadata) - 1, -1, -1):
        # Count deleted pages
        if texts[i] is None:
            j += 1

        # Iterate against talk page editions
        time = datetime.fromtimestamp(mktime(metadata[i]["timestamp"]))
        rating = "NA"
        for item in ratings:
            if time > item[1]:
                rating = item[0]
                break

        change = Change(
            i,
            title,
            time,
            metadata[i]["revid"],
            kind[i],
            users[i],
            comments[i],
            rating,
            texts[i],
        )

        # Compile the list of changes
        history.append(change)

    if count_skipped:
        return (history, j)
    else:
        return history
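
# get_text is not shown above. A minimal sketch under the assumption that it
# fetches one revision's wikitext from the MediaWiki API with aiohttp; the
# second parameter is a guess from the call site (a constant 0 is passed in):
import aiohttp

async def get_text(revid, retry):
    params = {
        "action": "query", "format": "json", "prop": "revisions",
        "revids": str(revid), "rvprop": "content", "rvslots": "main",
    }
    async with aiohttp.ClientSession() as session:
        async with session.get("https://en.wikipedia.org/w/api.php", params=params) as resp:
            data = await resp.json()
    for page in data["query"].get("pages", {}).values():
        if "revisions" in page:
            return page["revisions"][0]["slots"]["main"]["*"]
    return None  # revision deleted or suppressed; compile_edits counts these as skipped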
def __init__(self, client: Site, wikipedia_api: str):
    super().__init__(client)
    self.wp_client = Site(wikipedia_api)
import getopt
import json
import math
import re
import sys
import urllib.request

import wikitextparser as wtp
from mwclient import Site

# ACRONYMS, polygon_area, USER and PASSWORD are defined elsewhere in this script.


def main(argv):
    # ------------- Constant Variables ----------------
    MERGE = True
    WORLD_AREA = math.pi * (13000 * 13000)
    MODE = "OFFLINE"
    DATA_URL = "https://githubraw.com/ccmap/data/master/land_claims.civmap.json"
    SANDBOX = False
    # --------------------------------------------------

    try:
        opts, args = getopt.getopt(
            argv, "h", ["markdown", "wiki", "offline", "sandbox", "help"])
    except getopt.GetoptError:
        print("areaCalculator.py --wiki")
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print("--markdown , --wiki , --offline , --sandbox , --help")
        if opt == "--markdown":
            MODE = "MARKDOWN"
        if opt == "--wiki":
            MODE = "WIKI"
        if opt == "--offline":
            MODE = "OFFLINE"
        if opt == "--sandbox":
            MODE = "WIKI"
            SANDBOX = True

    # Get the latest claims json
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=DATA_URL, headers=headers)
    with urllib.request.urlopen(req) as url:
        data = json.loads(url.read().decode())

    # Calculate and sort the area of every polygon, combining ones from the same nation
    areas = {}
    shortnames = {}
    for feat in data["features"]:
        name = feat["name"]
        if MERGE:
            # Use the text in parentheses (the nation) if present, otherwise the whole name
            nation = re.sub(
                r"\(|\)", "",
                re.search(r"(^[^()]+$)|\((.*)\)", name.replace("\n", " ")).group())
            if "shortname" in feat:
                shortnames[nation] = feat["shortname"]
            if ACRONYMS.get(nation) is not None:
                nation = ACRONYMS.get(nation)
        else:
            nation = name
        area = 0
        if "polygon" in feat:
            for poly in feat["polygon"]:
                area += polygon_area(poly)
        else:
            print(feat)
        if nation in areas:
            areas[nation] += area
        else:
            areas[nation] = area

    areas_sorted = {}
    areas_sorted_keys = sorted(areas, key=areas.get, reverse=True)
    for w in areas_sorted_keys:
        areas_sorted[w] = areas[w]

    # Render the table
    if MODE == "MARKDOWN":
        with open('areas.md', 'w') as f:
            f.write("#|Nation|Area (km²)|% of Map Area\n")
            f.write(":---:|:---:|:---:|:---:|\n")
            f.write("{}|{}|{}|{}\n".format(0, "*CivClassic*",
                                           round(WORLD_AREA / 1000000, 3), 100))
            i = 1
            for key in areas_sorted.keys():
                are = round(areas[key] / 1000000, 3)
                per = round((areas[key] / WORLD_AREA) * 100, 3)
                print(key, are)
                f.write("{}|{}|{}|{}\n".format(i, key, are, per))
                i = i + 1

    if MODE == "WIKI" or MODE == "OFFLINE":
        # Get all countries with a flag template
        flag_template_whitelist = []
        ua = "AreaListCalculator/0.0.1 Smal"
        site = Site('civwiki.org', clients_useragent=ua)
        category = site.categories['All country data templates']
        for page in category:
            flag_template_whitelist.append(
                page.name[len("Template:Country data") + 1:])

        # Generate the wiki table
        new_table = ""
        new_table += "{| class=\"wikitable sortable\"\n|+\n!Rank\n!Nation\n!Area in km²\n!% of Map Area\n|-\n"
        new_table += "|-\n|{}\n|{}\n|{}\n|{}\n".format(
            0, "''[[CivClassic]]''", round(WORLD_AREA / 1000000, 3), 100)
        i = 1
        for key in areas_sorted.keys():
            are = round(areas[key] / 1000000, 3)
            per = round((areas[key] / WORLD_AREA) * 100, 3)
            # print(key, are)
            nation_txt = "[[{}]]".format(key)
            if key in flag_template_whitelist:
                nation_txt = "{{{{flag|{}}}}}".format(key)
            elif key in shortnames:
                if shortnames[key] in flag_template_whitelist:
                    nation_txt = "{{{{flag|{}}}}}".format(shortnames[key])
            new_table += "|-\n|{}\n|{}\n|{}\n|{}\n".format(i, nation_txt, are, per)
            i = i + 1
        new_table += "|}"

        # Upload the table to civwiki
        if not SANDBOX:
            page = site.pages['List_of_nations_by_area']
        else:
            page = site.pages['List_of_nations_by_area/Sandbox']
        text = page.text()
        parsed = wtp.parse(text)
        for section in parsed.sections:
            if section.title == "Nations by area":
                section.contents = new_table
        print(parsed.string)
        if MODE == "OFFLINE":
            with open('areas.txt', 'w') as f:
                f.write(parsed.string)
        else:
            site.login(USER, PASSWORD)
            page.edit(parsed.string, "Automated Table Update")
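
# Hypothetical entry point (not part of the original snippet): getopt-style
# scripts are usually invoked with the program name stripped from argv.
if __name__ == "__main__":
    main(sys.argv[1:])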
import pandas as pd
import wikipedia as wk  # assumed: `wk` is the `wikipedia` package
from mwclient import Site
from tqdm import tqdm

# APIError and PageNotExists are this module's own exceptions, defined elsewhere.


class Wiki:
    def __init__(self, host='en.wikipedia.org', user_agent='Wnow?/1.0 ([email protected])'):
        self.site = Site(host, clients_useragent=user_agent)

    def get_id(self, title):
        return wk.page(title=title).pageid

    def get_title(self, pageid):
        return wk.page(pageid=pageid).title

    # This method returns the summary provided by wk.summary()
    # **kwargs could be either the title of the page or its pageid
    def get_summary(self, **kwargs) -> str:
        try:
            if 'title' in kwargs:
                return wk.summary(title=kwargs['title'])
            if 'pageid' in kwargs:
                return wk.page(pageid=kwargs['pageid']).summary
        except Exception:
            print('\tSummary not available')
            raise APIError

    # This method returns the content provided by wk.page[].content
    # **kwargs could be either the title of the page or its pageid
    def get_content(self, **kwargs) -> str:
        try:
            if 'title' in kwargs:
                return wk.page(title=kwargs['title']).content
            if 'pageid' in kwargs:
                return wk.page(pageid=kwargs['pageid']).content
        except Exception:
            print('\tContent not available')
            return 'Content not available'

    # This method returns the object mwclient.page.Page
    # **kwargs could be either the title of the page or its pageid
    def get_page(self, **kwargs):
        try:
            if 'title' in kwargs:
                return self.site.pages[kwargs['title']]
            if 'pageid' in kwargs:
                return self.site.pages[kwargs['pageid']]
        except Exception:
            raise APIError

    # This method builds the url to the page given its title
    def get_page_link(self, title) -> str:
        return 'en.wikipedia.org/wiki/' + title.replace(' ', '%20')

    # This method gets the recent changes list using mwclient.Site.api()
    # It filters pages in namespace 0 and gets only pages created or modified
    def __recentchanges_list(self, limit, start, end) -> pd.DataFrame:
        try:
            rc = self.site.api('query',
                               list='recentchanges',
                               rclimit=limit,
                               rcstart=start,
                               rcend=end,
                               rctype='new|edit',
                               rcnamespace='0')
        except Exception:
            raise APIError
        r = pd.DataFrame(data=rc['query']['recentchanges'])
        r.drop(columns=['ns', 'revid', 'old_revid', 'rcid', 'timestamp'],
               inplace=True)
        return r

    # This method gets the recent changes by calling __recentchanges_list(..)
    # Attribute rclimit sets the maximum number of recent changes to get; to use
    # the maximum value permitted by the MediaWiki API, pass 'max'
    # Attributes rcstart and rcend set the time range in which to get recent
    # changes; rcstart must be greater than rcend
    def recentchanges(self, rclimit, rcstart, rcend) -> pd.DataFrame:
        images = []
        summaries = []
        links = []
        try:
            result = self.__recentchanges_list(limit=rclimit,
                                               start=rcstart,
                                               end=rcend)
        except Exception:
            print('\tAn API error occurred while retrieving recent changes')
            raise APIError
        for pageid in result['pageid']:
            try:
                # Get the page from the pageid provided
                page = self.get_page(pageid=pageid)
                if not page.exists:
                    raise PageNotExists
            except APIError:
                print('\tAn API error occurred while retrieving a single page')
                # If an API error occurs, remove the pageid of the page that
                # caused the error from the recent changes list
                result.query('pageid != ' + str(pageid), inplace=True)
                continue
            except PageNotExists:
                # If a PageNotExists error occurs, remove the pageid of the page
                # that caused the error from the recent changes list
                result.query('pageid != ' + str(pageid), inplace=True)
                continue
            try:
                # Get the summary of the page given the pageid
                summary = self.get_summary(pageid=pageid)
                if not summary:
                    # If the summary is empty (there is no summary), raise an error
                    raise PageNotExists
                summaries.append(summary)  # insert summary into the list summaries
            except Exception:
                result.query('pageid != ' + str(pageid), inplace=True)
                continue
            try:
                # Get the first image url from the page via mwclient.page.Page.images()
                images.append(next(page.images(generator=True)).imageinfo['url'])
            except Exception:
                # Append a default image (Wikipedia logo)
                images.append(
                    'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a7/Wikipedia_logo_v3.svg/1024px-Wikipedia_logo_v3.svg.png'
                )
            try:
                links.append(self.get_page_link(page.name))  # build the page link
            except Exception:
                # If an error occurs, append a default link
                links.append('en.wikipedia.org/wiki/Main_Page')
        result.insert(3, column='image', value=images)
        result.insert(4, column='link', value=links)
        result.insert(5, column='summary', value=summaries)
        return result

    # This method returns a dictionary containing pages from the category provided
    # According to the MediaWiki API's syntax, category must be like 'Category:mycategory'
    # Attribute pages_num specifies the maximum number of pages that will be returned
    def get_raw_category_pages(self, category, pages_num):
        # Make the list which will contain all the subcategories found recursively in category
        search_list = [category]
        page_set = []
        with tqdm(total=pages_num, desc=category) as cbar:  # display progress bar
            # While search_list is not empty and the number of pages is less than required
            while search_list and len(page_set) <= pages_num:
                query_result = self.site.api('query',
                                             list='categorymembers',
                                             cmtitle=search_list.pop(0),
                                             cmprop='title',
                                             cmtype='page|subcat',
                                             cmsort='timestamp',
                                             cmlimit='max')
                # For each page/category in the query's result
                for element in query_result['query']['categorymembers']:
                    if len(page_set) >= pages_num:
                        # The number of pages is greater than required
                        break
                    elif 'Category:' in element['title']:
                        # element is a category: push it onto the categories list
                        search_list.append(element['title'])
                    else:
                        # element is a page
                        try:
                            # Request the page's summary
                            summary = wk.summary(element['title'], sentences=3)
                            if summary:  # if the summary is not empty
                                page_set.append(summary)  # append summary
                                cbar.update(1)  # increment progress bar
                        except Exception:
                            # If an error occurs while querying the API for the summary, skip it
                            continue
        # Get rid of the Category: prefix in the attribute category provided
        category = category.replace('Category:', '')
        # Return a dictionary made up of all pages' summaries and the category label
        return {'text': page_set, 'category': category}
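
# A minimal usage sketch for the Wiki class above (the timestamps are
# illustrative; rcstart must be greater, i.e. more recent, than rcend):
wiki = Wiki()
changes = wiki.recentchanges(rclimit='max',
                             rcstart='2023-01-02T00:00:00Z',
                             rcend='2023-01-01T00:00:00Z')
print(changes[['title', 'summary', 'link']].head())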
import datetime

from mwclient import Site

site = Site('https://lol.gamepedia.com', path="/")  # Set wiki
site.login('RheingoldRiver@BotPasswordName', 'smldrgsrthmldyhj')

limit = -1
now = datetime.datetime.utcnow()
now_timestamp = now.isoformat()
then = now - datetime.timedelta(hours=4)  # change hours if needed
last_timestamp = then.isoformat()

revisions = site.api('query',
                     format='json',
                     list='recentchanges',
                     rcstart=now_timestamp,
                     rcend=last_timestamp,
                     rcprop='title|ids',
                     rclimit='max',
                     rcdir='older')

pages = []
pages_used = {}
revs = {}
failed_pages = []

for revision in revisions['query']['recentchanges']:
    revs[revision['revid']] = True
    if revision['title'] in pages_used:
        pass