Example #1
import argparse
import json
import logging

from mwclient import Site

# Page is a project-specific wrapper class defined elsewhere; a sketch of
# memory_usage_psutil follows the example.


def main():

    parser = argparse.ArgumentParser(description='Datofeilfikser')
    parser.add_argument('--page', required=False, help='Name of a single page to check')
    args = parser.parse_args()

    cnt = {'pagesChecked': 0, 'datesChecked': 0, 'datesModified': 0, 'datesUnresolved': 0}
    pagesWithNoKnownErrors = []
    unresolved = []

    with open('config.json', 'r') as config_file:
        config = json.load(config_file)

    site = Site('no.wikipedia.org')
    site.login(config['username'], config['password'])
    cat = site.Categories['Sider med kildemaler som inneholder datofeil']

    if args.page:
        page = site.pages[args.page]
        p = Page(page)

    else:
        n = 0
        for page in cat.members():
            n += 1
            logging.info('%02d %s - %.1f MB', n, page.name, memory_usage_psutil())
            # print "-----------[ %s ]-----------" % page.name
            p = Page(page)
            cnt['pagesChecked'] += 1
            cnt['datesChecked'] += p.checked
            cnt['datesModified'] += len(p.modified)
            cnt['datesUnresolved'] += len(p.unresolved)

            if len(p.modified) == 0 and len(p.unresolved) == 0:
                pagesWithNoKnownErrors.append(page.name)

            unresolved.extend(p.unresolved)

            # if cnt['pagesChecked'] > 100:
            #     break

    # print
    # print "Pages with no known templates with date errors:"
    # for p in pagesWithNoKnownErrors:
    #     print ' - %s' % p

    cnt['datesOk'] = cnt['datesChecked'] - cnt['datesModified'] - cnt['datesUnresolved']

    unresolvedTxt = u"Pages checked: %(pagesChecked)d, dates checked: %(datesChecked)d, of which<br>\n" % cnt
    unresolvedTxt += "  OK: %(datesOk)d, modified: %(datesModified)d, unresolved errors: %(datesUnresolved)d\n\n" % cnt
    unresolvedTxt += u'Unresolved errors:\n\n{|class="wikitable sortable"\n! Artikkel !! Felt !! Verdi\n|-\n'

    for p in unresolved:
        unresolvedTxt += u'| [[%(page)s]] || %(key)s || <nowiki>%(value)s</nowiki>\n|-\n' % p

    page = site.pages[u'Bruker:DanmicholoBot/Datofiks/Uløst']
    page.save(unresolvedTxt, summary='Oppdaterer')
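
# memory_usage_psutil() is not shown above; a minimal sketch (an assumption,
# not the original helper) that reports the current process's resident memory:
import psutil


def memory_usage_psutil():
    # Resident set size of the current process, converted to megabytes.
    return psutil.Process().memory_info().rss / 1024 ** 2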
Example #2
import sys

from mwclient import Site

import scrapContent as scrap


url = "sindhipedia.org"
user_name = 'Administrator'
password = '******'
page_name = sys.argv[1]

site = Site(('http', url), path='/')
site.login(user_name, password)
page = site.pages[page_name]

if len(sys.argv) > 2 and sys.argv[2] == '-d':
    print('Deleting page', page_name, '!')
    page.delete()
    sys.exit()
if page.exists:
    print('Page', page_name, 'already exists')
    sys.exit()
else:
    print('Creating page', page_name)
    print(page.can('edit'))
    # scrapDynamic returns the result in sections, so the text spread ratio (5) must be given
    text = scrap.scrapDynamic(page_name, 5)
    # print('Generator output:', text)
    page.save(text, 'Edit Summary')
    print('Created page', page_name, '!!')
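
# Usage sketch (assumed script name): python this_script.py <page_name> [-d]
#   -d deletes the named page instead of creating it.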
Example #3
import asyncio
from datetime import datetime
from time import mktime

from mwclient import Site

# get_ratings, get_users, get_kind, get_comment and Change are helpers defined
# elsewhere in this project; a sketch of get_text follows the example.


async def compile_edits(title, count_skipped):
    # Load the article
    site = Site("en.wikipedia.org")
    page = site.pages[title]
    talk = site.pages["Talk:" + title]
    ratings = get_ratings(talk)

    # Collect metadata information
    metadata = list(page.revisions())
    users = get_users(metadata)
    kind = get_kind(metadata)
    comments = get_comment(metadata)

    history = []

    # Collect the list of revision ids from the metadata pull
    revids = [rev["revid"] for rev in metadata]

    # Container for the revision texts
    texts = []

    # Gather body content of all revisions (asynchronously), in batches
    sema = 100  # number of revision texts fetched concurrently per batch
    for i in range(0, len(metadata), sema):
        texts += await asyncio.gather(*(get_text(revid, 0)
                                        for revid in revids[i:i + sema]))

    # Initialize counter for the number of skipped pages
    j = 0

    # Iterate backwards through our metadata and put together the list of change items
    for i in range(len(metadata) - 1, -1, -1):

        # Count revisions whose text could not be retrieved
        if texts[i] is None:
            j += 1

        # Find the talk-page rating in effect at the time of this revision
        time = datetime.fromtimestamp(mktime(metadata[i]["timestamp"]))
        rating = "NA"

        for item in ratings:
            if time > item[1]:
                rating = item[0]
                break

        change = Change(
            i,
            title,
            time,
            metadata[i]["revid"],
            kind[i],
            users[i],
            comments[i],
            rating,
            texts[i],
        )

        # Compile the list of changes
        history.append(change)

    if count_skipped:
        return (history, j)
    else:
        return history
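
# get_text() is not shown above; a minimal sketch (an assumption, not the
# original helper) that fetches the wikitext of one revision from the MediaWiki
# action API with aiohttp. The second argument mirrors the call site and is
# unused here.
import aiohttp

API_URL = "https://en.wikipedia.org/w/api.php"  # assumed endpoint


async def get_text(revid, attempt):
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "revids": str(revid),
        "rvprop": "content",
        "rvslots": "main",
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(API_URL, params=params) as resp:
            data = await resp.json()
    pages = data.get("query", {}).get("pages", {})
    if not pages:
        return None
    page = next(iter(pages.values()))
    revisions = page.get("revisions")
    if not revisions:
        return None  # deleted or otherwise unavailable revision
    return revisions[0]["slots"]["main"].get("*")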
Example #4
    def __init__(self, client: Site, wikipedia_api: str):
        super().__init__(client)
        self.wp_client = Site(wikipedia_api)
Example #5
import getopt
import json
import math
import re
import sys
import urllib.request

import wikitextparser as wtp
from mwclient import Site

# ACRONYMS, USER and PASSWORD are defined elsewhere in this script; a sketch of
# polygon_area follows the example.


def main(argv):
    # ------------- Constant Variables ----------------
    MERGE = True
    WORLD_AREA = math.pi * (13000 * 13000)
    MODE = "OFFLINE"
    DATA_URL = "https://githubraw.com/ccmap/data/master/land_claims.civmap.json"
    SANDBOX = False
    # ------------------------------------------------

    try:
        opts, args = getopt.getopt(
            argv, "h", ["markdown", "wiki", "offline", "sandbox", "help"])
    except getopt.GetoptError:
        print("areaCalculator.py --wiki")
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print("--markdown , --wiki , --offline , --sandbox , --help")
        elif opt == "--markdown":
            MODE = "MARKDOWN"
        elif opt == "--wiki":
            MODE = "WIKI"
        elif opt == "--offline":
            MODE = "OFFLINE"
        elif opt == "--sandbox":
            MODE = "WIKI"
            SANDBOX = True
    # Get the latest claims json
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = urllib.request.Request(url=DATA_URL, headers=headers)

    with urllib.request.urlopen(req) as url:
        data = json.loads(url.read().decode())

    # Calculate and sort the area of every polygon, combining ones from the same nation
    areas = {}
    shortnames = {}

    for feat in data["features"]:
        name = feat["name"]
        if MERGE:
            # Keep the text inside parentheses if present, otherwise the whole name
            nation = re.sub(
                r"\(|\)", "",
                re.search(r"(^[^()]+$)|\((.*)\)",
                          name.replace("\n", " ")).group())
            if "shortname" in feat:
                shortnames[nation] = feat["shortname"]

            if ACRONYMS.get(nation) is not None:
                nation = ACRONYMS.get(nation)
        else:
            nation = name

        area = 0
        if "polygon" in feat:
            for poly in feat["polygon"]:
                area += polygon_area(poly)
        else:
            print(feat)

        if nation in areas:
            areas[nation] += area
        else:
            areas[nation] = area

    areas_sorted = {}
    areas_sorted_keys = sorted(areas, key=areas.get, reverse=True)
    for w in areas_sorted_keys:
        areas_sorted[w] = areas[w]

    # Render the table

    if MODE == "MARKDOWN":
        with open('areas.md', 'w') as f:
            f.write("#|Nation|Area (km²)|% of Map Area\n")
            f.write(":---:|:---:|:---:|:---:|\n")
            f.write("{}|{}|{}|{}\n".format(0, "*CivClassic*",
                                           round(WORLD_AREA / 1000000, 3),
                                           100))

            i = 1
            for key in areas_sorted.keys():
                are = round(areas[key] / 1000000, 3)
                per = round((areas[key] / WORLD_AREA) * 100, 3)
                print(key, are)
                f.write("{}|{}|{}|{}\n".format(i, key, are, per))
                i = i + 1
    if MODE == "WIKI" or MODE == "OFFLINE":
        # Get all countries with a flag template
        flag_template_whitelist = []

        ua = "AreaListCalculator/0.0.1 Smal"
        site = Site('civwiki.org', clients_useragent=ua)

        category = site.categories['All country data templates']
        for page in category:
            flag_template_whitelist.append(
                page.name[len("Template:Country data") + 1:])

        # Generate the wiki table
        new_table = ""
        new_table += "{| class=\"wikitable sortable\"\n|+\n!Rank\n!Nation\n!Area in km²\n!% of Map Area\n|-\n"
        new_table += ("|-\n|{}\n|{}\n|{}\n|{}\n".format(
            0, "''[[CivClassic]]''", round(WORLD_AREA / 1000000, 3), 100))
        i = 1
        for key in areas_sorted.keys():
            are = round(areas[key] / 1000000, 3)
            per = round((areas[key] / WORLD_AREA) * 100, 3)
            #print(key,are)
            nation_txt = "[[{}]]".format(key)
            if key in flag_template_whitelist:
                nation_txt = "{{{{flag|{}}}}}".format(key)
            elif key in shortnames:
                if shortnames[key] in flag_template_whitelist:
                    nation_txt = "{{{{flag|{}}}}}".format(shortnames[key])
            new_table += "|-\n|{}\n|{}\n|{}\n|{}\n".format(
                i, nation_txt, are, per)
            i = i + 1
        new_table += "|}"

        # Upload the table to civwiki
        if not SANDBOX:
            page = site.pages['List_of_nations_by_area']
        else:
            page = site.pages['List_of_nations_by_area/Sandbox']
        text = page.text()
        parsed = wtp.parse(text)

        for section in parsed.sections:
            if section.title == "Nations by area":
                section.contents = new_table
        print(parsed.string)
        if MODE == "OFFLINE":
            with open('areas.txt', 'w') as f:
                f.write(parsed.string)
        else:
            site.login(USER, PASSWORD)
            page.edit(parsed.string, "Automated Table Update")
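
# polygon_area() is not shown above; a minimal sketch (an assumption, not the
# original helper) using the shoelace formula on a list of [x, z] point pairs:
def polygon_area(points):
    area = 0.0
    n = len(points)
    for i in range(n):
        x1, y1 = points[i]
        x2, y2 = points[(i + 1) % n]
        area += x1 * y2 - x2 * y1
    return abs(area) / 2.0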
Example #6
    def __init__(self,
                 host='en.wikipedia.org',
                 user_agent='Wnow?/1.0 ([email protected])'):
        self.site = Site(host, clients_useragent=user_agent)
Example #7
import pandas as pd
import wikipedia as wk  # assumption: wk refers to the `wikipedia` package
from mwclient import Site
from tqdm import tqdm

# APIError and PageNotExists are custom exceptions defined elsewhere in this project.


class Wiki:
    def __init__(self,
                 host='en.wikipedia.org',
                 user_agent='Wnow?/1.0 ([email protected])'):
        self.site = Site(host, clients_useragent=user_agent)

    def get_id(self, title):
        return wk.page(title=title).pageid

    def get_title(self, pageid):
        return wk.page(pageid=pageid).title

    # This method returns the summary provided by wk.summary()
    # **kwargs could be either the title of the page or its pageid
    def get_summary(self, **kwargs) -> str:
        try:
            if 'title' in kwargs:
                return wk.summary(title=kwargs['title'])
            if 'pageid' in kwargs:
                return wk.page(pageid=kwargs['pageid']).summary
        except:
            print('\tSummary not available')
            raise APIError

    # This method returns the content provided by wk.page[].content
    # **kwargs could be either the title of the page or its pageid
    def get_content(self, **kwargs) -> str:
        try:
            if 'title' in kwargs:
                return wk.page(title=kwargs['title']).content
            if 'pageid' in kwargs:
                return wk.page(pageid=kwargs['pageid']).content
        except:
            print('\tContent not available')
            return 'Content not available'

    # This method returns the object mwclient.page.Page
    # **kwargs could be either the title of the page or its pageid
    def get_page(self, **kwargs):
        try:
            if 'title' in kwargs:
                return self.site.pages[kwargs['title']]
            if 'pageid' in kwargs:
                return self.site.pages[kwargs['pageid']]
        except:
            raise APIError

    # This method builds the url to the page given its title
    def get_page_link(self, title) -> str:
        return 'en.wikipedia.org/wiki/' + title.replace(' ', '%20')

    # This method gets the recent changes list using mwclient.Site.api()
    # It filters pages in namespace 0 and gets only pages created or modified
    def __recentchanges_list(self, limit, start, end) -> pd.DataFrame:
        try:
            rc = self.site.api('query',
                               list='recentchanges',
                               rclimit=limit,
                               rcstart=start,
                               rcend=end,
                               rctype='new|edit',
                               rcnamespace='0')
        except:
            raise APIError
        r = pd.DataFrame(data=rc['query']['recentchanges'])
        r.drop(columns=['ns', 'revid', 'old_revid', 'rcid', 'timestamp'],
               inplace=True)
        return r

    # This method gets the recent changes by calling __recentchanges_list(..)
    # Attribute rclimit sets the maximum number of recent changes to fetch; pass 'max' for the largest value the MediaWiki API permits
    # Attributes rcstart and rcend set the time range for the recent changes; rcstart must be greater (more recent) than rcend
    def recentchanges(self, rclimit, rcstart, rcend) -> pd.DataFrame:
        images = []
        summaries = []
        links = []
        try:
            result = self.__recentchanges_list(limit=rclimit,
                                               start=rcstart,
                                               end=rcend)
        except:
            print('\tAn API error occurred while retrieving recent changes')
            raise APIError
        for pageid in result['pageid']:
            try:
                page = self.get_page(
                    pageid=pageid)  # get the page from the pageid provided
                if not page.exists:
                    raise PageNotExists
            except APIError:
                print('\tAn API error occurred while retrieving a single page')
                result.query(
                    'pageid != ' + str(pageid), inplace=True
                )  # if an API error occurs, remove the pageid of the page that caused the error from the recent changes list
                continue
            except PageNotExists:
                result.query(
                    'pageid != ' + str(pageid), inplace=True
                )  # if a PageNotExists error occurs, remove the pageid of the page that caused the error from the recent changes list
                continue
            try:
                summary = self.get_summary(
                    pageid=pageid
                )  # get the summary of the page given the pageid
                if not summary:  # if summary is empty (there's no summary), raise error
                    raise PageNotExists
                summaries.append(
                    summary)  # insert summary into the list summaries
            except:
                result.query(
                    'pageid != ' + str(pageid), inplace=True
                )  # if a PageNotExists error occurs, remove the pageid of the page that caused the error from the recent changes list
                continue
            try:
                images.append(
                    next(page.images(generator=True)).imageinfo['url']
                )  # get the url of the first image on the page via mwclient.page.Page.images()
            except:
                images.append(
                    'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a7/Wikipedia_logo_v3.svg/1024px-Wikipedia_logo_v3.svg.png'
                )  # append a default image (Wikipedia logo)
            try:
                links.append(self.get_page_link(
                    page.name))  # build the page link
            except:
                links.append('en.wikipedia.org/wiki/Main_Page'
                             )  # if an error occurs, append a default link
        result.insert(3, column='image', value=images)
        result.insert(4, column='link', value=links)
        result.insert(5, column='summary', value=summaries)
        return result

    # This method returns a dictionary containing pages from the category provided
    # According to MediaWiki API's syntax, category must be like 'Category:mycategory'
    # Attribute pages_num specifies the number of pages that at most will be returned
    def get_raw_category_pages(self, category, pages_num):
        search_list = [
            category
        ]  # make the list which will contain all the subcategories found recursively in category
        page_set = []
        with tqdm(total=pages_num,
                  desc=category) as cbar:  # display progress bar
            while search_list and len(
                    page_set
            ) <= pages_num:  # while search_list is not empty and the number of pages is less than required
                query_result = self.site.api('query',
                                             list='categorymembers',
                                             cmtitle=search_list.pop(0),
                                             cmprop='title',
                                             cmtype='page|subcat',
                                             cmsort='timestamp',
                                             cmlimit='max')
                for element in query_result['query'][
                        'categorymembers']:  # for each page/category in the query's result
                    if len(
                            page_set
                    ) >= pages_num:  # the number of pages is greater than required
                        break
                    elif 'Category:' in element[
                            'title']:  # element is a category
                        search_list.append(
                            element['title']
                        )  # push the category found into the categories list
                    else:  # element is a page
                        try:
                            summary = wk.summary(
                                element['title'],
                                sentences=3)  # request page's summary
                            if summary:  # if summary is not empty
                                page_set.append(summary)  # append summary
                                cbar.update(1)  # increment progress bar
                        except:
                            continue  # if an error occurs while querying the API for the summary, skip it
        category = category.replace(
            'Category:',
            '')  # get rid of Category: prefix in attribute category provided
        return {
            'text': page_set,
            'category': category
        }  # return a dictionary made up of all pages' summaries and the category label
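
# A minimal usage sketch for the class above (the timestamps are illustrative;
# rcstart must be the more recent of the two, as noted in recentchanges()):
if __name__ == '__main__':
    wiki = Wiki()
    changes = wiki.recentchanges(rclimit='max',
                                 rcstart='2021-01-02T00:00:00Z',
                                 rcend='2021-01-01T00:00:00Z')
    print(changes.head())
    pages = wiki.get_raw_category_pages('Category:Physics', 10)
    print(pages['category'], len(pages['text']))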
Example #8
import datetime
from mwclient import Site

site = Site('https://lol.gamepedia.com', path="/")  # Set wiki
site.login('RheingoldRiver@BotPasswordName', 'smldrgsrthmldyhj')

limit = -1

now = datetime.datetime.utcnow()
now_timestamp = now.isoformat()
then = now - datetime.timedelta(hours=4)  # change hours if needed
last_timestamp = then.isoformat()

revisions = site.api('query',
                     format='json',
                     list='recentchanges',
                     rcstart=now_timestamp,
                     rcend=last_timestamp,
                     rcprop='title|ids',
                     rclimit='max',
                     rcdir='older')

pages = []
pages_used = {}
revs = {}
failed_pages = []

for revision in revisions['query']['recentchanges']:
    revs[revision['revid']] = True
    if revision['title'] in pages_used:
        pass