Example #1
0
def ListToParam( list ):
    """Convert a list of unicode strings into a UTF8 string separated by the
    '|' symbol.

    Accepted item types: basestring, int, wikipedia.Page, and User objects
    (recognized by class name so userlib is only imported when needed).

    Raises wikipedia.Error if a string item contains '|' (the separator
    itself) or if an item is of an unsupported type.
    """
    items = ConvToList(list)
    if not items:
        return ''

    # Collect encoded parts and join once at the end -- avoids the
    # quadratic behaviour of repeated string concatenation.
    parts = []
    for item in items:
        if isinstance(item, basestring):
            # items may not contain the separator symbol '|'
            if u'|' in item:
                raise wikipedia.Error(u"item '%s' contains '|' symbol" % item)
            parts.append(ToUtf8(item))
        elif isinstance(item, int):
            parts.append(ToUtf8(item))
        elif isinstance(item, wikipedia.Page):
            parts.append(ToUtf8(item.title()))
        elif item.__class__.__name__ == 'User':
            # delay loading this until it is needed
            import userlib
            parts.append(ToUtf8(item.name()))
        else:
            raise wikipedia.Error(u'unknown item class %s'
                                  % item.__class__.__name__)

    return u'|'.join(parts)
Example #2
0
def categoryAllElementsAPI(CatName,
                           cmlimit=5000,
                           categories_parsed=None,
                           site=None):
    # action=query&list=categorymembers&cmlimit=500&cmtitle=Category:License_tags
    """Load all the elements of a category using the APIs, recursing into
    subcategories.  Limit: 5000 elements.

    CatName           -- category title including the namespace prefix.
    cmlimit           -- maximum number of members to request; an Error is
                         raised if the category has that many members.
    categories_parsed -- category titles already visited, used to avoid
                         loops while recursing.  A fresh list is created
                         for each top-level call; the original used a
                         mutable default argument, which was shared between
                         separate calls.
    site              -- wiki passed through to query.GetData.

    Returns a (members, categories_parsed) tuple.
    """
    if categories_parsed is None:
        categories_parsed = []
    pywikibot.output("Loading %s..." % CatName)

    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmlimit': cmlimit,
        'cmtitle': CatName,
    }

    data = query.GetData(params, site)
    categories_parsed.append(CatName)
    try:
        members = data['query']['categorymembers']
    except KeyError:
        # A missing result usually means the request was too big; retry
        # once with the smaller default API limit before giving up.
        # (The original retry dropped categories_parsed and site.)
        if int(cmlimit) != 500:
            pywikibot.output(
                u'An Error occured, trying to reload the category.')
            return categoryAllElementsAPI(CatName, cmlimit=500,
                                          categories_parsed=categories_parsed,
                                          site=site)
        else:
            raise pywikibot.Error(data)
    if len(members) == int(cmlimit):
        raise pywikibot.Error(
            u'The category selected has >= %s elements, limit reached.' %
            cmlimit)
    allmembers = members
    # Recurse into subcategories (namespace 14) that were not seen yet.
    for subcat in members:
        if subcat['ns'] == 14:
            title = subcat['title']
            if title not in categories_parsed:
                categories_parsed.append(title)
                (results_part, categories_parsed) = categoryAllElementsAPI(
                    title, 5000, categories_parsed, site)
                allmembers.extend(results_part)
    # Return a copy so callers may modify the result freely.
    return (list(allmembers), categories_parsed)
Example #3
0
    def query_api(self, host, path, **kwargs):
        """Send a query to a query.php/api.php endpoint and return the
        decoded JSON response.

        host   -- value for the HTTP Host header.
        path   -- endpoint path; must end in 'query.php' (GET) or
                  'api.php' (POST).
        kwargs -- query parameters, UTF-8 encoded before sending.

        Retries after a stale connection and after a transient database
        connection error reported by the API.  Raises ValueError for an
        unknown endpoint and wikipedia.Error for any other API error.
        """
        data = urlencode([(k, v.encode('utf-8')) for k, v in kwargs.iteritems()])
        if path.endswith('query.php'):
            query_string = '%s?format=json&%s' % (path, data)
            method = 'GET'
            data = ''
        elif path.endswith('api.php'):
            query_string = '%s?format=json' % path
            method = 'POST'
        else:
            # BUG FIX: the original referenced an undefined name 'api' here,
            # which raised NameError instead of the intended ValueError.
            raise ValueError('Unknown api %s' % repr(path))

        try:
            res = self.request(method, query_string,
                {'Host': host, 'Content-Type': 'application/x-www-form-urlencoded'}, data)
        except httplib.ImproperConnectionState:
            # Stale connection: reconnect and retry the whole query.
            # BUG FIX: the original fell through here with 'res' unbound,
            # so json.load(res) below raised NameError.
            self._conn.close()
            self.__init__(self.host)
            return self.query_api(host, path, **kwargs)
        try:
            data = json.load(res)
        finally:
            res.close()

        if 'error' in data:
            # Transient DB errors are retried; everything else is fatal.
            if data['error']['code'] == u'internal_api_error_DBConnectionError':
                return self.query_api(host, path, **kwargs)
            raise wikipedia.Error(data['error']['code'],
                data['error']['info'])

        return data
Example #4
0
def ListToParam(list):
    """Convert a list of unicode strings into a UTF8 string separated by the
    '|' symbols.

    Raises wikipedia.Error if an item contains the '|' separator.
    """
    list = ConvToList(list)
    if len(list) == 0:
        return ''

    # items may not contain the separator symbol '|'
    parts = []
    for item in list:
        # BUG FIX: the original only checked plain str items
        # (type(l) == str), so a unicode item containing '|' slipped
        # through and corrupted the resulting parameter string.
        if isinstance(item, basestring) and u'|' in item:
            raise wikipedia.Error("item '%s' contains '|' symbol" % item)
        parts.append(ToUtf8(item))
    # join once instead of quadratic '+=' concatenation
    return u'|'.join(parts)
Example #5
0
    def getCategoryMembers(self, page, min=0, step=50):
        """Yield the members of the given category page from the database.

        Members in the category namespace (14) are yielded as
        catlib.Category objects; everything else as wikipedia.Page objects.

        Raises wikipedia.Error if 'page' is not a category page.
        """
        if page.namespace() != 14:
            raise wikipedia.Error(
                "%s is not in category namespace '%s'" %
                (page.__repr__(), page.site().category_namespace()))

        dbname = page.site().dbName()
        q = """ SELECT page_namespace, page_title
                FROM %s.categorylinks
                LEFT JOIN %s.page
                ON page_id = cl_from
                WHERE cl_to=%%s """ % (dbname, dbname)

        key = page.titleWithoutNamespace(True).encode('utf-8')
        for row in self._generate(q, min, step, key):
            title = row['page_title'].decode('utf-8')
            if row['page_namespace'] == 14:
                yield catlib.Category(
                    page.site(),
                    page.site().category_namespace() + ':' + title,
                    page.site())
            else:
                yield wikipedia.Page(page.site(), title, page.site(),
                                     row['page_namespace'])
Example #6
0
def main(*args):
    """Parse command-line arguments, build the replacement list, the page
    generator and the exception patterns, then run the ReplaceRobot.

    Replacements come either from pairs of positional arguments on the
    command line, from interactive prompts, or from a predefined entry in
    'fixes.fixes' (via -fix).  Pages to work on come from -xml, -sql,
    -page, or any generator the pagegenerators factory understands.
    """
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title': [],
        'text-contains': [],
        'inside': [],
        'inside-tags': [],
        'require-title': [],  # using a separate requirements dict needs some
    }  # major refactoring of code.

    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Will become True if the user inputs the commandline parameter -dotall
    dotall = False
    # Will become True if the user inputs the commandline parameter -multiline
    multiline = False
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This is the maximum number of pages to load per query
    maxquerysize = 60
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    editSummary = pywikibot.translate(pywikibot.getSite(), msg)
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU
    sleep = None
    # Do not save the page titles, rather work on wiki
    titlefile = None
    filename = None
    # If we save, primary behaviour is append rather then new file
    append = True

    # Read commandline parameters.
    # NOTE: prefix checks like '-xml' must come after longer prefixes such
    # as '-xmlstart' (same for '-savenew' before '-save') -- order matters.
    for arg in pywikibot.handleArgs(*args):
        if arg == '-regex':
            regex = True
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(
                    pywikibot.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-savenew'):
            append = False
            if len(arg) == 8:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles \n(will be deleted if exists):'
                )
            else:
                filename = arg[9:]
        elif arg.startswith('-save'):
            if len(arg) == 5:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles:')
            else:
                filename = arg[6:]
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg == '-dotall':
            dotall = True
        elif arg == '-multiline':
            multiline = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-summary:'):
            editSummary = arg[9:]
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        elif arg.startswith('-query:'):
            maxquerysize = int(arg[7:])
        else:
            # Anything the generator factory does not recognize is treated
            # as an old/new replacement text.
            if not genFactory.handleArg(arg):
                commandline_replacements.append(arg)

    # Replacement texts are given as old/new pairs, so an even count is
    # required.
    if (len(commandline_replacements) % 2):
        raise pywikibot.Error, 'require even number of replacements.'
    elif (len(commandline_replacements) == 2 and fix is None):
        replacements.append(
            (commandline_replacements[0], commandline_replacements[1]))
        if not summary_commandline:
            editSummary = pywikibot.translate(pywikibot.getSite(), msg) % (
                ' (-%s +%s)' %
                (commandline_replacements[0], commandline_replacements[1]))
    elif (len(commandline_replacements) > 1):
        if (fix is None):
            for i in xrange(0, len(commandline_replacements), 2):
                replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
            if not summary_commandline:
                pairs = [(commandline_replacements[i],
                          commandline_replacements[i + 1])
                         for i in range(0, len(commandline_replacements), 2)]
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                editSummary = pywikibot.translate(pywikibot.getSite(), msg ) \
                              % replacementsDescription
        else:
            raise pywikibot.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        # No replacements on the command line and no -fix: prompt the user
        # interactively for one or more old/new pairs.
        old = pywikibot.input(
            u'Please enter the text that should be replaced:')
        new = pywikibot.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = pywikibot.input(
                u'Please enter another text that should be replaced, or press Enter to start:'
            )
            if old == '':
                change = change + ')'
                break
            new = pywikibot.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline:
            default_summary_message = pywikibot.translate(
                pywikibot.getSite(), msg) % change
            pywikibot.output(u'The summary message will default to: %s' %
                             default_summary_message)
            summary_message = pywikibot.input(
                u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:'
            )
            if summary_message == '':
                summary_message = default_summary_message
            editSummary = summary_message

    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            pywikibot.output(u'Available predefined fixes are: %s' %
                             fixes.fixes.keys())
            return
        if "regex" in fix:
            regex = fix['regex']
        if "msg" in fix:
            editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg'])
        if "exceptions" in fix:
            exceptions = fix['exceptions']
        if "nocase" in fix:
            caseInsensitive = fix['nocase']
        replacements = fix['replacements']

    #Set the regular expression flags
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    if dotall:
        flags = flags | re.DOTALL
    if multiline:
        flags = flags | re.MULTILINE

    # Pre-compile all regular expressions here to save time later
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        oldR = re.compile(old, flags)
        replacements[i] = oldR, new

    # 'inside-tags' is intentionally left out: those are tag names, not
    # regex patterns.
    for exceptionCategory in [
            'title', 'require-title', 'text-contains', 'inside'
    ]:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            patterns = [re.compile(pattern, flags) for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
        # -xmlstart may not have been given, in which case xmlStart was
        # never assigned; default it to None.
        try:
            xmlStart
        except NameError:
            xmlStart = None
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart, replacements,
                                          exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join([
            "old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
            for (old, new) in replacements
        ])
        # NOTE(review): 'exceptions' here is a dict whose keys are strings,
        # so this branch is always taken and the loop iterates key names;
        # 'exc.pattern' would raise AttributeError.  This looks like it
        # expects a flat list of compiled patterns -- verify before using
        # -sql together with exceptions.
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join([
                "old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                for exc in exceptions
            ])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [
            pywikibot.Page(pywikibot.getSite(), PageTitle)
            for PageTitle in PageTitles
        ]
        gen = iter(pages)

    gen = genFactory.getCombinedGenerator(gen)
    if not gen:
        # syntax error, show help text from the top of this file
        pywikibot.showHelp('replace')
        return
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                                           pageNumber=20,
                                                           lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(
            gen, pageNumber=maxquerysize)

    #Finally we open the file for page titles or set article to None
    if filename:
        try:
            #This opens in strict error mode, that means bot will stop
            #on encoding errors with ValueError.
            #See http://docs.python.org/library/codecs.html#codecs.open
            titlefile = codecs.open(filename,
                                    encoding='utf-8',
                                    mode=(lambda x: x and 'a' or 'w')(append))
        except IOError:
            pywikibot.output("%s cannot be opened for writing." % filename)
            return
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall,
                       allowoverlap, recursive, add_cat, sleep, editSummary,
                       titlefile)
    try:
        bot.run()
    finally:
        if titlefile:
            #Just for the spirit of programming (it was flushed)
            titlefile.close()
Example #7
0
def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
    """
    Replace the category links given in the wikitext given
    in oldtext by the new links given in new.

    'new' should be a list of Category objects or strings
          which can be either the raw name or [[Category:..]].

    If addOnly is True, the old category won't be deleted and the
    category(s) given will be added (and so they won't replace anything).

    Returns the new wikitext, stripped of leading/trailing whitespace.
    Raises pywikibot.Error on German Wikipedia pages containing the
    Personendaten template (see below).
    """
    # Find a marker that is not already in the text.  The marker records
    # where the old category links were removed from, so the new ones can
    # be spliced back at a sensible position.
    marker = findmarker(oldtext)
    if site is None:
        site = pywikibot.getSite()
    if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
        raise pywikibot.Error("""\
The PyWikipediaBot is no longer allowed to touch categories on the German
Wikipedia on pages that contain the Personendaten template because of the
non-standard placement of that template.
See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#Position_der_Personendaten_am_.22Artikelende.22
""")
    separator = site.family.category_text_separator
    iseparator = site.family.interwiki_text_separator
    separatorstripped = separator.strip()
    iseparatorstripped = iseparator.strip()
    if addOnly:
        s2 = oldtext
    else:
        # Strip the existing category links, leaving 'marker' where they
        # were found.
        s2 = removeCategoryLinksAndSeparator(oldtext,
                                             site=site,
                                             marker=marker,
                                             separator=separatorstripped)
    s = categoryFormat(new, insite=site)
    if s:
        if site.language() in site.family.category_attop:
            newtext = s + separator + s2
        else:
            # calculate what was after the categories links on the page
            firstafter = s2.find(marker)
            if firstafter < 0:
                firstafter = len(s2)
            else:
                firstafter += len(marker)
            # Is there text in the 'after' part that means we should keep it
            # after?
            if "</noinclude>" in s2[firstafter:]:
                if separatorstripped:
                    s = separator + s
                newtext = (s2[:firstafter].replace(marker, '') + s +
                           s2[firstafter:])
            elif site.language() in site.family.categories_last:
                newtext = s2.replace(marker, '').strip() + separator + s
            else:
                # Default placement: categories go before the interwiki
                # links, so pull those out, append the categories, then put
                # the interwiki links back at the end.
                interwiki = getLanguageLinks(s2)
                s2 = removeLanguageLinksAndSeparator(s2.replace(
                    marker, ''), site, '', iseparatorstripped) + separator + s
                newtext = replaceLanguageLinks(s2,
                                               interwiki,
                                               site=site,
                                               addOnly=True)
    else:
        # Nothing to add: just drop the marker.
        newtext = s2.replace(marker, '')
    return newtext.strip()