Example #1
    def getAllTitles(self):
        """Retrieve page titles from wiki in accordance with arguments
        given to constructor, in batches, writing them out to a file.
        On error (failure to retrieve some titles), raises WikiRetrieveErr exception."""

        self.more = True

        if self.startDate:
            self.dateFormatter = Date()
            self.startDateString = self.dateFormatter.formatDate(
                self.startDate)
            self.endDateString = self.dateFormatter.formatDate(self.endDate)
            self.startDateSecs = self.dateFormatter.getSecs(
                self.startDateString)
            self.endDateSecs = self.dateFormatter.getSecs(self.endDateString)

        self.outputFd = File.openOutput(self.outFileName)

        count = 0
        while True:
            count = count + self.batchSize
            titles = self.getBatchOfTitles()
            self.writeTitles(titles)
            if not len(titles):
                # not always an error
                break
            # FIXME is there a possibility that there will be a continue elt and
            # we'll be served the same titles again?
            if not self.more:
                break
        self.outputFd.close()
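
The loop above keeps requesting batches of titles until either an empty batch comes back or the API reports no continuation (self.more). A minimal sketch of the same batch-until-done pattern, using hypothetical fetch_batch()/write_batch() helpers that are not part of the original class:

# Sketch only: fetch_batch() is assumed to return (titles, more), where
# "more" is False once the server reports no continuation element.
def retrieve_all(fetch_batch, write_batch):
    while True:
        titles, more = fetch_batch()
        write_batch(titles)
        if not titles or not more:
            break

# Stub fetcher that serves two batches and then stops:
batches = iter([(["Page_A", "Page_B"], True), (["Page_C"], False)])
retrieve_all(lambda: next(batches), lambda titles: print("\n".join(titles)))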
Example #2
    def writeSql(self):
        self.userDict = {1: True}
        fd = File.openInput(self.xmlFile)
        logOutFd = File.openOutput(self.logOutFile)
        if self.userOutFile:
            userOutFd = File.openOutput(self.userOutFile)
        else:
            userOutFd = None
        if not self.skipHeader(fd):
            raise WikiContentErr("failed to find end of mediawiki/siteinfo header in xml file\n")
        eof = False
        while not eof:
            eof = self.doLogItem(fd, logOutFd, userOutFd)
        fd.close()
        logOutFd.close()
        if self.userOutFile:
            userOutFd.close()
        return
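
writeSql opens the log output unconditionally and the user output only when a path was given, which forces the matching `if self.userOutFile:` check again at close time. A sketch of the same optional-second-output idea using contextlib.ExitStack, so both files are closed automatically (standalone example, not the original code; file names and SQL lines are made up):

import contextlib

def write_outputs(log_out_path, user_out_path=None):
    with contextlib.ExitStack() as stack:
        log_fd = stack.enter_context(open(log_out_path, "w"))
        user_fd = (stack.enter_context(open(user_out_path, "w"))
                   if user_out_path else None)
        log_fd.write("-- log table rows would be written here\n")
        if user_fd:
            user_fd.write("-- user table rows would be written here\n")

write_outputs("log.sql")                 # only the log file
write_outputs("log.sql", "users.sql")    # both files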
Example #3
    def getAllTitles(self):
        """Retrieve page content for all titles in accordance with arguments
        given to constructor, in batches, writing it out to a file.
        On error (failure to retrieve some content), raises WikiRetrieveErr exception."""

        self.outputFd = File.openOutput(self.outFileName)
        self.inputFd = File.openInput(self.titlesFile)
        first = True
        count = 0

        eof = False
        while not eof:
            linecount = 0
            titles = []
            while not eof:
                line = self.inputFd.readline()
                if line == "":
                    eof = True
                line = line.strip()
                if line:
                    titles.append(line)
                    linecount = linecount + 1
                if linecount >= self.batchSize:
                    break

            if not titles:
                break

            count = count + self.batchSize
            content = self.getBatchOfPageContent(titles)

            if not len(content):
                raise WikiRetrieveErr(
                    "content of zero length returned, uh oh.")

            if first:
                first = False
                content = self.stripSiteFooter(content)
            else:
                content = self.stripSiteHeaderAndFooter(content)

            self.outputFd.write(content)

        # cheap hack
        self.outputFd.write("</mediawiki>\n")
        self.outputFd.close()
        self.inputFd.close()
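
The inner while loop collects up to self.batchSize titles per request; the first/subsequent distinction strips the duplicated <mediawiki> site header and footer so the concatenated batches form one well-formed document, closed by the final </mediawiki> write. The batch-reading step could also be expressed as a generator; a sketch with a hypothetical helper (using plain open() instead of File.openInput):

def read_title_batches(path, batch_size):
    """Yield lists of up to batch_size non-blank, stripped lines from path."""
    batch = []
    with open(path) as fd:
        for line in fd:
            line = line.strip()
            if line:
                batch.append(line)
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch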
Example #4
    def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
        """Write an XML stub file (omitting text content) and a
        list of page ids, from a MediaWiki XML page content file.
        Arguments:
        contentPath  -- path to the XML page content file to read
        stubsPath    -- path to the stubs file to write
        pageIdsPath  -- path to the page ids file to write"""

        pagePattern = r"^\s*<page>"
        compiledPagePattern = re.compile(pagePattern)
        revisionPattern = r"^\s*<revision>"
        compiledRevisionPattern = re.compile(revisionPattern)
        idPattern = r"^\s*<id>(?P<i>.+)</id>\s*\n$"
        compiledIdPattern = re.compile(idPattern)
        textPattern = r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"'
        compiledTextPattern = re.compile(textPattern)

        inFd = File.openInput(contentPath)
        outFd = File.openOutput(stubsPath)
        outPageIdFd = File.openOutput(pageIdsPath)
        currentTitle = None
        currentTextId = None
        pageId = None

        expectRevId = False
        expectPageId = False

        for line in inFd:
            # FIXME we could just calculate the text length if the output is
            # missing the bytes attr. (as in dumps not from Special:Export)
            # format in content file:
            #   <text xml:space="preserve" bytes="78">
            # format wanted for stubs file:
            #   <text id="11248" bytes="9" />
            if '<' in line:
                result = compiledTextPattern.match(line)
                if result:
                    line = result.group("s") + '<text id="%s" bytes="%s" />\n' % (
                        currentTextId, result.group("b"))
                    outFd.write(line)
                    continue
                elif '</text' in line:
                    continue

                result = compiledPagePattern.match(line)
                if result:
                    expectPageId = True
                    outFd.write(line)
                    continue
                result = compiledRevisionPattern.match(line)
                if result:
                    expectRevId = True
                    outFd.write(line)
                    continue
                if expectPageId:
                    result = compiledIdPattern.match(line)
                    if result:
                        outPageIdFd.write("1:%s\n" % result.group("i"))
                        expectPageId = False
                    outFd.write(line)
                    continue
                if expectRevId:
                    result = compiledIdPattern.match(line)
                    if result:
                        currentTextId = result.group("i")
                        expectRevId = False
                    outFd.write(line)
                    continue
                outFd.write(line)
            else:
                continue  # these are lines of text, we can skip them
        inFd.close()
        outFd.close()
        outPageIdFd.close()
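
The core transformation is the <text> rewrite: the bytes attribute is kept, the element body is dropped, and the id captured from the enclosing <revision> block is inserted. A quick self-contained check of that regex against a made-up input line:

import re

text_re = re.compile(r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')
sample = '      <text xml:space="preserve" bytes="78">some wikitext'
m = text_re.match(sample)
if m:
    # the id value here is made up; in the real code it comes from <revision>
    print(m.group("s") + '<text id="%s" bytes="%s" />' % ("11248", m.group("b")))
    # ->       <text id="11248" bytes="78" />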
Example #5
        t.addTitlesFromFile(o['mediawikiTitlesPath'], "8")
        if verbose:
            sys.stderr.write("mediawiki titles added to page title hash\n")

        t.addTitlesFromFile(o['moduleTitlesPath'], "828")
        if verbose:
            sys.stderr.write("module titles added to page title hash\n")

        t.addTitlesFromFile(o['templateTitlesPath'], "10")
        if verbose:
            sys.stderr.write("template titles added to page title hash\n")

        t.uniq()

        o['mainTitlesWithPrefixPath'] = out.makePath(
            "main-titles-with-nsprefix.gz")
        outFd = File.openOutput(o['mainTitlesWithPrefixPath'])
        for line in t.list:
            outFd.write(line + "\n")
        outFd.close()

        o['tmplTitlesWithPrefixPath'] = out.makePath(
            "tmpl-titles-with-nsprefix.gz")
        outFd = File.openOutput(o['tmplTitlesWithPrefixPath'])
        for line in t.listTemplates:
            outFd.write(line + "\n")
        outFd.close()

        if verbose:
            sys.stderr.write(
                "Done converting retrieved titles, have %s and %s\n" %
                (o['mainTitlesWithPrefixPath'], o['tmplTitlesWithPrefixPath']))
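
The numeric second arguments to addTitlesFromFile are MediaWiki namespace ids: 8 for MediaWiki:, 10 for Template:, and 828 for Module: pages. A hypothetical sketch of what prefixing a title with its namespace name could look like (the real method belongs to the titles object and is not shown in this excerpt):

NS_NAMES = {"8": "MediaWiki", "10": "Template", "828": "Module"}

def with_ns_prefix(title, ns_id):
    # e.g. with_ns_prefix("Common.css", "8") -> "MediaWiki:Common.css"
    return "%s:%s" % (NS_NAMES[ns_id], title) if ns_id in NS_NAMES else title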

    if o['retrieveContent']:
        if not o['mainTitlesWithPrefixPath'] or not o['tmplTitlesWithPrefixPath']: