Пример #1
0
    def addRelatedTitlesFromFile(self, filename, relatedNsList, nsList):
        """Read titles from a file and add them to the title list and dict.

        Each line is split at its first ":" and the text before it is
        looked up in self.nsDictByString to get a namespace number.

        - If that number is in relatedNsList, the title is converted to
          the namespace numbered one lower (i.e. Category talk becomes
          Category, File talk becomes File, etc.); the converted title
          is appended to self.list and recorded in self.dict under the
          converted namespace number.
        - Otherwise, if that number is in nsList, the title with its
          prefix removed is appended to self.list and recorded in
          self.dict under its own namespace number.
        - A line with no ":" is treated as a main namespace title and is
          added only if "0" is in nsList.

        self.dict maps the prefix-less title to {namespace number: True}.

        Arguments:
        filename       -- full path to list of titles
        relatedNsList  -- list of namespace numbers (strings of digits)
                          whose titles get converted to the namespace
                          numbered one lower
        nsList         -- list of namespace numbers (strings of digits)
                          whose titles are added without conversion

        The input file is closed even if processing a line raises."""

        # don't pass templates in here, we do those separately
        # because it could be a huge list and we want the user
        # to be able to save and reuse it
        fd = File.openInput(filename)
        try:
            for line in fd:
                line = line.strip()
                sep = line.find(":")
                if sep == -1:
                    # no namespace prefix at all: main namespace title
                    if "0" in nsList:
                        self.list.append(line)
                        self.dict.setdefault(line, {})["0"] = True
                    continue

                prefix = line[:sep]
                if prefix not in self.nsDictByString:
                    # NOTE(review): a title containing ":" whose prefix is
                    # not a known namespace name is silently skipped here,
                    # even when "0" is in nsList -- confirm this is wanted
                    continue

                ns = self.nsDictByString[prefix]
                noPrefixTitle = line[sep + 1:]
                if ns in relatedNsList:
                    # convert to the namespace numbered one lower
                    relatedNs = str(int(ns) - 1)
                    if self.nsDict[relatedNs]:
                        newTitle = self.nsDict[relatedNs] + ":" + noPrefixTitle
                    else:
                        # empty namespace name: main namespace titles
                        newTitle = noPrefixTitle
                    self.list.append(newTitle)
                    self.dict.setdefault(noPrefixTitle, {})[relatedNs] = True
                elif ns in nsList:
                    # NOTE(review): the title goes into self.list without
                    # its namespace prefix, unlike the branch above --
                    # confirm against how self.list is consumed
                    self.list.append(noPrefixTitle)
                    self.dict.setdefault(noPrefixTitle, {})[ns] = True
        finally:
            fd.close()
    def addRelatedTitlesFromFile(self, filename, relatedNsList, nsList):
        """Read titles from a file and add them to the title list and dict.

        Each line is split at its first ":" and the text before it is
        looked up in self.nsDictByString to get a namespace number.

        - If that number is in relatedNsList, the title is converted to
          the namespace numbered one lower (i.e. Category talk becomes
          Category, File talk becomes File, etc.); the converted title
          is appended to self.list and recorded in self.dict under the
          converted namespace number.
        - Otherwise, if that number is in nsList, the title with its
          prefix removed is appended to self.list and recorded in
          self.dict under its own namespace number.
        - A line with no ":" is treated as a main namespace title and is
          added only if "0" is in nsList.

        self.dict maps the prefix-less title to {namespace number: True}.

        Arguments:
        filename       -- full path to list of titles
        relatedNsList  -- list of namespace numbers (strings of digits)
                          whose titles get converted to the namespace
                          numbered one lower
        nsList         -- list of namespace numbers (strings of digits)
                          whose titles are added without conversion

        The input file is closed even if processing a line raises."""

        # don't pass templates in here, we do those separately
        # because it could be a huge list and we want the user
        # to be able to save and reuse it
        fd = File.openInput(filename)
        try:
            for line in fd:
                line = line.strip()
                sep = line.find(":")
                if sep == -1:
                    # no namespace prefix at all: main namespace title
                    if "0" in nsList:
                        self.list.append(line)
                        self.dict.setdefault(line, {})["0"] = True
                    continue

                prefix = line[:sep]
                if prefix not in self.nsDictByString:
                    # NOTE(review): a title containing ":" whose prefix is
                    # not a known namespace name is silently skipped here,
                    # even when "0" is in nsList -- confirm this is wanted
                    continue

                ns = self.nsDictByString[prefix]
                noPrefixTitle = line[sep + 1:]
                if ns in relatedNsList:
                    # convert to the namespace numbered one lower
                    relatedNs = str(int(ns) - 1)
                    if self.nsDict[relatedNs]:
                        newTitle = self.nsDict[relatedNs] + ":" + noPrefixTitle
                    else:
                        # empty namespace name: main namespace titles
                        newTitle = noPrefixTitle
                    self.list.append(newTitle)
                    self.dict.setdefault(noPrefixTitle, {})[relatedNs] = True
                elif ns in nsList:
                    # NOTE(review): the title goes into self.list without
                    # its namespace prefix, unlike the branch above --
                    # confirm against how self.list is consumed
                    self.list.append(noPrefixTitle)
                    self.dict.setdefault(noPrefixTitle, {})[ns] = True
        finally:
            fd.close()
Пример #3
0
 def getTitlesDict(self,sqlFile):
     """Build a lookup table of page ids keyed by title and namespace.

     Arguments:
     sqlFile         -- file containing pageid whitespace nsnum whitespace pagetitle where the title
                        is expected to be sql escaped and can be enclosed with single quotes

     Returns a dict of the form { title: { nsnum (int): pageid (string) } }.
     Raises ValueError if a line has fewer than three space-separated
     fields or if nsnum is not an integer.
     The input file is closed even if a line cannot be parsed."""
     fd = File.openInput(sqlFile)
     t = {}
     try:
         for line in fd:
             # split on the first two spaces only, so that everything after
             # the namespace number stays in the title even if it contains
             # spaces (a maxsplit of 3 could yield four fields and break
             # the unpacking)
             # NOTE(review): the line is not stripped, so the title keeps
             # the line terminator -- confirm callers expect that
             (pageid, ns, title) = line.split(' ', 2)
             ns = int(ns)
             t.setdefault(title, {})[ns] = pageid
     finally:
         fd.close()
     return t
Пример #4
0
    def getAllTitles(self):
        """Retrieve page content for all titles in accordance with arguments
        given to constructor, in batches, writing it out to a file.

        Titles are read from self.titlesFile, one per line (blank lines
        are ignored), and passed to getBatchOfPageContent in groups of up
        to self.batchSize. The first batch goes through stripSiteFooter
        and every later batch through stripSiteHeaderAndFooter before
        being written to self.outFileName; a closing </mediawiki> tag is
        written at the end.

        On error (failure to retrieve some content), raises WikiRetrieveErr
        exception. Both files are closed whether or not an error occurs."""

        self.outputFd = File.openOutput(self.outFileName)
        try:
            self.inputFd = File.openInput(self.titlesFile)
            try:
                first = True
                eof = False
                while not eof:
                    # collect the next batch of non-blank titles
                    titles = []
                    while True:
                        line = self.inputFd.readline()
                        if line == "":
                            # readline gives "" only at end of file; a blank
                            # line in the file still has its newline
                            eof = True
                            break
                        line = line.strip()
                        if line:
                            titles.append(line)
                        if len(titles) >= self.batchSize:
                            break

                    if not titles:
                        break

                    content = self.getBatchOfPageContent(titles)

                    if not len(content):
                        raise WikiRetrieveErr(
                            "content of zero length returned, uh oh.")

                    if first:
                        first = False
                        content = self.stripSiteFooter(content)
                    else:
                        content = self.stripSiteHeaderAndFooter(content)

                    self.outputFd.write(content)

                # cheap hack: add the closing tag ourselves, since the
                # footer was stripped from every batch
                self.outputFd.write("</mediawiki>\n")
            finally:
                self.inputFd.close()
        finally:
            self.outputFd.close()
Пример #5
0
 def writeSql(self):
     """Process the log items in self.xmlFile, writing output via doLogItem.

     Opens self.xmlFile for reading and self.logOutFile for writing,
     plus self.userOutFile for writing if one is set (otherwise None is
     passed to doLogItem in its place). After skipping the
     mediawiki/siteinfo header, calls doLogItem repeatedly until it
     returns a true value.

     Raises WikiContentErr if skipHeader reports that the end of the
     header was not found. All opened files are closed whether or not
     an error occurs."""
     # NOTE(review): presumably this pre-marks user id 1 as already
     # seen -- confirm against doLogItem
     self.userDict = { 1: True }
     fd = None
     logOutFd = None
     userOutFd = None
     try:
         fd = File.openInput(self.xmlFile)
         logOutFd = File.openOutput(self.logOutFile)
         if self.userOutFile:
             userOutFd = File.openOutput(self.userOutFile)
         if not self.skipHeader(fd):
             raise WikiContentErr("failed to find end of mediawiki/siteinfo header in xml file\n")
         eof = False
         while not eof:
             eof = self.doLogItem(fd, logOutFd, userOutFd)
     finally:
         # close whatever was successfully opened, in the order opened
         for openedFd in (fd, logOutFd, userOutFd):
             if openedFd is not None:
                 openedFd.close()
Пример #6
0
    def addTitlesFromFile(self, filename, ns):
        """add titles from a file to the title list and dict.
        Only lines starting with the name of the given namespace followed
        by ":" are used; all other lines are ignored.
        Note that template titles (ns "10") get added to a different
        title list (self.listTemplates) than the rest (self.list), for
        separate processing. In both cases self.dict maps the title
        without its namespace prefix to {ns: True}.
        Arguments:
        filename   -- full path to file containing page titles
        ns         -- number (string of digits) of namespace of page titles to
                      grab from file
        The input file is closed even if processing fails."""

        fd = File.openInput(filename)
        try:
            prefix = self.nsDict[ns] + ":"
            prefixLen = len(prefix)
            for line in fd:
                if not line.startswith(prefix):
                    continue
                # lose the newline, but only if there is one: the last line
                # of the file may lack it, and blindly dropping the final
                # character would truncate that title
                title = line.rstrip("\n")
                if ns == "10":  # special case bleah
                    self.listTemplates.append(title)
                else:
                    self.list.append(title)
                noPrefixTitle = title[prefixLen:]
                self.dict.setdefault(noPrefixTitle, {})[ns] = True
        finally:
            fd.close()
    def addTitlesFromFile(self, filename, ns):
        """add titles from a file to the title list and dict.
        Only lines starting with the name of the given namespace followed
        by ":" are used; all other lines are ignored.
        Note that template titles (ns "10") get added to a different
        title list (self.listTemplates) than the rest (self.list), for
        separate processing. In both cases self.dict maps the title
        without its namespace prefix to {ns: True}.
        Arguments:
        filename   -- full path to file containing page titles
        ns         -- number (string of digits) of namespace of page titles to
                      grab from file
        The input file is closed even if processing fails."""

        fd = File.openInput(filename)
        try:
            prefix = self.nsDict[ns] + ":"
            prefixLen = len(prefix)
            for line in fd:
                if not line.startswith(prefix):
                    continue
                # lose the newline, but only if there is one: the last line
                # of the file may lack it, and blindly dropping the final
                # character would truncate that title
                title = line.rstrip("\n")
                if ns == "10":  # special case bleah
                    self.listTemplates.append(title)
                else:
                    self.list.append(title)
                noPrefixTitle = title[prefixLen:]
                self.dict.setdefault(noPrefixTitle, {})[ns] = True
        finally:
            fd.close()
Пример #8
0
    def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
        """Write an XML stub file (omitting text content) and a
        list of page ids, from a MediaWiki XML page content file.

        The content file is processed line by line. Lines without any
        "<" are dropped; opening <text ...> tags that carry a bytes
        attribute are replaced by a self-closing stub tag holding the
        most recently seen revision id and that byte count; lines
        containing "</text" are dropped; everything else is copied
        unchanged. The first <id> line after each <page> line is also
        written to the page ids file as "1:<id>".

        Arguments:
        contentPath  -- path to the XML page content file to read
        stubsPath    -- path to the stubs file to write
        pageIdsPath  -- path to the page ids file to write

        All three files are closed whether or not an error occurs."""

        # raw strings, so that \s reaches the regex engine as written
        # instead of relying on an invalid string escape
        compiledPagePattern = re.compile(r"^\s*<page>")
        compiledRevisionPattern = re.compile(r"^\s*<revision>")
        compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
        compiledTextPattern = re.compile(
            r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

        inFd = None
        outFd = None
        outPageIdFd = None
        try:
            inFd = File.openInput(contentPath)
            outFd = File.openOutput(stubsPath)
            outPageIdFd = File.openOutput(pageIdsPath)

            currentTextId = None
            expectRevId = False
            expectPageId = False

            for line in inFd:
                # FIXME we could just calculate text len if the output is
                # missing the bytes attr. (as in dumps not from
                # Special:Export)
                # format in content file:
                #   <text xml:space="preserve" bytes="78">
                # format wanted for stubs file:
                #   <text id="11248" bytes="9" />
                if '<' not in line:
                    continue  # these are lines of text, we can skip them

                result = compiledTextPattern.match(line)
                if result:
                    # keep the original indentation, swap in the stub tag
                    stubLine = result.group("s") + (
                        '<text id="%s" bytes="%s" />\n' % (
                            currentTextId, result.group("b")))
                    outFd.write(stubLine)
                    continue
                if '</text' in line:
                    continue

                if compiledPagePattern.match(line):
                    expectPageId = True
                elif compiledRevisionPattern.match(line):
                    expectRevId = True
                elif expectPageId:
                    # still waiting for the <id> that follows <page>
                    result = compiledIdPattern.match(line)
                    if result:
                        outPageIdFd.write("1:%s\n" % result.group("i"))
                        expectPageId = False
                elif expectRevId:
                    # still waiting for the <id> that follows <revision>;
                    # it is used as the id of the next text stub
                    result = compiledIdPattern.match(line)
                    if result:
                        currentTextId = result.group("i")
                        expectRevId = False
                outFd.write(line)
        finally:
            # close whatever was successfully opened, in the order opened
            for openedFd in (inFd, outFd, outPageIdFd):
                if openedFd is not None:
                    openedFd.close()
    def writeStubAndPageIds(self, contentPath, stubsPath, pageIdsPath):
        """Write an XML stub file (omitting text content) and a
        list of page ids, from a MediaWiki XML page content file.

        The content file is processed line by line. Lines without any
        "<" are dropped; opening <text ...> tags that carry a bytes
        attribute are replaced by a self-closing stub tag holding the
        most recently seen revision id and that byte count; lines
        containing "</text" are dropped; everything else is copied
        unchanged. The first <id> line after each <page> line is also
        written to the page ids file as "1:<id>".

        Arguments:
        contentPath  -- path to the XML page content file to read
        stubsPath    -- path to the stubs file to write
        pageIdsPath  -- path to the page ids file to write

        All three files are closed whether or not an error occurs."""

        # raw strings, so that \s reaches the regex engine as written
        # instead of relying on an invalid string escape
        compiledPagePattern = re.compile(r"^\s*<page>")
        compiledRevisionPattern = re.compile(r"^\s*<revision>")
        compiledIdPattern = re.compile(r"^\s*<id>(?P<i>.+)</id>\s*\n$")
        compiledTextPattern = re.compile(
            r'^(?P<s>\s*)<text\s+[^<>/]*bytes="(?P<b>[0-9]+)"')

        inFd = None
        outFd = None
        outPageIdFd = None
        try:
            inFd = File.openInput(contentPath)
            outFd = File.openOutput(stubsPath)
            outPageIdFd = File.openOutput(pageIdsPath)

            currentTextId = None
            expectRevId = False
            expectPageId = False

            for line in inFd:
                # FIXME we could just calculate text len if the output is
                # missing the bytes attr. (as in dumps not from
                # Special:Export)
                # format in content file:
                #   <text xml:space="preserve" bytes="78">
                # format wanted for stubs file:
                #   <text id="11248" bytes="9" />
                if '<' not in line:
                    continue  # these are lines of text, we can skip them

                result = compiledTextPattern.match(line)
                if result:
                    # keep the original indentation, swap in the stub tag
                    stubLine = result.group("s") + (
                        '<text id="%s" bytes="%s" />\n' % (
                            currentTextId, result.group("b")))
                    outFd.write(stubLine)
                    continue
                if '</text' in line:
                    continue

                if compiledPagePattern.match(line):
                    expectPageId = True
                elif compiledRevisionPattern.match(line):
                    expectRevId = True
                elif expectPageId:
                    # still waiting for the <id> that follows <page>
                    result = compiledIdPattern.match(line)
                    if result:
                        outPageIdFd.write("1:%s\n" % result.group("i"))
                        expectPageId = False
                elif expectRevId:
                    # still waiting for the <id> that follows <revision>;
                    # it is used as the id of the next text stub
                    result = compiledIdPattern.match(line)
                    if result:
                        currentTextId = result.group("i")
                        expectRevId = False
                outFd.write(line)
        finally:
            # close whatever was successfully opened, in the order opened
            for openedFd in (inFd, outFd, outPageIdFd):
                if openedFd is not None:
                    openedFd.close()