예제 #1
0
파일: importer.py 프로젝트: afcarl/sicekit
class Importer(object):
    """Import all articles and files from configuration.datapath into the wiki.

    Pages are read from every ``*.xml`` file found below
    ``configuration.datapath``. Files are taken from its ``File``
    subdirectory, where each uploadable file ``<name>`` is announced by a
    sibling ``<name>.sha1`` file holding its checksum.
    """

    def __init__(self, configuration, wiki):
        """Remember the configuration and wrap ``wiki`` in a WikiUtil."""
        self.configuration = configuration
        self.wikiutil = WikiUtil(wiki)
        self.wiki = self.wikiutil.wiki  # WikiUtil modifies wiki
        # run() resets these; they are initialised here as well so that
        # importPage()/importFile() can be called on their own.
        self.pagecount = 0
        self.changecount = 0

    def _buildPageListPerDir(self, pagelist, dirname, fnames):
        """Append the parsed dump of every ``.xml`` file in ``dirname``.

        ``fnames`` are entry names relative to ``dirname``; anything that is
        not a regular file ending in ``.xml`` is ignored.
        """
        for fname in fnames:
            path = os.path.join(dirname, fname)
            if os.path.isfile(path) and path.endswith('.xml'):
                print("I: Considering file %s." % path)
                pagelist.append(self.readDumpFile(path))

    def _buildPageList(self):
        """Return the parsed dumps of all ``.xml`` files below datapath."""
        pagelist = []
        # os.walk replaces os.path.walk, which is deprecated and no longer
        # exists in Python 3.
        for dirname, _subdirs, fnames in os.walk(self.configuration.datapath):
            self._buildPageListPerDir(pagelist, dirname, fnames)
        return pagelist

    def _buildFileListPerDir(self, filelist, dirname, fnames):
        """Append every file in ``dirname`` that has a ``.sha1`` companion.

        The appended path is the checksum file's path with the trailing
        ``.sha1`` removed, i.e. the path of the file to upload.
        """
        for fname in fnames:
            path = os.path.join(dirname, fname)
            if os.path.isfile(path) and path.endswith('.sha1'):
                path = path.rpartition('.')[0]
                print("I: Considering file %s." % path)
                filelist.append(path)

    def _buildFileList(self):
        """Return the paths of all uploadable files below datapath/File."""
        filelist = []
        filedir = os.path.join(self.configuration.datapath, 'File')
        for dirname, _subdirs, fnames in os.walk(filedir):
            self._buildFileListPerDir(filelist, dirname, fnames)
        return filelist

    def readDumpFile(self, path):
        """Parse the XML dump at ``path`` and prepare it for import.

        The revision timestamp is overwritten with
        ``wikiutil.buildWikiTimestampNow()`` and the revision id with '1'.

        Returns a dict with the page title (``u'title'``), the serialised
        XML wrapped in a StringIO named "import.xml" (``u'io'``) and the
        parsed document itself (``u'xmldoc'``).
        """
        xmlEl = self.wikiutil.readXmlFromFile(path)
        title = xmlEl.find(XMLNS + u'page/' + XMLNS + u'title').text
        revision = xmlEl.find(XMLNS + u'page/' + XMLNS + u'revision')
        revision.find(
            XMLNS + u'timestamp').text = self.wikiutil.buildWikiTimestampNow()
        revision.find(XMLNS + u'id').text = '1'

        io = StringIO(xml.etree.ElementTree.tostring(xmlEl))
        io.name = "import.xml"

        return {u'title': title, u'io': io, u'xmldoc': xmlEl}

    def _fetchToken(self, tokentype, title):
        """Ask the wiki for a ``tokentype`` token ('import', 'edit') for
        ``title`` and return it; wiki warnings are printed, not raised."""
        params = {
            'action': 'query',
            'prop': 'info',
            'intoken': tokentype,
            'titles': title,
        }
        result = APIRequest(self.wiki, params).query()['query']
        if u'warnings' in result:
            print('W: Wiki gave warning: ' + result[u'warnings'][u'info'][u'*'])
        pages = result['pages']
        # Only one title was queried, so the first entry is the one we want.
        return list(pages.values())[0][tokentype + 'token']

    def importPage(self, page):
        """Runs an XML import for page "page".

        ``page`` is a dict as returned by readDumpFile(). The import is
        skipped when the wiki already holds a page with identical text.

        Returns 1 when the wiki did not report exactly one imported
        revision, None otherwise (including when skipped).
        """
        title = page[u'title']
        print("I: Importing page %s." % title)

        oldpage = self.wikiutil.retrievePageExportXml(title)
        if oldpage is not None:
            oldtext = self.wikiutil.getPageTextFromXml(oldpage)
            newtext = self.wikiutil.getPageTextFromXml(page[u'xmldoc'])
            if oldtext == newtext:
                print("I: Skipping, no changes.")
                return

        importtoken = self._fetchToken('import', title)

        # now post the page
        params = {'action': 'import', 'token': importtoken}
        request = APIRequest(self.wiki, params)
        request.setMultipart()
        request.changeParam('xml', page[u'io'])
        result = request.query()

        revisions = result[u'import'][0][u'revisions']
        if revisions != 1:
            print("E: Page import failed. Wiki said: %s" % (result,))
            return 1

        self.pagecount += 1
        self.changecount += revisions

    def importFile(self, path):
        """Uploads a file from "path" to the wiki, first checking against the
        stored sha1sums.

        The checksum is read from ``path + '.sha1'``; when the wiki already
        has a file of the same title with that checksum the upload is
        skipped.

        Returns 1 when the wiki did not report 'Success', None otherwise
        (including when skipped).
        """
        prefix = os.path.commonprefix(
            [path, os.path.join(self.configuration.datapath, 'File')])
        title = path[len(prefix) + 1:]
        print("I: Importing image %s." % title)

        # get new sha1; strip stray whitespace/newlines so the comparison
        # below is made against the bare checksum
        with open(path + '.sha1', 'r') as sha1file:
            new_sha1sum = sha1file.read(40).strip('\r\n ')

        # get old sha1 from wiki, don't import if they match
        old_image = self.wikiutil.retrieveImageInfo('File:%s' % title)
        if 'missing' not in old_image:
            old_sha1sum = old_image['imageinfo'][0][u'sha1']
            if old_sha1sum == new_sha1sum:
                print("I: Skipping, no changes.")
                return

        token = self._fetchToken('edit', 'File:%s' % title)

        # now post the real upload request
        params = {
            'action': 'upload',
            'filename': title,
            'token': token,
            'comment': 'Origin-SICEKIT',
            'ignorewarnings': True,
        }
        request = APIRequest(self.wiki, params)
        request.setMultipart()
        # Binary mode: uploads are arbitrary files, not text. The context
        # manager closes the handle even when the request raises.
        with open(path, 'rb') as f:
            request.changeParam('file', f)
            result = request.query()

        if result[u'upload'][u'result'] != 'Success':
            print("E: Page import failed. Wiki said: %s" % (result,))
            return 1

        self.pagecount += 1
        self.changecount += 1

    def run(self):
        """Main entry point for the Importer.

        Imports all pages, then all files, and prints a summary. Always
        returns 0; individual failures are only reported on stdout.
        """
        self.pagecount = 0
        self.changecount = 0

        print("I: Looking for pages to import in %s."
              % self.configuration.datapath)
        pagelist = self._buildPageList()
        print("I: Now importing pages.")
        for page in pagelist:
            self.importPage(page)

        print("I: Looking for images to import in %s."
              % self.configuration.datapath)
        filelist = self._buildFileList()
        print("I: Now importing images.")
        for path in filelist:
            self.importFile(path)

        print("I: Imported %d pages, resulting in %d changes." % (
            self.pagecount, self.changecount))
        return 0
예제 #2
0
class Importer(object):
    """Import all articles and files from configuration.datapath into the wiki.

    Pages are read from every ``*.xml`` file found below
    ``configuration.datapath``. Files are taken from its ``File``
    subdirectory, where each uploadable file ``<name>`` is announced by a
    sibling ``<name>.sha1`` file holding its checksum.
    """

    def __init__(self, configuration, wiki):
        """Remember the configuration and wrap ``wiki`` in a WikiUtil."""
        self.configuration = configuration
        self.wikiutil = WikiUtil(wiki)
        self.wiki = self.wikiutil.wiki  # WikiUtil modifies wiki
        # run() resets these; they are initialised here as well so that
        # importPage()/importFile() can be called on their own.
        self.pagecount = 0
        self.changecount = 0

    def _buildPageListPerDir(self, pagelist, dirname, fnames):
        """Append the parsed dump of every ``.xml`` file in ``dirname``.

        ``fnames`` are entry names relative to ``dirname``; anything that is
        not a regular file ending in ``.xml`` is ignored.
        """
        for fname in fnames:
            path = os.path.join(dirname, fname)
            if os.path.isfile(path) and path.endswith('.xml'):
                print("I: Considering file %s." % path)
                pagelist.append(self.readDumpFile(path))

    def _buildPageList(self):
        """Return the parsed dumps of all ``.xml`` files below datapath."""
        pagelist = []
        # os.walk replaces os.path.walk, which is deprecated and no longer
        # exists in Python 3.
        for dirname, _subdirs, fnames in os.walk(self.configuration.datapath):
            self._buildPageListPerDir(pagelist, dirname, fnames)
        return pagelist

    def _buildFileListPerDir(self, filelist, dirname, fnames):
        """Append every file in ``dirname`` that has a ``.sha1`` companion.

        The appended path is the checksum file's path with the trailing
        ``.sha1`` removed, i.e. the path of the file to upload.
        """
        for fname in fnames:
            path = os.path.join(dirname, fname)
            if os.path.isfile(path) and path.endswith('.sha1'):
                path = path.rpartition('.')[0]
                print("I: Considering file %s." % path)
                filelist.append(path)

    def _buildFileList(self):
        """Return the paths of all uploadable files below datapath/File."""
        filelist = []
        filedir = os.path.join(self.configuration.datapath, 'File')
        for dirname, _subdirs, fnames in os.walk(filedir):
            self._buildFileListPerDir(filelist, dirname, fnames)
        return filelist

    def readDumpFile(self, path):
        """Parse the XML dump at ``path`` and prepare it for import.

        The revision timestamp is overwritten with
        ``wikiutil.buildWikiTimestampNow()`` and the revision id with '1'.

        Returns a dict with the page title (``u'title'``), the serialised
        XML wrapped in a StringIO named "import.xml" (``u'io'``) and the
        parsed document itself (``u'xmldoc'``).
        """
        xmlEl = self.wikiutil.readXmlFromFile(path)
        title = xmlEl.find(XMLNS + u'page/' + XMLNS + u'title').text
        revision = xmlEl.find(XMLNS + u'page/' + XMLNS + u'revision')
        revision.find(
            XMLNS + u'timestamp').text = self.wikiutil.buildWikiTimestampNow()
        revision.find(XMLNS + u'id').text = '1'

        io = StringIO(xml.etree.ElementTree.tostring(xmlEl))
        io.name = "import.xml"

        return {u'title': title, u'io': io, u'xmldoc': xmlEl}

    def _fetchToken(self, tokentype, title):
        """Ask the wiki for a ``tokentype`` token ('import', 'edit') for
        ``title`` and return it; wiki warnings are printed, not raised."""
        params = {
            'action': 'query',
            'prop': 'info',
            'intoken': tokentype,
            'titles': title,
        }
        result = APIRequest(self.wiki, params).query()['query']
        if u'warnings' in result:
            print('W: Wiki gave warning: ' + result[u'warnings'][u'info'][u'*'])
        pages = result['pages']
        # Only one title was queried, so the first entry is the one we want.
        return list(pages.values())[0][tokentype + 'token']

    def importPage(self, page):
        """Runs an XML import for page "page".

        ``page`` is a dict as returned by readDumpFile(). The import is
        skipped when the wiki already holds a page with identical text.

        Returns 1 when the wiki did not report exactly one imported
        revision, None otherwise (including when skipped).
        """
        title = page[u'title']
        print("I: Importing page %s." % title)

        oldpage = self.wikiutil.retrievePageExportXml(title)
        if oldpage is not None:
            oldtext = self.wikiutil.getPageTextFromXml(oldpage)
            newtext = self.wikiutil.getPageTextFromXml(page[u'xmldoc'])
            if oldtext == newtext:
                print("I: Skipping, no changes.")
                return

        importtoken = self._fetchToken('import', title)

        # now post the page
        params = {'action': 'import', 'token': importtoken}
        request = APIRequest(self.wiki, params)
        request.setMultipart()
        request.changeParam('xml', page[u'io'])
        result = request.query()

        revisions = result[u'import'][0][u'revisions']
        if revisions != 1:
            print("E: Page import failed. Wiki said: %s" % (result,))
            return 1

        self.pagecount += 1
        self.changecount += revisions

    def importFile(self, path):
        """Uploads a file from "path" to the wiki, first checking against the
        stored sha1sums.

        The checksum is read from ``path + '.sha1'``; when the wiki already
        has a file of the same title with that checksum the upload is
        skipped.

        Returns 1 when the wiki did not report 'Success', None otherwise
        (including when skipped).
        """
        prefix = os.path.commonprefix(
            [path, os.path.join(self.configuration.datapath, 'File')])
        title = path[len(prefix) + 1:]
        print("I: Importing image %s." % title)

        # get new sha1; strip stray whitespace/newlines so the comparison
        # below is made against the bare checksum
        with open(path + '.sha1', 'r') as sha1file:
            new_sha1sum = sha1file.read(40).strip('\r\n ')

        # get old sha1 from wiki, don't import if they match
        old_image = self.wikiutil.retrieveImageInfo('File:%s' % title)
        if 'missing' not in old_image:
            old_sha1sum = old_image['imageinfo'][0][u'sha1']
            if old_sha1sum == new_sha1sum:
                print("I: Skipping, no changes.")
                return

        token = self._fetchToken('edit', 'File:%s' % title)

        # now post the real upload request
        params = {
            'action': 'upload',
            'filename': title,
            'token': token,
            'comment': 'Origin-SICEKIT',
            'ignorewarnings': True,
        }
        request = APIRequest(self.wiki, params)
        request.setMultipart()
        # Binary mode: uploads are arbitrary files, not text. The context
        # manager closes the handle even when the request raises.
        with open(path, 'rb') as f:
            request.changeParam('file', f)
            result = request.query()

        if result[u'upload'][u'result'] != 'Success':
            print("E: Page import failed. Wiki said: %s" % (result,))
            return 1

        self.pagecount += 1
        self.changecount += 1

    def run(self):
        """Main entry point for the Importer.

        Imports all pages, then all files, and prints a summary. Always
        returns 0; individual failures are only reported on stdout.
        """
        self.pagecount = 0
        self.changecount = 0

        print("I: Looking for pages to import in %s."
              % self.configuration.datapath)
        pagelist = self._buildPageList()
        print("I: Now importing pages.")
        for page in pagelist:
            self.importPage(page)

        print("I: Looking for images to import in %s."
              % self.configuration.datapath)
        filelist = self._buildFileList()
        print("I: Now importing images.")
        for path in filelist:
            self.importFile(path)

        print("I: Imported %d pages, resulting in %d changes." % (
            self.pagecount, self.changecount))
        return 0
예제 #3
0
class Exporter(object):
    """Export the pages of configuration.export_category, and the images
    those pages use, from the wiki into configuration.datapath."""

    def __init__(self, configuration, wiki):
        """Remember the configuration and wrap ``wiki`` in a WikiUtil."""
        self.configuration = configuration
        self.wikiutil = WikiUtil(wiki)
        self.wiki = self.wikiutil.wiki
        # run() resets these; they are initialised here as well so that
        # exportPage()/exportImage() can be called on their own.
        self.pagecount = 0
        self.bytecount = 0

    def buildPageList(self):
        """Return the members of the configured export category."""
        return self.wikiutil.retrieveCategoryMemberList(
            self.configuration.export_category)

    def buildPageFilesystemPath(self, page, extension='.xml'):
        """Map a page title to a ``(directory, filename)`` pair.

        Every ':' in ``page[u'title']`` is treated like '/'; the part after
        the last '/' becomes the file name (with ``extension`` appended),
        everything before it a directory path below datapath.
        """
        parts = page[u'title'].replace(':', '/').rsplit('/', 1)
        if len(parts) == 1:
            # No namespace or subpage separator: file sits in datapath.
            parts = ['', parts[0]]
        directory, filename = parts
        return (os.path.join(self.configuration.datapath, directory),
                filename + extension)

    def writeDumpFile(self, page, xmlElement):
        """Write ``xmlElement`` to the dump file for ``page``, creating the
        directory if needed; returns whatever writeXmlToFile() returns."""
        directory, filename = self.buildPageFilesystemPath(page)
        if not os.path.exists(directory):
            os.makedirs(directory)
        return self.wikiutil.writeXmlToFile(
            os.path.join(directory, filename), xmlElement)

    def exportPage(self, page):
        """Fetch the export XML of ``page``, anonymise it and write it out.

        The revision comment and the siteinfo namespaces are removed, and
        contributor and site details are replaced by fixed SICEKIT values.
        """
        title = page[u'title']
        print("I: Exporting page %s." % title)
        xml = self.wikiutil.retrievePageExportXml(title)

        # remove stuff we don't need / don't want to leak
        revision = xml.find(XMLNS + u'page/' + XMLNS + u'revision')
        comment = revision.find(XMLNS + u'comment')
        # Compare against None: an Element without children is falsy, so a
        # plain truth test would never remove a text-only <comment>.
        if comment is not None:
            revision.remove(comment)
        # NOTE(review): the lookups below assume these elements exist in the
        # export; a missing one makes find() return None and raises.
        contributor = revision.find(XMLNS + u'contributor')
        contributor.find(XMLNS + u'username').text = u'SICEKIT'
        contributor.find(XMLNS + u'id').text = u'0'
        siteinfo = xml.find(XMLNS + u'siteinfo')
        siteinfo.find(XMLNS + u'sitename').text = u'SICEKIT'
        siteinfo.find(XMLNS + u'base').text = u'chrome:///sicekit'
        siteinfo.find(XMLNS + u'generator').text = u'SICEKIT'
        siteinfo.remove(siteinfo.find(XMLNS + u'namespaces'))

        written = self.writeDumpFile(page, xml)

        self.pagecount += 1
        self.bytecount += written

    def buildImageList(self, pages):
        """Return the titles of all images used on ``pages``.

        Titles are returned once per use, so an image embedded on several
        pages appears several times.
        """
        titles = '|'.join(page[u'title'] for page in pages)
        params = {'action': 'query', 'prop': 'images', 'titles': titles}
        request = APIRequest(self.wiki, params)

        result = request.query()['query']['pages']
        images = []
        # filter page list so we only get images
        for page in result.values():
            if 'images' in page:
                images.extend(img[u'title'] for img in page['images'])
        return images

    def exportImage(self, title):
        """Download the image ``title`` and store its sha1 next to it.

        A warning is printed and nothing is written when the wiki reports
        the image as missing.
        """
        print("I: Exporting image %s." % title)
        page = self.wikiutil.retrieveImageInfo(title)
        if 'missing' in page:
            print("W: Image %s does not exist." % title)
            return
        imageinfo = page['imageinfo'][0]
        url = imageinfo[u'url']
        sha1 = imageinfo[u'sha1']
        directory, basename = self.buildPageFilesystemPath(page, extension='')
        if not os.path.exists(directory):
            os.makedirs(directory)
        image_path = os.path.join(directory, basename)
        sha1_path = image_path + '.sha1'
        downloaded = self.wikiutil.downloadFile(url, image_path)
        # Binary mode because the checksum is written as encoded bytes.
        with open(sha1_path, 'wb') as f:
            f.write(sha1.encode('ascii'))
        self.bytecount += downloaded + 40  # sha1 is 40byte
        self.pagecount += 2

    def run(self):
        """Main entry point for the Exporter.

        Deletes configuration.datapath if it exists, exports all pages and
        their images into it, and prints a summary. Always returns 0.
        """
        if os.path.exists(self.configuration.datapath):
            print("I: Wiping export directory %s."
                  % self.configuration.datapath)
            shutil.rmtree(self.configuration.datapath)
        print("I: Exporting pages to %s." % self.configuration.datapath)
        pages = self.buildPageList()
        self.pagecount = 0
        self.bytecount = 0
        for page in pages:
            self.exportPage(page)
        print("I: Exporting images.")
        for image in self.buildImageList(pages):
            self.exportImage(image)

        print("I: Exported %d bytes in %d objects." % (
            self.bytecount, self.pagecount))
        return 0