def getIndex(config={}):
    """ Returns Index.php for a wiki, if available """

    index = ''
    if config['wikiengine'] == 'mediawiki':
        import mediawiki
        index = mediawiki.mwGetIndex(config=config)
    return index
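
# Usage sketch (illustrative only, not part of the module): getIndex() is
# engine-aware and currently only handles MediaWiki, so it expects the config
# dict built by getParameters() below; exactly which keys mwGetIndex() reads
# from that dict is an assumption here.
#
#   config = getParameters()          # parses sys.argv into a config dict
#   if config['wikiengine'] == 'mediawiki':
#       print(getIndex(config=config))
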
def getParameters(params=[]):
    """ Import parameters into variable """

    if not params:
        params = sys.argv

    config = {}
    parser = argparse.ArgumentParser(
        description='Tools for downloading and preserving wikis.')

    # General params
    parser.add_argument(
        '-v', '--version', action='version', version=getVersion())
    parser.add_argument(
        '--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.")
    parser.add_argument(
        '--delay', metavar=5, default=0, type=float,
        help="Adds a delay (in seconds).")
    parser.add_argument(
        '--retries', metavar=5, default=5, help="Maximum number of retries.")
    parser.add_argument('--path', help='Path to store wiki dump at.')
    parser.add_argument(
        '--resume', action='store_true',
        help='Resumes previous incomplete dump (requires --path).')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
        '--pass', dest='password',
        help='Password if authentication is required.')

    # URL params
    # This script should work with any general URL, finding out
    # API, index.php or whatever by itself when necessary
    groupWiki = parser.add_argument_group()
    groupWiki.add_argument(
        'wiki', default='', nargs='?',
        help="URL to wiki (e.g. http://wiki.domain.org).")
    # URL params for MediaWiki
    groupWiki.add_argument(
        '--mwapi',
        help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).")
    groupWiki.add_argument(
        '--mwindex',
        help="URL to MediaWiki index.php (e.g. http://wiki.domain.org/w/index.php).")

    # Download params
    groupDownload = parser.add_argument_group(
        'Data to download', 'What info to download from the wiki')
    groupDownload.add_argument(
        '--pages', action='store_true',
        help="Generates a dump of pages (--pages --curonly for current revisions only).")
    groupDownload.add_argument(
        '--curonly', action='store_true',
        help='Store only the current version of pages.')
    groupDownload.add_argument(
        '--images', action='store_true', help="Generates an image dump.")
    groupDownload.add_argument(
        '--namespaces', metavar="1,2,3",
        help='Comma-separated value of namespaces to include (all by default).')
    groupDownload.add_argument(
        '--exnamespaces', metavar="1,2,3",
        help='Comma-separated value of namespaces to exclude.')

    # Meta info params
    groupMeta = parser.add_argument_group(
        'Meta info', 'What meta info to retrieve from the wiki')
    groupMeta.add_argument(
        '--get-api', action='store_true',
        help="Returns wiki API when available.")
    groupMeta.add_argument(
        '--get-index', action='store_true',
        help="Returns wiki Index.php when available.")
    groupMeta.add_argument(
        '--get-page-titles', action='store_true',
        help="Returns wiki page titles.")
    groupMeta.add_argument(
        '--get-image-names', action='store_true',
        help="Returns wiki image names.")
    groupMeta.add_argument(
        '--get-namespaces', action='store_true',
        help="Returns wiki namespaces.")
    groupMeta.add_argument(
        '--get-wiki-engine', action='store_true', help="Returns wiki engine.")

    args = parser.parse_args()
    # sys.stderr.write(args)

    # No wiki? Exit
    if not args.wiki:
        sys.stderr.write('ERROR: Provide a URL to a wiki\n')
        parser.print_help()
        sys.exit(1)

    # Don't mix download params and meta info params
    if (args.pages or args.images) and \
            (args.get_api or args.get_index or args.get_page_titles or
             args.get_image_names or args.get_namespaces or args.get_wiki_engine):
        sys.stderr.write('ERROR: Don\'t mix download params and meta info params\n')
        parser.print_help()
        sys.exit(1)

    # No download params and no meta info params? Exit
    if (not args.pages and not args.images) and \
            (not args.get_api and not args.get_index and not args.get_page_titles and
             not args.get_image_names and not args.get_namespaces and
             not args.get_wiki_engine):
        sys.stderr.write('ERROR: Use at least one download param or meta info param\n')
        parser.print_help()
        sys.exit(1)

    # Load cookies
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        sys.stderr.write('Using cookies from %s\n' % args.cookies)

    # Check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        sys.stderr.write('ERROR: Both --user and --pass are required for authentication.\n')
        parser.print_help()
        sys.exit(1)

    session = None
    if args.user and args.password:
        import requests
        session = requests.Session()
        session.cookies = cj
        session.headers.update({'User-Agent': getUserAgent()})
        session.auth = (args.user, args.password)
        # MediaWiki-centric, be careful:
        # session.mount(args.mwapi.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))

    # Check URLs
    for url in [args.mwapi, args.mwindex, args.wiki]:
        if url and (not url.startswith('http://') and not url.startswith('https://')):
            sys.stderr.write(url)
            sys.stderr.write('ERROR: URLs must start with http:// or https://\n')
            parser.print_help()
            sys.exit(1)

    # Meta info params
    metainfo = ''  # only one allowed, so we don't mix output
    if args.get_api:
        metainfo = 'get_api'
    elif args.get_index:
        metainfo = 'get_index'
    elif args.get_page_titles:
        metainfo = 'get_page_titles'
    elif args.get_image_names:
        metainfo = 'get_image_names'
    elif args.get_namespaces:
        metainfo = 'get_namespaces'
    elif args.get_wiki_engine:
        metainfo = 'get_wiki_engine'

    namespaces = ['all']
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
        # FIXME: why is '-' allowed? And '--namespaces= all' (with a space) works?
        if re.search(r'[^\d, \-]', args.namespaces) and \
                args.namespaces.lower() != 'all':
            sys.stderr.write(
                "Invalid namespace values.\nValid format is integer(s) separated by commas\n")
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.namespaces)
            if ns.lower() == 'all':
                namespaces = ['all']
            else:
                namespaces = [int(i) for i in ns.split(',')]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r'[^\d, \-]', args.exnamespaces):
            sys.stderr.write(
                "Invalid namespace values.\nValid format is integer(s) separated by commas\n")
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.exnamespaces)
            if ns.lower() == 'all':
                sys.stderr.write('You cannot exclude all namespaces.\n')
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(',')]

    # --curonly requires --pages
    if args.curonly and not args.pages:
        sys.stderr.write("--curonly requires --pages\n")
        parser.print_help()
        sys.exit(1)

    config = {
        'cookies': args.cookies or '',
        'curonly': args.curonly,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'delay': args.delay,
        'exnamespaces': exnamespaces,
        'images': args.images,
        'logs': False,
        'metainfo': metainfo,
        'namespaces': namespaces,
        'pages': args.pages,
        'path': args.path and os.path.normpath(args.path) or '',
        'retries': int(args.retries),
        'wiki': args.wiki,
        'wikicanonical': '',
        'wikiengine': getWikiEngine(args.wiki),
        'other': {
            'configfilename': 'config.txt',
            'filenamelimit': 100,  # do not change
            'force': args.force,
            'resume': args.resume,
            'session': session,
        }
    }

    # Get ready special variables (API for MediaWiki, etc.)
    if config['wikiengine'] == 'mediawiki':
        import mediawiki
        config['mwexport'] = 'Special:Export'
        if not args.mwapi:
            config['mwapi'] = mediawiki.mwGetAPI(config=config)
            if not config['mwapi']:
                sys.stderr.write('ERROR: Provide a URL to API\n')
                sys.exit(1)
            else:
                data = {
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'namespaces',
                    'format': 'json'}
                r = getURL(config['mwapi'], data=data)
                config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \
                    + ':Export'
        if not args.mwindex:
            config['mwindex'] = mediawiki.mwGetIndex(config=config)
            if not config['mwindex']:
                sys.stderr.write('ERROR: Provide a URL to Index.php\n')
                sys.exit(1)
    elif config['wikiengine'] == 'wikispaces':
        import wikispaces
        # use wikicanonical for base url for Wikispaces?

    # Calculate path if not defined by the user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (
            domain2prefix(config=config), config['date'])

    return config
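
# Entry-point sketch (assumption: the names main() and the printing logic
# below are illustrative, not the project's actual CLI; only getParameters(),
# getIndex() and the config keys they use come from this module):
#
#   def main():
#       config = getParameters(params=sys.argv)
#       if config['metainfo'] == 'get_wiki_engine':
#           print(config['wikiengine'])
#       elif config['metainfo'] == 'get_index':
#           print(getIndex(config=config))
#
#   if __name__ == '__main__':
#       main()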