def test_getPageTitles(self):
    # This test downloads the title list using both the API and index.php,
    # compares the two lists in length and title by title, and checks the
    # presence of some special titles, like ones with odd chars.
    # The tested wikis are from different wikifarms and some are standalone.
    print '\n', '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73
    tests = [
        # Standalone wikis
        ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
        ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
        # Test old allpages API behaviour
        ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],
        # Test BOM encoding
        ['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'],
    ]

    session = requests.Session()
    session.headers = {'User-Agent': getUserAgent()}
    for index, api, pagetocheck in tests:
        # Testing with API
        print '\nTesting', api
        print 'Trying to parse', pagetocheck, 'with API'
        config_api = {'api': api, 'index': '', 'delay': 0, 'namespaces': ['all'],
                      'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
        getPageTitles(config=config_api, session=session)
        titles_api = './%s-%s-titles.txt' % (domain2prefix(config=config_api), config_api['date'])
        result_api = open(titles_api, 'r').read().splitlines()
        os.remove(titles_api)
        self.assertTrue(pagetocheck in result_api)

        # Testing with index
        print 'Testing', index
        print 'Trying to parse', pagetocheck, 'with index'
        config_index = {'index': index, 'api': '', 'delay': 0, 'namespaces': ['all'],
                        'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
        getPageTitles(config=config_index, session=session)
        titles_index = './%s-%s-titles.txt' % (domain2prefix(config=config_index), config_index['date'])
        result_index = open(titles_index, 'r').read().splitlines()
        os.remove(titles_index)
        self.assertTrue(pagetocheck in result_index)

        self.assertEqual(len(result_api), len(result_index))

        # Compare every page in both lists, with/without API
        c = 0
        for pagename_api in result_api:
            self.assertEqual(
                pagename_api.decode('utf8'),
                result_index[c].decode('utf8'),
                u'{0} and {1} are different'.format(
                    pagename_api.decode('utf8'),
                    result_index[c].decode('utf8')))
            c += 1
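
# A minimal sketch (not part of the test suite) of driving getPageTitles directly,
# mirroring the config dict and file naming the test above uses. The wiki URL is a
# placeholder, and the import of getPageTitles/getUserAgent/domain2prefix from
# dumpgenerator is assumed from how the test calls them.
def example_list_titles(api_url):
    import datetime
    import requests
    from dumpgenerator import getPageTitles, getUserAgent, domain2prefix

    session = requests.Session()
    session.headers = {'User-Agent': getUserAgent()}
    config = {'api': api_url, 'index': '', 'delay': 0,
              'namespaces': ['all'], 'exnamespaces': [],
              'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
    getPageTitles(config=config, session=session)
    # getPageTitles writes a <prefix>-<date>-titles.txt file; read it back
    titlesfile = './%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
    return open(titlesfile, 'r').read().splitlines()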
def main():
    if len(sys.argv) < 2:
        print 'python script.py file-with-apis.txt'
        sys.exit()

    print 'Reading list of APIs from', sys.argv[1]
    wikis = open(sys.argv[1], 'r').read().splitlines()
    print '%d APIs found' % (len(wikis))

    for wiki in wikis:
        print "#" * 73
        print "# Downloading", wiki
        print "#" * 73
        wiki = wiki.lower()
        # Make the prefix in the standard way; api and index must both be defined,
        # it is not important which is which
        prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})

        # Check if compressed; in that case the dump was finished previously
        compressed = False
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith(prefix) and f.endswith('.7z'):
                        compressed = True
                        zipfilename = f
                break  # Stop searching, do not explore subdirectories

        if compressed:
            print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
            # Get the archive's file list.
            if (((sys.version_info[0] == 3) and (sys.version_info[1] > 0)) or
                    ((sys.version_info[0] == 2) and (sys.version_info[1] > 6))):
                archivecontent = subprocess.check_output(['7z', 'l', zipfilename])
                if re.search(ur"%s.+-history\.xml" % (prefix), archivecontent) is None:
                    # We should perhaps not create an archive in this case, but we continue anyway.
                    print "ERROR: The archive contains no history!"
                if re.search(ur"Special:Version\.html", archivecontent) is None:
                    print "WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish."
            else:
                print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
                # TODO: Find a way like grep -q below without doing a 7z l multiple times?
            continue
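
# A small sketch of the "already compressed?" check used in main() above, pulled
# out as a helper. The prefix/.7z naming convention and the non-recursive search
# are taken from that loop; the helper name itself is illustrative.
def find_existing_7z(prefix, path='.'):
    """Return the first <prefix>*.7z file in `path` (non-recursive), or None."""
    import os
    for f in os.listdir(path):
        if f.startswith(prefix) and f.endswith('.7z'):
            return f
    return None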
def upload(wikis, config={}):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}

    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # Logo path
            logourl = ''

            if ismissingitem or config['update']:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                raw = ''
                try:
                    f = urllib.urlopen(baseurl)
                    raw = f.read()
                    f.close()
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                try:
                    logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
                except:
                    pass
                print logourl

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config['collection'],
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }

            #Upload files and update metadata
            try:
                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
                item.modify_metadata(md)  # update
                print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
                if logourl:
                    logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
                    logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
                    logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                    item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
            except:
                print wiki, dump, 'error when uploading?'

            c += 1
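
# A sketch of the same "is this dump already in the item?" md5 check done above,
# computed with hashlib instead of shelling out to md5sum. get_item() and the
# 'md5' key on item.files come from the internetarchive library already used in
# upload(); the helper name and chunk size are illustrative.
def dump_already_uploaded(wikiname, dumppath):
    import hashlib
    from internetarchive import get_item

    md5 = hashlib.md5()
    with open(dumppath, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            md5.update(chunk)
    item = get_item('wiki-' + wikiname)
    # item.files is a list of dicts, each carrying the server-side md5
    return md5.hexdigest() in [fd.get('md5') for fd in item.files]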
def upload(wikis, config={}, uploadeddumps=[]):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}
    dumpdir = config.wikidump_dir

    filelist = os.listdir(dumpdir)
    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        configtemp = config
        try:
            prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        except KeyError:
            print "ERROR: could not produce the prefix for %s" % wiki
        config = configtemp

        wikiname = prefix.split('-')[0]
        dumps = []
        for f in filelist:
            if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                print "%s found" % f
                dumps.append(f)
                # Re-introduce the break here if you only need to upload one file
                # and the I/O is too slow
                # break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config.prune_directories:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config.prune_wikidump and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    print "Checking content in previously uploaded files"
                    stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified', config)
                        rmline = 'rm -rf %s' % dumpdir + '/' + dump
                        if not os.system(rmline):
                            print 'DELETED ' + dumpdir + '/' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing', config)
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue
            else:
                print '%s was not uploaded before' % dump

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # Logo path
            logourl = ''

            if ismissingitem or config.update:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req, timeout=10)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req, timeout=10)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                raw = ''
                try:
                    # urllib.urlopen() has no timeout parameter in Python 2, so the
                    # original call raised TypeError and was swallowed by the bare
                    # except; urllib2.urlopen() supports timeout.
                    f = urllib2.urlopen(baseurl, timeout=10)
                    raw = f.read()
                    f.close()
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                try:
                    logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
                except:
                    pass

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config.collection,
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }

            #Upload files and update metadata
            try:
                item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
                item.modify_metadata(md)  # update
                print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok', config)
                if logourl:
                    # urllib2 again: urllib.urlopen() does not accept a timeout argument
                    logo = StringIO.StringIO(urllib2.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
                    logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
                    logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                    item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
            except Exception as e:
                print wiki, dump, 'Error when uploading?'
                print e.message
            c += 1
def upload(wikis, config={}):
    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    headers = {'User-Agent': dumpgenerator.getUserAgent()}
                    itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
                    if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            headers = {'User-Agent': dumpgenerator.getUserAgent()}
            itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
            if urllib2.urlopen(itemdata).read() == '{}':
                ismissingitem = True
            else:
                ismissingitem = False

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            #creates curl command
            curl = [
                'curl', '--location',
                '--header', "'x-amz-auto-make-bucket:1'",  # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                '--header', "'x-archive-queue-derive:0'",
                '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
            ]
            if c == 0:
                curl += [
                    '--header', "'x-archive-meta-mediatype:web'",
                    '--header', "'x-archive-meta-collection:%s'" % (config['collection']),
                    '--header', quoteattr('x-archive-meta-title:' + wikititle),
                    '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-language:' + lang),
                    '--header', "'x-archive-meta-last-updated-date:%s'" % (wikidate_text),
                    '--header', "'x-archive-meta-subject:%s'" % ('; '.join(wikikeys)),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    '--header', quoteattr('x-archive-meta-licenseurl:' + wikilicenseurl),
                    '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
                ]
            curl += [
                '--upload-file', "%s" % (dump),
                "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump),  # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
                '> /dev/null',  # FIXME: Must be NUL instead on Windows, how to make compatible?
            ]

            #now also to update the metadata
            #TODO: not needed for the second file in an item
            curlmeta = [
                'curl --silent',
                '--data-urlencode -target=metadata',
                """--data-urlencode -patch='{"replace":"/last-updated-date", "value":"%s"}'""" % (wikidate_text),
                '--data-urlencode access=' + accesskey,
                '--data-urlencode secret=' + secretkey,
                'http://archive.org/metadata/wiki-' + wikiname,
                '> /dev/null'
            ]
            curlline = ' '.join(curl)
            curlmetaline = ' '.join(curlmeta)
            if not os.system(curlline):
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
                if not ismissingitem:
                    os.system(curlmetaline)
            c += 1
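
# A sketch of the same last-updated-date patch the curlmeta command above sends,
# using requests instead of shelling out to curl. The endpoint and form fields
# (-target, -patch, access, secret) are copied from that command and are not
# verified independently against the archive.org metadata-write API docs.
def update_last_updated_date(wikiname, wikidate_text, accesskey, secretkey):
    import json
    import requests

    return requests.post(
        'http://archive.org/metadata/wiki-' + wikiname,
        data={
            '-target': 'metadata',
            '-patch': json.dumps({'replace': '/last-updated-date',
                                  'value': wikidate_text}),
            'access': accesskey,
            'secret': secretkey,
        })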
import os
import re
import subprocess
import sys
import time

import dumpgenerator

wikis = open(sys.argv[1], 'r').read().splitlines()

for wiki in wikis:
    print "#" * 73
    print "# Downloading", wiki
    print "#" * 73
    wiki = wiki.lower()
    # Make the prefix in the standard way; api and index must both be defined,
    # it is not important which is which
    prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})

    # Check if compressed; in that case the dump is finished
    compressed = False
    for dirname, dirnames, filenames in os.walk('.'):
        if dirname == '.':
            for f in filenames:
                if f.startswith(prefix) and f.endswith('.7z'):
                    compressed = True
                    zipfilename = f
            break  # Stop searching, do not explore subdirectories

    if compressed:
        print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
        # Get the archive's file list.
        if (((sys.version_info[0] == 3) and (sys.version_info[1] > 0)) or
                ((sys.version_info[0] == 2) and (sys.version_info[1] > 6))):
            archivecontent = subprocess.check_output(['7z', 'l', zipfilename])
            if re.search(ur"%s.+-history\.xml" % (prefix), archivecontent) is None:
                print "ERROR: The archive contains no history!"
            if re.search(ur"Special:Version\.html", archivecontent) is None:
                print "WARNING: The archive doesn't contain Special:Version.html, this may indicate that download didn't finish."
        else:
            print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
def upload(wikis, config={}):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}

    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')
                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config['collection'],
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl,
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }

            #now also to update the metadata
            #TODO: not needed for the second file in an item
            try:
                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
            except:
                log(wiki, dump, 'error?')

            c += 1
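
# The helpers log() and the uploadeddumps list are referenced throughout these
# upload() variants but are not shown in this excerpt. A hypothetical, minimal
# implementation could use an append-only text log that is re-read on startup;
# the file name and line format below are illustrative only, not the project's
# actual format.
def log(wiki, dump, status, logfile='uploader-log.txt'):
    with open(logfile, 'a') as f:
        f.write('%s;%s;%s\n' % (wiki, dump, status))

def load_uploaded_dumps(logfile='uploader-log.txt'):
    try:
        return [l.split(';')[1] for l in open(logfile, 'r').read().splitlines() if l]
    except IOError:
        return []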