def test_capture(self):
    random_number = random.choice(range(0, 1000))
    url = "http://www.example.com/my-random-page-{}".format(random_number)
    archive_url_1, c1 = savepagenow.capture_or_cache(url)
    self.assertTrue(archive_url_1.startswith("http://web.archive.org/"))
    # Test CacheError
    archive_url_2, c2 = savepagenow.capture_or_cache(
        url,
        user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
    )
    self.assertTrue(archive_url_2.startswith("http://web.archive.org/"))
    self.assertEqual(archive_url_1, archive_url_2)
def _save(self, l):
    try:
        archive_url, _ = savepagenow.capture_or_cache(l)
        return Bunch(url=l, archive_url=archive_url, ok=1)
    except savepagenow.api.CachedPage as e:
        # The cached snapshot's URL is the last token of the exception message
        _, archive_url = str(e).rsplit(None, 1)
        return Bunch(url=l, archive_url=archive_url, ok=0)
    # except savepagenow.api.WaybackRuntimeError as e:
    except Exception as e:
        status_code = None
        if len(e.args) == 1 and isinstance(e.args[0], dict):
            r = e.args[0]
            txt = r.get("headers", {}).get("Link", None)
            status_code = r.get("status_code", None)
            if txt:  # and r.get("status_code", None) in (200, 206):
                # Pull candidate snapshot URLs out of the Link response header
                _re = re.compile(
                    "(" + re.escape("https://web.archive.org/web/") +
                    r"\d+/.*?" + re.escape(get_dom(l)) + ".*?)>;"
                )
                archive_url = _re.findall(txt)
                if archive_url:
                    archive_url = sorted(archive_url)[-1]
                    return Bunch(url=l, archive_url=archive_url, ok=0)
        out = Bunch(url=l, archive_url=None, ok=-1, error=e, code=status_code)
        return out
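The handler above recovers a cached snapshot's URL by string-parsing the CachedPage exception and the Link header. A simpler route, assuming you only need the cached URL and not the header metadata, is savepagenow's accept_cache flag, which returns the cached snapshot instead of raising. A minimal sketch (import path as used in the snippet above; newer releases expose the exceptions elsewhere):

import savepagenow
from savepagenow.api import WaybackRuntimeError

def save_simple(url):
    """Archive url, accepting a recent cached snapshot if Wayback offers one."""
    try:
        # accept_cache=True returns the cached snapshot's URL rather than
        # raising CachedPage when the page was captured recently.
        return savepagenow.capture(url, accept_cache=True)
    except WaybackRuntimeError:
        return None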
def test_capture(self):
    """
    Test the basic function of retrieving a URL from Wayback.
    """
    url = "https://www.latimes.com/"
    archive_url, c = savepagenow.capture_or_cache(url)
    self.assertTrue(archive_url.startswith("https://web.archive.org/"))
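For contrast, a minimal sketch of the two library calls these tests exercise: capture returns a bare archive URL and raises CachedPage when Wayback serves a recent snapshot, while capture_or_cache absorbs that case and returns a tuple:

import savepagenow

url = "https://www.latimes.com/"

# capture() returns just the archive URL; it raises CachedPage if the
# Wayback Machine returns a previously captured snapshot instead.
archive_url = savepagenow.capture(url)

# capture_or_cache() traps that exception and reports what happened:
# the boolean is True for a fresh capture, False for a cached snapshot.
archive_url, captured = savepagenow.capture_or_cache(url)
print(archive_url, "fresh" if captured else "cached")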
def save(request):
    # Verify there is a URL to save
    url = request.POST.get("url", None)
    if not url:
        return HttpResponseBadRequest("Bad request")

    # Verify the user is logged in
    user = request.user
    if not user.is_authenticated():
        return HttpResponseBadRequest("Bad request")

    # Send the URL to the Internet Archive.
    # This is required, so we throw an error if it fails.
    logger.debug("Archiving {} for {} to IA".format(url, user))
    try:
        ia_url, ia_captured = savepagenow.capture_or_cache(url)
        logger.debug("Saving memento URL {}".format(ia_url))
        ia_memento = Memento.objects.create(url=ia_url, archive="archive.org")
    except Exception as e:
        logger.error(e)
        return HttpResponseBadRequest(
            "Sorry. This link cannot be archived by archive.org.")

    # Write it all to the database
    clip = Clip.objects.create(user=user, url=url)
    clip.mementos.add(ia_memento)

    # Head back where the user started
    return redirect("/")
def main(args):
    n_done = 0
    n_captured = 0
    n_success = 0
    done_url_set = set()
    # Resume from a previous run: count URLs already written to the output file
    if not args.override and os.path.exists(args.o):
        with open(args.o) as f:
            for line in f:
                url, archive_url = line.split()
                n_done += 1
                if archive_url != 'None':
                    n_success += 1
                done_url_set.add(url)

    events = read_jsonl(args.i)
    urls = [
        url for e in events for url in e['references']
        if url not in done_url_set
    ]
    if args.shuffle:
        random.shuffle(urls)
    n_total = len(urls) + len(done_url_set)

    batch = []
    for url in urls:
        repeat = True
        archive_url, captured = None, None
        while repeat:
            try:
                archive_url, captured = savepagenow.capture_or_cache(url)
                repeat = False
                if captured:
                    n_captured += 1
                n_success += 1
            except Exception as e:
                if isinstance(e, ConnectionError):
                    print('Too many requests, waiting a bit...')
                    repeat = True
                else:
                    repeat = False
            if repeat:
                time.sleep(60)
            else:
                time.sleep(1)
        if archive_url is not None:
            batch.append((url, archive_url, captured))
        n_done += 1
        print(f'total: {n_total}, done: {n_done}, '
              f'successful: {n_success}, captured: {n_captured}\n')
        # Flush a full batch to the output file
        if len(batch) >= args.batchsize:
            lines = [f'{url} {archive_url}' for (url, archive_url, _) in batch]
            with open(args.o, 'a') as f:
                f.write('\n'.join(lines) + '\n')
            batch = []

    # Write any remaining entries
    if batch:
        lines = [f'{url} {archive_url}' for (url, archive_url, _) in batch]
        with open(args.o, 'a') as f:
            f.write('\n'.join(lines) + '\n')
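The retry loop above re-runs forever on ConnectionError with a flat 60-second wait. A minimal sketch of the same idea with a bounded attempt count and exponential backoff (the max_tries and base_delay parameters are illustrative, not part of savepagenow):

import time
import savepagenow
from savepagenow.api import WaybackRuntimeError

def capture_with_backoff(url, max_tries=5, base_delay=2.0):
    """Try capture_or_cache up to max_tries times, doubling the wait each time."""
    for attempt in range(max_tries):
        try:
            return savepagenow.capture_or_cache(url)
        except (ConnectionError, WaybackRuntimeError):
            if attempt == max_tries - 1:
                raise
            time.sleep(base_delay * (2 ** attempt))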
def archive(uri_dic, pageurl, RETRY, ONLYPAGE):
    """
    Save URIs extracted from the target page.
    (using the savepagenow module)
    """
    if ONLYPAGE:
        uri_dic = {i for i in uri_dic if is_page(i)}

    print("[+]Now: %s" % pageurl)
    print("[+]%d URI(s) found." % len(uri_dic))

    # try to throw each uri to the API
    count, saves, fails = 0, 0, 0
    dic_size = len(uri_dic)
    for uri in uri_dic:
        count += 1
        id_ = str(count).zfill(len(str(len(uri_dic))))
        try:
            for j in range(1, RETRY + 1):
                try:
                    print("[%s/%d]: Wait... " % (id_, dic_size), end="\r")
                    # use the "savepagenow" module
                    archived_uri, exist_flag = capture_or_cache(uri)
                    print("[%s/%d]:" % (id_, dic_size),
                          "<%s>" % ("NOW" if exist_flag else "PAST"),
                          archived_uri)
                    saves += 1
                    break
                except WaybackRuntimeError:
                    if j != RETRY:
                        print("[%s/%d]: Retrying..." % (id_, dic_size),
                              "COUNT:%d" % j, end="\r")
                    else:
                        print("[%s/%d]:" % (id_, dic_size), "<FAIL> %s" % uri)
                        fails += 1
                finally:
                    # wait before retrying
                    time.sleep(random.uniform(1, 3))
        except KeyboardInterrupt:
            print("[!]Interrupted!", file=sys.stderr)
            print("[!]Halt.", file=sys.stderr)
            break
        except TooManyRedirects:
            print("[!]API says: TooManyRedirects!", file=sys.stderr)
            print("[!]Need a 1 min break...", file=sys.stderr)
            for t in range(60):
                print("%d/60s" % t, end="\r", file=sys.stderr)
                time.sleep(1)

    # after for-loop
    print("[+]FIN!: %s" % pageurl)
    print("[+]ALL:", count, "SAVE:", saves, "FAIL:", fails)
def try_archive(self, url):
    try:
        # print("[%s/%d]: Wait... " % (id_, dic_size), end="\r")
        archive_uri, exist_f = capture_or_cache(url)
        # print("[%s/%d]:" % (id_, dic_size), end=" ")
        print("<%s>" % ("NOW" if exist_f else "PAST"), archive_uri)
        return True
    except WaybackRuntimeError:
        return False
def try_archive(id_, dic_size, uri):
    """Try to save a page on the Wayback Machine."""
    try:
        print("[%s/%d]: Wait... " % (id_, dic_size), end="\r")
        archive_uri, exist_f = capture_or_cache(uri)
        print("[%s/%d]:" % (id_, dic_size), end=" ")
        print("<%s>" % ("NOW" if exist_f else "PAST"), archive_uri)
        return True
    except WaybackRuntimeError:
        return False
async def archive_target(self, target):
    """Archive the target URL and return the archived URL."""
    archive_url = savepagenow.capture_or_cache(target)
    message = f"target {target} has been archived"
    # This logs to the docker logs
    self.logger.info(message)
    return archive_url[0]
def get_all_tweets(screen_name):
    if consumer_key == "":
        print("You need to set up the script first. Edit it and add your keys.")
        return

    # Twitter only allows access to a user's most recent 3240 tweets with this method
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {0}".format(oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest, include_entities=True,
                                       tweet_mode='extended')

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...{0} tweets downloaded so far".format(len(alltweets)))

    for tweet in alltweets:
        tweetID = tweet.id_str
        tweetURL = "https://twitter.com/{0}/status/{1}".format(screen_name, tweetID)
        print("Archiving {0}...".format(tweetURL))
        archive_url = archiveis.capture(tweetURL)
        archiveorg_url = savepagenow.capture_or_cache(tweetURL)
        print("Tweet archived! archive.is: {0} ||| archive.org: {1}".format(
            archive_url, archiveorg_url[0]))

    print("All tweets successfully archived.")
def ia_memento(clip_id):
    """
    Archive a clip with archive.org
    """
    clip = Clip.objects.get(id=clip_id)
    logger.debug("Archiving {} with savepagenow version {}".format(
        clip.url, savepagenow.__version__))
    try:
        ia_url, ia_captured = savepagenow.capture_or_cache(clip.url)
        ia_memento = Memento.objects.create(url=ia_url, archive="archive.org")
        logger.debug("Created {}".format(ia_memento))
        clip.mementos.add(ia_memento)
    except Exception as e:
        logger.debug("archive.org failed")
        logger.debug(e)
def try_archive(id_, dic_size, uri):
    """Try to save a page on the Wayback Machine."""
    try:
        print("[%s/%d]: Wait... " % (id_, dic_size), end="\r")
        time.sleep(random.uniform(1, 3))
        archive_uri, exist_f = capture_or_cache(
            uri,
            user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) '
                       'AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 '
                       'Mobile/14C92 Safari/602.1'
        )
        print("[%s/%d]:" % (id_, dic_size), end=" ")
        print("<%s>" % ("NOW" if exist_f else "PAST"), archive_uri)
        return True
    except WaybackRuntimeError:
        return False
def update_csv(csv_name, dates, snapshot=False, csv_output='output.csv'):
    '''
    Updates a csv with the date of the latest snapshot before a given date,
    its wayback url, today's date, and today's wayback snapshot of the url.

    Inputs:
        - csv_name: (str) name of the csv file from which to pull the urls.
          The target column must be named url or URL
        - dates: (list) latest date until which snapshots should be requested
          from the wayback machine
        - snapshot: (bool) if set to True, a snapshot of the url is taken

    Returns: writes a csv to the working directory
    '''
    url_lst = get_links(csv_name)  # get urls
    latest_urls_lst = []
    latest_dates_lst = []
    current_urls_lst = []
    current_dates_lst = []
    today = datetime.today().strftime('%m-%d-%Y')

    for url in url_lst:
        latest_date, latest_url = get_latest_wayback(url, dates)
        latest_urls_lst.append(latest_url)
        if latest_date:
            latest_date = latest_date.strftime('%m-%d-%Y')
        latest_dates_lst.append(latest_date)
        if snapshot:
            current_dates_lst.append(today)
            try:
                current_url = savepagenow.capture_or_cache(url)
                current_urls_lst.append(current_url[0])
            except Exception as e:
                current_urls_lst.append(e)

    all_lsts, all_cols = zip_lists([
        latest_urls_lst, latest_dates_lst, current_urls_lst, current_dates_lst
    ])
    df = pd.DataFrame(all_lsts, columns=all_cols)
    df.to_csv(csv_output, index=False)
def parse(self, response):
    response_data = json.loads(response.body)
    total_records = response_data['iTotalRecords']
    for item in response_data['aaData']:
        band = Band()
        match = re.search(r'<a href=".*/(\d+)">(.*)</a>.*', item[0])
        band['name'] = match.group(2)
        band['metalarchives_id'] = match.group(1)

        # Regex to extract the band URL from the <a> tag
        url = re.search('href="([^"]*)', item[0])
        band['url'] = url.group(1)

        band["wayback_link"] = savepagenow.capture_or_cache(
            band['url'], force_utf8=True)[0]

        self.fetched += 1
        yield band

    if self.fetched < total_records:
        url = self.start_urls[0] + '&iDisplayStart=%s' % self.fetched
        yield scrapy.Request(url, callback=self.parse)
def archive_object(content_type_pk, object_pk):
    """
    Archive the provided object.
    """
    from .models import Memento

    # Get the object
    ct = ContentType.objects.get_for_id(content_type_pk)
    obj = ct.get_object_for_this_type(pk=object_pk)
    logger.debug("Archiving {}".format(obj))

    # Get the URL we're going to save
    archive_url = obj.get_archive_url()

    # Archive it
    ia_url, ia_captured = savepagenow.capture_or_cache(archive_url)

    # Save the archived URL
    logger.debug("Saving memento URL {}".format(ia_url))
    ia_memento = Memento.objects.create(
        content_type=ct,
        object_pk=obj.pk,
        url=ia_url
    )
    logger.debug("Created {}".format(ia_memento))
import savepagenow
import archiveis
import time

# Add entries to a list of websites:
# websitelist = ["www.example.com",
#                "www.example2.com",
#                ]
#
# Or create a text file called websitefile.txt and put a separate URL
# on each line. Then save the text file to the same path as this
# Python script.

# Open the text file with the list in read mode
websitefile = open("websitefile.txt", "r").read().splitlines()

# If using the list method instead, replace "websitefile" in the
# following loop with "websitelist".
for k in websitefile:
    archive_url = savepagenow.capture_or_cache(k)
    print(archive_url)
    archiveis_url = archiveis.capture(k)
    print(archiveis_url)
    # Wait a second between captures
    time.sleep(1)
    ssht.has_crop = True
    ssht.save()

# HTML screenshotting where it is turned on
if site.has_html_screenshots:
    logger.info("Logging HTML for %s" % site.url)
    ssht.html = site.url
    ssht.has_html = True
    ssht.save()

# Internet Archive mementos where turned on
if site.has_internetarchive_mementos:
    logger.info("Adding archive.org memento for %s" % site.url)
    try:
        ia_memento, ia_created = savepagenow.capture_or_cache(
            site.url,
            user_agent="pastpages.org ([email protected])"
        )
        if ia_created:
            memento = Memento.objects.create(
                site=site,
                update=update,
                archive='archive.org',
                url=ia_memento,
            )
        else:
            logger.info("Internet Archive returned a cached memento")
    except Exception:
        logger.info("Adding Internet Archive memento failed")

# webcitation mementos where turned on
if site.has_webcitation_mementos:
def archive(self, url):
    page, is_not_cache = savepagenow.capture_or_cache(url)
    return page
# Create screenshot object with Internet Archive link
ssht.internetarchive_id = item.identifier
logger.debug("Setting internetarchive_id as {}".format(item.identifier))

# Save again
ssht.save()

# Remove images from the local filesystem
[os.remove(f) for f in files]

# Internet Archive mementos where turned on
if site.has_internetarchive_mementos:
    logger.info("Adding archive.org memento for %s" % site.url)
    try:
        ia_memento, ia_created = savepagenow.capture_or_cache(
            site.url,
            user_agent="pastpages.org ([email protected])"
        )
        if ia_created:
            memento = Memento.objects.create(
                site=site,
                update=update,
                archive='archive.org',
                url=ia_memento,
            )
        else:
            logger.info("Internet Archive returned a cached memento")
    except Exception:
        logger.info("Adding Internet Archive memento failed")

# Archive.is mementos where turned on
if site.has_archiveis_mementos: