Example #1
    def test_capture(self):
        random_number = random.choice(range(0, 1000))
        url = "http://www.example.com/my-random-page-{}".format(random_number)
        archive_url_1, c1 = savepagenow.capture_or_cache(url)
        self.assertTrue(archive_url_1.startswith("http://web.archive.org/"))

        # A second capture of the same URL should come back from the cache
        archive_url_2, c2 = savepagenow.capture_or_cache(
            url,
            user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
        )
        self.assertTrue(archive_url_2.startswith("http://web.archive.org/"))
        self.assertEqual(archive_url_1, archive_url_2)
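The boolean returned alongside the URL (c1/c2 above) distinguishes a fresh capture from one the Wayback Machine already had on hand. Below is a minimal sketch of the behaviour capture_or_cache() wraps, using the CachedPage exception that other examples here import from savepagenow.api; it is a simplified illustration, not the library's exact implementation.

import savepagenow

url = "http://www.example.com/"

# capture() requests a brand-new snapshot and raises CachedPage when the
# Wayback Machine hands back a previously saved copy instead.
try:
    archive_url = savepagenow.capture(url)
    freshly_captured = True
except savepagenow.api.CachedPage:
    # capture_or_cache() absorbs this case and simply reports it via its
    # second return value, which is what c1/c2 represent in the test above.
    archive_url, freshly_captured = savepagenow.capture_or_cache(url)

print(archive_url, "fresh" if freshly_captured else "cached")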
Example #2
 def _save(self, l):
     try:
         archive_url, _ = savepagenow.capture_or_cache(l)
         return Bunch(url=l, archive_url=archive_url, ok=1)
     except savepagenow.api.CachedPage as e:
         # The exception message is expected to end with the archived URL
         _, archive_url = str(e).rsplit(None, 1)
         return Bunch(url=l, archive_url=archive_url, ok=0)
     # except savepagenow.api.WaybackRuntimeError as e:
     except Exception as e:
         status_code = None
         if len(e.args) == 1 and isinstance(e.args[0], dict):
             # WaybackRuntimeError is raised with a dict of response details;
             # its "Link" header may still point at an existing snapshot
             r = e.args[0]
             txt = r.get("headers", {}).get("Link", None)
             status_code = r.get("status_code", None)
             if txt:  # and r.get("status_code", None) in (200, 206):
                 _re = re.compile(
                     "(" + re.escape("https://web.archive.org/web/") +
                     r"\d+/.*?" + re.escape(get_dom(l)) + ".*?)>;")
                 archive_url = _re.findall(txt)
                 if archive_url:
                     archive_url = sorted(archive_url)[-1]
                     return Bunch(url=l, archive_url=archive_url, ok=0)
         out = Bunch(url=l,
                     archive_url=None,
                     ok=-1,
                     error=e,
                     code=status_code)
         return out
Example #3
 def test_capture(self):
     """
     Test the basic function of retrieving a URL from Wayback.
     """
     url = "https://www.latimes.com/"
     archive_url, c = savepagenow.capture_or_cache(url)
     self.assertTrue(archive_url.startswith("https://web.archive.org/"))
Example #4
def save(request):
    # Verify there is a URL to save
    url = request.POST.get("url", None)
    if not url:
        return HttpResponseBadRequest("Bad request")

    # Verify the user is logged in
    user = request.user
    if not user.is_authenticated():
        return HttpResponseBadRequest("Bad request")

    # Send the URL to the Internet Archive
    # This comes first so we can throw an error if it fails
    logger.debug("Archiving {} for {} to IA".format(url, user))
    try:
        ia_url, ia_captured = savepagenow.capture_or_cache(url)
        logger.debug("Saving memento URL {}".format(ia_url))
        ia_memento = Memento.objects.create(url=ia_url, archive="archive.org")
    except Exception as e:
        logger.error(e)
        return HttpResponseBadRequest(
            "Sorry. This link cannot be archived by archive.org.")

    # Write it all to the database
    clip = Clip.objects.create(user=user, url=url)
    clip.mementos.add(ia_memento)

    # Head back where the user started
    return redirect("/")
Example #5
def main(args):
    n_done = 0
    n_captured = 0
    n_success = 0
    done_url_set = set()
    if not args.override and os.path.exists(args.o):
        with open(args.o) as f:
            for line in f:
                url, archive_url = line.split()
                n_done += 1
                if archive_url != 'None':
                    n_success += 1
                done_url_set.add(url)

    events = read_jsonl(args.i)
    urls = [
        url for e in events for url in e['references']
        if url not in done_url_set
    ]
    if args.shuffle:
        random.shuffle(urls)
    n_total = len(urls) + len(done_url_set)

    batch = []
    for url in urls:

        repeat = True
        archive_url, captured = None, None
        while repeat:
            try:
                archive_url, captured = savepagenow.capture_or_cache(url)
                repeat = False
                if captured:
                    n_captured += 1
                n_success += 1
            except Exception as e:
                if isinstance(e, ConnectionError):
                    print('Too many requests, waiting a bit...')
                    repeat = True
                else:
                    repeat = False
            if repeat:
                time.sleep(60)
            else:
                time.sleep(1)

            if archive_url is not None:
                batch.append((url, archive_url, captured))
        n_done += 1

        print(f'total: {n_total}, done: {n_done}, '
              f'successful: {n_success}, captured: {n_captured}\n')

        # Flush the batch to the output file once it reaches the requested size
        if len(batch) >= args.batchsize:
            lines = [f'{url} {archive_url}' for (url, archive_url, _) in batch]
            if len(lines) > 0:
                with open(args.o, 'a') as f:
                    f.write('\n'.join(lines) + '\n')
            batch = []

    # Write out anything left in the final, partial batch
    if batch:
        lines = [f'{url} {archive_url}' for (url, archive_url, _) in batch]
        with open(args.o, 'a') as f:
            f.write('\n'.join(lines) + '\n')
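main() reads i, o, batchsize, shuffle, and override off its args object. A minimal argparse setup supplying them might look like the sketch below; the flag names, help strings, and defaults are guesses inferred from the function body, not taken from the original script.

import argparse

def build_parser():
    parser = argparse.ArgumentParser(
        description="Archive event reference URLs with savepagenow")
    parser.add_argument("-i", help="input .jsonl file of events with a 'references' list")
    parser.add_argument("-o", help="output file of '<url> <archive_url>' lines")
    parser.add_argument("--batchsize", type=int, default=10,
                        help="number of results to buffer before writing")
    parser.add_argument("--shuffle", action="store_true",
                        help="shuffle the pending URLs before archiving")
    parser.add_argument("--override", action="store_true",
                        help="ignore any existing output file and start over")
    return parser

if __name__ == "__main__":
    main(build_parser().parse_args())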
Example #6
def archive(uri_dic, pageurl, RETRY, ONLYPAGE):
    """
  Save URIs extracted from the target page.
  (by using Module savepagenow)
  """
    if ONLYPAGE:
        uri_dic = {i for i in uri_dic if is_page(i)}

    print("[+]Now: %s" % pageurl)
    print("[+]%d URI(s) found." % len(uri_dic))
    # try to submit each uri to the API
    count, saves, fails = 0, 0, 0
    dic_size = len(uri_dic)

    for uri in uri_dic:
        count += 1

        id_ = str(count).zfill(len(str(len(uri_dic))))

        try:
            for j in range(1, RETRY + 1):
                try:
                    print("[%s/%d]: Wait...    " % (id_, dic_size), end="\r")
                    archived_uri, exist_flag = capture_or_cache(
                        uri)  # via the savepagenow module
                    print("[%s/%d]:" % (id_, dic_size),
                          "<%s>" % ("NOW" if exist_flag else "PAST"),
                          archived_uri)
                    saves += 1
                    break

                except WaybackRuntimeError:
                    if j != RETRY:
                        print("[%s/%d]: Retrying..." % (id_, dic_size),
                              "COUNT:%d" % j,
                              end="\r")
                    else:
                        print("[%s/%d]:" % (id_, dic_size), "<FAIL> %s" % uri)
                        fails += 1
                finally:
                    # wait before retrying
                    time.sleep(random.uniform(1, 3))
        except KeyboardInterrupt:
            print("[!]Interrupted!", file=sys.stderr)
            print("[!]Halt.", file=sys.stderr)
            break

        except TooManyRedirects:
            print("[!]API says: TooManyRedirects!", file=sys.stderr)
            print("[!]Need a 1 min break...", file=sys.stderr)
            for t in range(60):
                print("%d/60s" % t, end="\r", file=sys.stderr)
                time.sleep(1)

    # after for-loop
    print("[+]FIN!: %s" % pageurl)
    print("[+]ALL:", count, "SAVE:", saves, "FAIL:", fails)
Example #7
    def try_archive(self, url):
        try:
            #print("[%s/%d]: Wait...    " % (id_, dic_size), end="\r")
            archive_uri, exist_f = capture_or_cache(url)
            #print("[%s/%d]:" % (id_, dic_size), end=" ")
            print("<%s>" % "NOW" if exist_f else "PAST", archive_uri)
            return True

        except WaybackRuntimeError:
            return False
Example #8
def try_archive(id_, dic_size, uri):
    """Try to save a page on Wayback Machine."""
    try:
        print("[%s/%d]: Wait...    " % (id_, dic_size), end="\r")
        archive_uri, exist_f = capture_or_cache(uri)
        print("[%s/%d]:" % (id_, dic_size), end=" ")
        print("<%s>" % "NOW" if exist_f else "PAST", archive_uri)
        return True

    except WaybackRuntimeError:
        return False
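A small driver for the helper above, looping over a collection of URIs with the zero-padded counter its progress messages expect. The pause between requests mirrors the other examples; the function name and structure here are illustrative, not from the original project.

import random
import time

def archive_all(uris):
    """Feed each URI to try_archive(), pausing briefly between requests."""
    dic_size = len(uris)
    saved = 0
    for count, uri in enumerate(uris, start=1):
        id_ = str(count).zfill(len(str(dic_size)))
        if try_archive(id_, dic_size, uri):
            saved += 1
        # Be polite to the Save Page Now endpoint between requests
        time.sleep(random.uniform(1, 3))
    print("Saved %d of %d URI(s)." % (saved, dic_size))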
Example #9
    async def archive_target(self, target):
        """
        Archive the target and return its archive.org URL.
        """
        # capture_or_cache() returns an (archive_url, captured) tuple
        archive_url = savepagenow.capture_or_cache(target)
        message = f"target {target} has been archived"

        # This logs to the docker logs
        self.logger.info(message)
        return archive_url[0]
Example #10
def get_all_tweets(screen_name):

    if consumer_key == "":
        print("You need to set up the script first. Edit it and add your keys.")
        return

    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before {0}".format(oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest, include_entities=True,
                                       tweet_mode='extended')

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...{0} tweets downloaded so far".format(len(alltweets)))

    # archive each tweet once the full timeline has been collected
    for tweet in alltweets:
        tweetID = tweet.id_str
        tweetURL = "https://twitter.com/{0}/status/{1}".format(screen_name, tweetID)

        print("Archiving {0}...".format(tweetURL))
        archive_url = archiveis.capture(tweetURL)
        archiveorg_url = savepagenow.capture_or_cache(tweetURL)
        print("Tweet archived! archive.is: {0} ||| archive.org: {1}".format(
            archive_url, archiveorg_url[0]))

    print("All tweets successfully archived.")
Example #11
def ia_memento(clip_id):
    """
    Archive a clip with archive.org
    """
    clip = Clip.objects.get(id=clip_id)
    logger.debug("Archiving {} with savepagenow version {}".format(clip.url, savepagenow.__version__))
    try:
        ia_url, ia_captured = savepagenow.capture_or_cache(clip.url)
        ia_memento = Memento.objects.create(url=ia_url, archive="archive.org")
        logger.debug("Created {}".format(ia_memento))
        clip.mementos.add(ia_memento)
    except Exception as e:
        logger.debug("archive.org failed")
        logger.debug(e)
Example #12
def try_archive(id_, dic_size, uri):
    """Try to save a page on Wayback Machine."""
    try:
        print("[%s/%d]: Wait...    " % (id_, dic_size), end="\r")
        time.sleep(random.uniform(1, 3))
        archive_uri, exist_f = capture_or_cache(
            uri,
            user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) '\
                       'AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 '\
                       'Mobile/14C92 Safari/602.1'
        )
        print("[%s/%d]:" % (id_, dic_size), end=" ")
        print("<%s>" % "NOW" if exist_f else "PAST", archive_uri)
        return True

    except WaybackRuntimeError:
        return False
Example #13
def update_csv(csv_name, dates, snapshot=False, csv_output='output.csv'):
    '''
    Updates a csv with the date of the latest snapshot before a given date, its wayback url,
    today's date and today's wayback snapshot of the url.

    Inputs:
        - csv_name: (str) name of the csv file from which to pull the urls. The target
                    column must be named url or URL
        - dates: (list) latest date(s) until which snapshots should be requested from the
                 wayback machine
        - snapshot: (bool) if set to True, a fresh snapshot of each url is taken
        - csv_output: (str) name of the csv file to write

    Returns: writes a csv to the working directory
    '''
    url_lst = get_links(csv_name)  # get urls

    latest_urls_lst = []
    latest_dates_lst = []
    current_urls_lst = []
    current_dates_lst = []
    today = datetime.today().strftime('%m-%d-%Y')

    for url in url_lst:
        latest_date, latest_url = get_latest_wayback(url, dates)
        latest_urls_lst.append(latest_url)
        if latest_date:
            latest_date = latest_date.strftime('%m-%d-%Y')
        latest_dates_lst.append(latest_date)
        if snapshot:
            current_dates_lst.append(today)
            try:
                current_url = savepagenow.capture_or_cache(url)
                current_urls_lst.append(current_url[0])
            except Exception as e:
                current_urls_lst.append(e)

    all_lsts, all_cols = zip_lists([
        latest_urls_lst, latest_dates_lst, current_urls_lst, current_dates_lst
    ])
    df = pd.DataFrame(all_lsts, columns=all_cols)
    df.to_csv(csv_output, index=False)
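A hypothetical invocation of update_csv(), assuming the input CSV has a url column and that dates holds datetime objects; get_latest_wayback() is not shown above, so the exact date format it expects is an assumption.

from datetime import datetime

# Pull the newest snapshot on or before 2020-01-01 for every url in links.csv,
# capture a fresh snapshot of each, and write the combined result to output.csv.
update_csv("links.csv", [datetime(2020, 1, 1)], snapshot=True, csv_output="output.csv")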
Example #14
    def parse(self, response):
        response_data = json.loads(response.body)
        total_records = response_data['iTotalRecords']

        for item in response_data['aaData']:
            band = Band()
            # Regex to extract the band id and name from the <a> tag
            match = re.search(r'<a href=".*/(\d+)">(.*)</a>.*', item[0])
            band['name'] = match.group(2)
            band['metalarchives_id'] = match.group(1)

            # Regex to extract the band URL from the <a> tag
            url = re.search('href="([^"]*)', item[0])
            band['url'] = url.group(1)
            band["wayback_link"] = savepagenow.capture_or_cache(
                band['url'], force_utf8=True)[0]
            self.fetched += 1
            yield band

        if self.fetched < total_records:
            url = self.start_urls[0] + '&iDisplayStart=%s' % self.fetched
            yield scrapy.Request(url, callback=self.parse)
Example #15
def archive_object(content_type_pk, object_pk):
    """
    Archive the provided object.
    """
    from .models import Memento

    # Get the object
    ct = ContentType.objects.get_for_id(content_type_pk)
    obj = ct.get_object_for_this_type(pk=object_pk)
    logger.debug("Archiving {}".format(obj))

    # Get the URL we're going to save
    archive_url = obj.get_archive_url()

    # Archive it
    ia_url, ia_captured = savepagenow.capture_or_cache(archive_url)

    # Save the archived URL
    logger.debug("Saving memento URL {}".format(ia_url))
    ia_memento = Memento.objects.create(content_type=ct,
                                        object_pk=obj.pk,
                                        url=ia_url)
    logger.debug("Created {}".format(ia_memento))
Example #16
import savepagenow
import archiveis
import time

# Add entries to a list of websites
# websitelist = ["www.example.com",
#                "www.example2.com",
#                ]

# Or create a text file called websitefile.txt
# and put a separate url on each line. Then
# save the text file to the same path as this
# python script.
#
# Open the text file with the list in read mode
websitefile = open("websitefile.txt", "r").read().splitlines()

# The loop below uses the text file method; to use the hard-coded
# list above instead, replace "websitefile" with "websitelist".
for k in websitefile:
    archive_url = savepagenow.capture_or_cache(k)
    print(archive_url)
    archiveis_url = archiveis.capture(k)
    print(archiveis_url)

    # Pause for a second between captures
    time.sleep(1)
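As written, a single failed capture raises out of the loop and stops the whole run. Below is a variant that skips over failures, using the WaybackRuntimeError the other examples here catch; it is a sketch layered on the script above, not part of the original.

for k in websitefile:
    try:
        archive_url, _ = savepagenow.capture_or_cache(k)
        print(archive_url)
        archiveis_url = archiveis.capture(k)
        print(archiveis_url)
    except savepagenow.api.WaybackRuntimeError as e:
        # Log the failure and move on to the next URL
        print("Could not archive {}: {}".format(k, e))

    # Pause for a second between captures
    time.sleep(1)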
Example #17
    ssht.has_crop = True
    ssht.save()

    # HTML screenshotting where it is turned on
    if site.has_html_screenshots:
        logger.info("Logging HTML for %s" % site.url)
        ssht.html = site.url
        ssht.has_html = True
        ssht.save()

    # Internet Archive mementos where they are turned on
    if site.has_internetarchive_mementos:
        logger.info("Adding archive.org memento for %s" % site.url)
        try:
            ia_memento, ia_created = savepagenow.capture_or_cache(
                site.url,
                user_agent="pastpages.org ([email protected])"
            )
            if ia_created:
                memento = Memento.objects.create(
                    site=site,
                    update=update,
                    archive='archive.org',
                    url=ia_memento,
                )
            else:
                logger.info("Internet Archive returned a cached memento")
        except Exception:
            logger.info("Adding Internet Archive memento failed")

    # webcitation mementos where they are turned on
    if site.has_webcitation_mementos:
Example #18
 def archive(self, url):
     page, is_not_cache = savepagenow.capture_or_cache(url)
     return page
Example #19
    # Create screenshot object with Internet Archive link
    ssht.internetarchive_id = item.identifier
    logger.debug("Setting internetarchive_id as {}".format(item.identifier))

    # Save again
    ssht.save()

    # Remove images from the local filesystem
    [os.remove(f) for f in files]

    # Internet Archive mementos where they are turned on
    if site.has_internetarchive_mementos:
        logger.info("Adding archive.org memento for %s" % site.url)
        try:
            ia_memento, ia_created = savepagenow.capture_or_cache(
                site.url,
                user_agent="pastpages.org ([email protected])"
            )
            if ia_created:
                memento = Memento.objects.create(
                    site=site,
                    update=update,
                    archive='archive.org',
                    url=ia_memento,
                )
            else:
                logger.info("Internet Archive returned a cached memento")
        except Exception:
            logger.info("Adding Internet Archive memento failed")

    # Archive.is mementos where they are turned on
    if site.has_archiveis_mementos: