def get_archived(page_url, update_old=False):
    """Return a Wayback Machine URL for page_url near YEAR/MONTH/DAY.

    :param page_url: URL to look up in the Wayback Machine.
    :param update_old: when True, re-save the page if the snapshot found
        is older than OLD_DATE.
    :return: archive URL string (with the first ':80' port artifact
        stripped), or page_url itself if no snapshot could be retrieved.
    """
    waybackpy_url_obj = None
    archive_url_near = None
    # The availability API is flaky: try twice, pausing between attempts.
    # (The original duplicated this whole try/except block verbatim.)
    for attempt in range(2):
        try:
            waybackpy_url_obj = waybackpy.Url(page_url, USER_AGENT)
            archive_url_near = waybackpy_url_obj.near(year=YEAR, month=MONTH, day=DAY)
            break
        except waybackpy.exceptions.WaybackError:
            if attempt == 0:
                sleep(5)
    if archive_url_near is None:
        print(' error in retrieving {} , using original url '.format(page_url))
        return page_url

    url_str = archive_url_near.archive_url
    if update_old:
        date = archive_url_near.timestamp
        if date < OLD_DATE:
            # BUGFIX: the original format string had one placeholder but was
            # passed two arguments; the snapshot date was silently dropped.
            print('updating {} (snapshot from {})'.format(url_str, date))
            archive_url_near = update_archive(waybackpy_url_obj)
            if archive_url_near is None:
                print(' could not save page {}'.format(page_url))
            else:
                url_str = archive_url_near.archive_url
                print(' updated to {}'.format(url_str))
    # Some archive URLs carry a spurious ':80' port; remove the first occurrence.
    url_str = url_str.replace(':80', '', 1)
    return url_str
def test_total_archives():
    """Google has a huge snapshot count; a garbage domain has none."""
    time.sleep(10)
    if sys.version_info > (3, 6):
        popular = waybackpy.Url(" https://google.com ", user_agent)
        assert popular.total_archives() > 500000
    time.sleep(5)
    nonexistent = waybackpy.Url(
        " https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ",
        user_agent,
    )
    assert nonexistent.total_archives() == 0
def test_clean_url():
    """clean_url() strips surrounding whitespace and replaces inner spaces."""
    time.sleep(10)
    raw = " https://en.wikipedia.org/wiki/Network security "
    expected = "https://en.wikipedia.org/wiki/Network_security"
    cleaned = waybackpy.Url(raw, user_agent).clean_url()
    assert cleaned == expected
def pagehandler(pageurl, pageresponse, soup):
    """Archive the crawled page in the Wayback Machine and log progress.

    :param pageurl: URL of the page just crawled.
    :param pageresponse: HTTP response object (must expose .text).
    :param soup: parsed document (currently unused here).
    :return: True so the crawler continues.
    """
    # Save a snapshot; the returned archive URL is not needed here, so the
    # unused local `archived_url` from the original was dropped.
    waybackpy.Url(pageurl, "Any-User-Agent").save()
    print(pageurl + ' Crawled')
    print("Crawling:" + pageurl + " ({0} bytes)".format(len(pageresponse.text)))
    # wordcount(soup)  # display unique word counts
    return True
def test_wayback_timestamp():
    """wayback_timestamp() packs the date parts into YYYYMMDDHHMM form."""
    stamp = waybackpy.Url("https://www.google.com", "UA").wayback_timestamp(
        year=2020, month=1, day=2, hour=3, minute=4
    )
    assert "202001020304" in str(stamp)
def test_get_response():
    """get_response() returns an HTTP 200 response for a reachable URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
    }
    request = Request("https://www.google.com", headers=headers)  # nosec
    result = waybackpy.Url("https://www.google.com", "UA").get_response(request)
    assert result.code == 200
def test_save():
    """Exercise save(): good URLs archive; malformed, robots-blocked and
    non-existent URLs raise."""
    # Test for urls that exist and can be archived.
    time.sleep(10)
    url_list = [
        "en.wikipedia.org",
        "www.wikidata.org",
        "commons.wikimedia.org",
        "www.wiktionary.org",
        "www.w3schools.com",
        "www.youtube.com",
    ]
    url1 = random.choice(url_list)
    target = waybackpy.Url(
        url1,
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    )
    archived_url1 = target.save()
    assert url1 in archived_url1

    if sys.version_info > (3, 6):
        # Test for urls that are incorrect.
        with pytest.raises(Exception):
            waybackpy.Url("ha ha ha ha", user_agent)
        time.sleep(5)
        # Test for urls not allowed to archive by robot.txt.
        with pytest.raises(Exception):
            url3 = "http://www.archive.is/faq.html"
            waybackpy.Url(
                url3,
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
            ).save()
        time.sleep(5)
        # Non existent urls, test
        with pytest.raises(Exception):
            # BUGFIX: the original passed url3 (the robots.txt case) to
            # waybackpy.Url here instead of url4, so the non-existent-domain
            # branch never actually tested the intended URL.
            url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
            waybackpy.Url(
                url4,
                "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
            ).save()
def archive_helper(url):
    """Save url in the Wayback Machine and return the archive result.

    BUGFIX: the original returned `archive` without ever assigning it — every
    candidate assignment (asyncio executor variants and the plain save) was
    commented out — so calling it raised NameError. Restored the plain
    synchronous save, which the commented-out code clearly intended.
    """
    wayback = waybackpy.Url(url, user_agent)
    archive = wayback.save()
    return archive
def update_archive(waybackpy_url_obj):
    """Save a fresh Wayback Machine snapshot for the given Url object.

    Accepts either a waybackpy.Url or a plain URL string (which is wrapped
    first). Returns the new archive object, or None when the save fails.
    """
    if isinstance(waybackpy_url_obj, str):
        waybackpy_url_obj = waybackpy.Url(waybackpy_url_obj, USER_AGENT)
    try:
        return waybackpy_url_obj.save()
    except waybackpy.exceptions.WaybackError as err:
        print(err)
        return None
def _wayback_setup(self, url) -> waybackpy.Url:
    """
    Helper function for setting up the waybackpy object.

    :param url: URL to deal with
    :return: waybackpy object
    :raises URLError: propagated unchanged from waybackpy.Url
    """
    # The original wrapped this call in `except URLError as e: raise e`,
    # a no-op catch-and-reraise; letting the exception propagate is identical.
    return waybackpy.Url(url, self.USER_AGENT)
def archive(url, targetdir=None):
    """Archives URL and saves information to data log in targetdir based on
    archive age limit.

    Parameters:
        url (str): String indicating where data exists.
        targetdir (str): String indicating where files' data log exists.

    Returns:
        archive_dict (dict): Dictionary with URL and timestamp of latest
            archive.
    """
    archive_dict = {'archive_url': None, 'archive_time': None}
    wayback_obj = waybackpy.Url(url=url, user_agent=user_agent)
    # Hoisted: newest() was called up to three times in the original.
    # NOTE(review): len() on the newest() result is assumed to measure
    # archive age — confirm against the waybackpy version in use.
    newest = wayback_obj.newest()
    archive_age = len(newest)
    print(archive_age)

    # Create new archive if age is greater than limit, else use most recent
    if archive_age > archive_age_limit:
        archive_dict['archive_url'] = wayback_obj.save().archive_url
        archive_dict['archive_time'] = datetime.utcnow()
        new_archive = 1
    else:
        archive_dict['archive_url'] = newest.archive_url
        archive_dict['archive_time'] = newest.timestamp
        new_archive = 0

    # Dict of data about archive
    d = {
        'URL': [url],
        'File': [url.rsplit('/', 1)[-1]],
        'Directory': [targetdir],
        'ArchiveURL': [archive_dict['archive_url']],
        'ArchiveTime': [archive_dict['archive_time']],
        'NewArchive': [new_archive],
    }

    # BUGFIX: the original used f'{targetdir}+_data_log.csv', leaving a
    # literal '+' in the filename — an artifact of converting string
    # concatenation to an f-string.
    log_path = f'{targetdir}_data_log.csv'

    # Add to or create data log
    try:
        data_log = pd.read_csv(log_path, index_col='LogID')
        data_log['ArchiveTime'] = [parse(x) for x in data_log['ArchiveTime']]
        d['LogID'] = data_log.index.values.max() + 1
        d = pd.DataFrame.from_dict(d)
        d.set_index('LogID', inplace=True)
        data_log = pd.concat([d, data_log]).drop_duplicates(keep='last')
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt are
        # not swallowed. Any read/parse failure starts a fresh log.
        d['LogID'] = 1
        data_log = pd.DataFrame(data=d)
        data_log.set_index('LogID', inplace=True)
    data_log.to_csv(log_path, index_label='LogID')

    return archive_dict
def test_near():
    """near() should land within a small window of the requested date."""
    time.sleep(10)
    safari_ua = "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    target = waybackpy.Url("google.com", safari_ua)
    assert "2010" in target.near(year=2010)
    if sys.version_info > (3, 6):
        time.sleep(5)
        near_month = target.near(year=2015, month=2)
        assert any(ts in near_month for ts in ("201502", "201501", "201503"))
        edge_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
        target = waybackpy.Url("www.python.org", edge_ua)
        near_hour = target.near(year=2008, month=5, day=9, hour=15)
        assert any(ts in near_hour for ts in ("2008050915", "2008050914", "2008050913"))
        with pytest.raises(Exception):
            never_archived = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
            waybackpy.Url(never_archived, user_agent).near(year=2010)
def extract(url):
    """Collect the oldest and newest Wayback Machine timestamps for url.

    Falls back to the module's helper lookups when the waybackpy call
    raises WaybackError. Only non-empty values are included in the result.
    """
    log.info("extracting wayback data...")
    try:
        wb = waybackpy.Url(url, USER_AGENT)
        oldest = wb.oldest().timestamp.isoformat()
        newest = wb.newest().timestamp.isoformat()
    except WaybackError:
        oldest = _get_oldest_wayback_timestamp(url)
        newest = _get_newest_wayback_timestamp(url)
    meta = {}
    if oldest:
        meta["oldest-wayback-archive"] = oldest
    if newest:
        meta["newest-wayback-archive"] = newest
    return {"meta": meta}
def test_get():
    """get() on the oldest snapshot of google.com should contain the greeting."""
    time.sleep(10)
    wb = waybackpy.Url("google.com", user_agent)
    oldest_page = wb.get(wb.oldest())
    assert "Welcome to Google" in oldest_page
if "detalles" not in link: index1 = link.index('//') total_page = int(last_page_text[index2 + 1:]) University_Web.append(link) # Crawl rate (Avoiding saturating the web server) sleep(randint(3, 10)) # Scraping Historical data. elif scraping_option == 1: print("Generating your csv file, please wait...") # Obtaining archive URLs from Wayback Machine. input_url = dict_url[int(url_option)] user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" wayback = waybackpy.Url(input_url, user_agent) now = datetime.now() year_limit = now.year wayback_urls = [] for i in range(2012, year_limit): archive = wayback.near(year=i, month=12) wayback_urls.append(archive.archive_url) # Variable Year. for i in range(2012, year_limit): for j in range(1, 101): Year.append(i) # Start of scraping for url in wayback_urls:
def save_archive_of_webpage_in_wbm(url, idx, total, dry_run=False):
    '''
    saves a copy of the given URL in the Internet Archive wayback machine
    and returns the archive URL (or None is never returned: on total failure
    an Exception is raised instead)

    @param url - the url as a string
    @param idx - the download # we are on (for log messages)
    @param total - the total downloads (for log messages)
    @param dry_run - whether we should actually save it, or just log what we would do
    '''
    if dry_run:
        logger.info("[`%s/%s`] DRY RUN: - would have saved the url `%s` in the wayback machine", idx, total, url)
        return
    logger.info("[`%s/%s`] - saving an archive of the url `%s` in the wayback machine", idx, total, url)
    error_list = []
    archive_url = None
    # loop until we get a valid result since it seems to not return a result a lot of
    # the time , probably because it takes too long and is queued?
    # also, do +1 because we start at index 1 here
    for iter_try_idx in range(1, constants.WAYBACK_ATTEMPT_MAX + 1):
        try:
            logger.debug("try `%s/%s` on url `%s`", iter_try_idx, constants.WAYBACK_ATTEMPT_MAX, url)
            wayback_handle_for_url = waybackpy.Url(url, constants.HTTP_USER_AGENT)
            logger.debug("calling save() on wayback handle for url: `%s`", repr(wayback_handle_for_url))
            archive = wayback_handle_for_url.save()
            archive_url = archive.archive_url
            logger.info("archive of url `%s` complete, url: `%s`", url, archive_url)
        except WaybackError as e:
            # transient failure: remember the error, back off, and retry
            logger.debug(f"Got WaybackError when trying to save url `{url}`: `{e}`")
            error_list.append(e)
            logger.info("Got WaybackError, sleeping for `%s` seconds before trying again",
                constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            time.sleep(constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            continue
        except URLError as e:
            # don't continue here, this means we screwed up when providing
            # the url
            logger.debug(f"Got URLError when trying to save url `{url}`")
            raise e
        # check to see its not a garbage 'hashflags' url
        # for some reason, the wayback machine when given a twitter url has a chance to return a result that
        # isn't the page that is requested, but instead is a url of the form
        # `https://web.archive.org/web/20210115010042/https://pbs.twimg.com/hashflag/config-2021-01-15-01.json`
        # if we get this, we should throw an exception so we don't accidentally use this wayback machine URL as a real one
        logger.debug("checking the returned archive url `%s` against the hashflags JSON regex: `%s`",
            archive_url, constants.TWITTER_HASHFLAGS_REGEX)
        hashflags_re_result = constants.TWITTER_HASHFLAGS_REGEX.search(archive_url)
        logger.debug("regex result: `%s`", hashflags_re_result)
        if hashflags_re_result:
            # corrupted archive: discard it, wait a long time, and retry
            logger.error("Hashflasgs regex `%s matched the returned archive url `%s` ," +
                " this means that the archive was corrupted and shouldn't be used, sleeping for 30 minutes",
                constants.TWITTER_HASHFLAGS_REGEX, archive_url)
            archive_url = None
            time.sleep(30 * 60)
            continue
        else:
            logger.debug("archive_url `%s` did not match the regex `%s`, returning",
                archive_url, constants.TWITTER_HASHFLAGS_REGEX)
            return archive_url
    # if we get here, then we ran out of tries
    raise Exception(f"did not get a good result when trying to save the url `{url}` in the wayback machine, errors: `{error_list}`")
def review_file(inpage: pywikibot.page.BasePage,
                throttle: Optional[utils.Throttle] = None) -> Optional[bool]:
    """Performs a license review on the input page

    inpage must be in the file namespace.

    Returns None if the file was skipped
    Returns False if there was an error during review
    Returns True if the file was successfully reviewed (pass or fail)
    """
    try:
        page = pywikibot.FilePage(inpage)
    except ValueError:
        # not a file page — skip
        return None
    logger.info(f"Checking {page.title(as_link=True)}")
    utils.check_runpage(site, run_override)
    if not check_can_run(page):
        return None
    raw_obs_id, raw_photo_id = find_ina_id(page)
    logger.info(f"ID found in wikitext: {raw_obs_id} {raw_photo_id}")
    if raw_photo_id and not raw_obs_id:
        # only a photo ID present: resolve it to its observation
        raw_obs_id = get_observation_from_photo(raw_photo_id)
    if not raw_obs_id:
        logger.info("No observation ID could be found")
        update_review(page, status="error", reason="url", throttle=throttle)
        return False
    ina_throttle = utils.Throttle(10)
    ina_data = get_ina_data(raw_obs_id, ina_throttle)
    if not ina_data:
        logger.warning("No data retrieved from iNaturalist!")
        update_review(page, status="error", reason="nodata", throttle=throttle)
        return False
    photo_id, found = find_photo_in_obs(page, raw_obs_id, ina_data,
                                        raw_photo_id, ina_throttle)
    if photo_id is None:
        logger.info(f"Images did not match: {found}")
        update_review(page, status="error", reason=found, throttle=throttle)
        return False
    else:
        assert isinstance(photo_id, iNaturalistID)
    ina_license = find_ina_license(ina_data, photo_id)
    logger.debug(f"iNaturalist License: {ina_license}")
    ina_author = find_ina_author(ina_data)
    logger.debug(f"Author: {ina_author}")
    com_license = find_com_license(page)
    logger.debug(f"Commons License: {com_license}")
    status = check_licenses(ina_license, com_license)
    # file age only matters for failed reviews
    if status == "fail":
        is_old = file_is_old(page)
    else:
        is_old = False
    # archive the source photo page only for passing reviews, when enabled
    if config["use_wayback"] and status in ("pass", "pass-change"):
        archive = waybackpy.Url(str(photo_id), user_agent).save()
    else:
        archive = ""
    reviewed = update_review(
        page,
        photo_id,
        status=status,
        author=ina_author,
        review_license=ina_license,
        upload_license=com_license,
        reason=found,
        is_old=is_old,
        throttle=throttle,
        archive=archive,
    )
    if status == "fail" and reviewed:
        fail_warning(page, ina_license, is_old)
    return reviewed
def Alljson(row):
    """Return the waybackpy JSON metadata for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).JSON
def newestCache(row):
    """Return the newest Wayback Machine snapshot for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).newest()
def near(row):
    """Return the snapshot closest to the module-level year/month/day for this row's URL."""
    return waybackpy.Url(row['url'], user_agent).near(year=year, month=month, day=day)
def test_oldest():
    """oldest() should report the repository's first-ever snapshot timestamp."""
    time.sleep(10)
    repo = waybackpy.Url("github.com/akamhy/waybackpy", user_agent)
    assert "20200504141153" in repo.oldest()
def get_wayback_item(url) -> Union[waybackpy.Url, None]:
    """Build a waybackpy.Url handle for url, or None if construction fails.

    :param url: URL string to wrap.
    :return: waybackpy.Url instance, or None on error.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
    try:
        return waybackpy.Url(url, user_agent)
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; the best-effort None return is kept for real errors.
        return None
def test_url_check():
    """Constructing a Url from a malformed address should raise."""
    time.sleep(10)
    with pytest.raises(Exception):
        waybackpy.Url("http://wwwgooglecom/", user_agent)
# Write results as a semicolon-separated CSV (Excel dialect).
spamwriter = csv.writer(csvfile, dialect='excel', delimiter=';',
                        quotechar='"', quoting=csv.QUOTE_MINIMAL)
# One URL per line in INFILE; lines starting with '#' are comments.
urllist = loadtxt(INFILE, dtype=str, comments="#", delimiter="\n", encoding='utf-8')
for url in urllist:
    # pause for 1 sec, not to overrequest the server.
    # Choose longer times for archiving pdf and doc files, as the archiving
    # software needs more time to read these docs than standard html pages
    time.sleep(1)
    wayback = waybackpy.Url(url, USER_AGENT)
    # MODE 1: Code for saving url/page
    try:
        archive = wayback.save()
        print(url, archive.archive_url)
        spamwriter.writerow([url, archive.archive_url])
    except WaybackError as e:
        # Wayback Machine refused or failed the save; record the failure.
        print(url, "WaybackError - could not save page to WBM")
        spamwriter.writerow([url, "WaybackError - could not save page to IA"])
    except AttributeError as d:
        # save() returned an object without the expected attribute.
        print(url, "AttributeError - could not save page to IA")
        spamwriter.writerow(
            [url, "AttributeError - could not save page to WBM"])
# MODE 2: Code for retrieving latest/newest version of page
def totalArchivesCount(row):
    """Return the total number of Wayback snapshots for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).total_archives()
def test_newest():
    """newest() result should reference the original URL."""
    time.sleep(10)
    repo_url = "github.com/akamhy/waybackpy"
    assert repo_url in waybackpy.Url(repo_url, user_agent).newest()
if len(raw) > 0: html = str(raw[0]['content']) res['outline_html'] = html att = soup.find_all('img', {'class': 'logo'}) if len(att) > 0: img = str(att[0]['src']) res['outline_img'] = img except Exception as e: print(f"OUTLINE ERROR: {str(e)}") # print(driver.save_screenshot("ERROR.png")) pass # Wayback Machine print("WAYBACK MACHINE") wayback = waybackpy.Url(res['external_link']) try: res['archived_url'] = wayback.newest().archive_url except: print("SAVING ON WAYBACK") wayback.save() res['archived_url'] = wayback.newest().archive_url # Newspape Metadata print("NEWSFETCH") n = newspaper(res['external_link']).get_dict res['headline'] = n['headline'] res['summary'] = n['summary'] res['article'] = n['article'] res['description'] = n['description'] res['publication'] = n['publication']