Example #1
def get_archived(page_url, update_old=False):
    try:
        waybackpy_url_obj = waybackpy.Url(page_url, USER_AGENT)
        archive_url_near = waybackpy_url_obj.near(year=YEAR,
                                                  month=MONTH,
                                                  day=DAY)
    except waybackpy.exceptions.WaybackError as e:
        try:  # try again
            sleep(5)
            waybackpy_url_obj = waybackpy.Url(page_url, USER_AGENT)
            archive_url_near = waybackpy_url_obj.near(year=YEAR,
                                                      month=MONTH,
                                                      day=DAY)
        except waybackpy.exceptions.WaybackError as e:
            # print(e)
            print('  error in retrieving {} , using original url '.format(
                page_url))
            return page_url
    url_str = archive_url_near.archive_url
    if update_old:
        date = archive_url_near.timestamp
        if date < OLD_DATE:
            print('updating {} (archived {})'.format(url_str, date))
            archive_url_near = update_archive(waybackpy_url_obj)
            if archive_url_near is None:
                print('  could not save page {}'.format(page_url))
            else:
                url_str = archive_url_near.archive_url
                print('  updated to {}'.format(url_str))
    url_str = url_str.replace(':80', '', 1)
    return url_str
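get_archived() above relies on several module-level names that the snippet does not show: USER_AGENT, YEAR, MONTH, DAY, OLD_DATE and the update_archive() helper (see Example #9). A minimal sketch of that assumed setup, with purely illustrative values:

# Hypothetical setup assumed by get_archived(); values are illustrative only.
from datetime import datetime
from time import sleep

import waybackpy

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
YEAR, MONTH, DAY = 2020, 6, 1        # target date passed to near()
OLD_DATE = datetime(2019, 1, 1)      # archives older than this are refreshed

archived = get_archived("https://example.com", update_old=True)
print(archived)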
Example #2
def test_total_archives():
    time.sleep(10)
    if sys.version_info > (3, 6):
        target = waybackpy.Url(" https://google.com ", user_agent)
        assert target.total_archives() > 500000
    else:
        pass
    time.sleep(5)
    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
    assert target.total_archives() == 0
Example #3
def test_clean_url():
    time.sleep(10)
    test_url = " https://en.wikipedia.org/wiki/Network security "
    answer = "https://en.wikipedia.org/wiki/Network_security"
    target = waybackpy.Url(test_url, user_agent)
    test_result = target.clean_url()
    assert answer == test_result
Example #4
def pagehandler(pageurl, pageresponse, soup):
    archived_url = waybackpy.Url(pageurl, "Any-User-Agent").save()
    print(pageurl + ' Crawled')
    print("Crawling:" + pageurl +
          " ({0} bytes)".format(len(pageresponse.text)))
    # wordcount(soup) # display unique word counts
    return True
Example #5
def test_wayback_timestamp():
    ts = waybackpy.Url("https://www.google.com",
                       "UA").wayback_timestamp(year=2020,
                                               month=1,
                                               day=2,
                                               hour=3,
                                               minute=4)
    assert "202001020304" in str(ts)
Example #6
def test_get_response():
    hdr = {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
    }
    req = Request("https://www.google.com", headers=hdr)  # nosec
    response = waybackpy.Url("https://www.google.com", "UA").get_response(req)
    assert response.code == 200
Example #7
def test_save():
    # Test for urls that exist and can be archived.
    time.sleep(10)

    url_list = [
        "en.wikipedia.org", "www.wikidata.org", "commons.wikimedia.org",
        "www.wiktionary.org", "www.w3schools.com", "www.youtube.com"
    ]
    x = random.randint(0, len(url_list) - 1)
    url1 = url_list[x]
    target = waybackpy.Url(
        url1,
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36"
    )
    archived_url1 = target.save()
    assert url1 in archived_url1

    if sys.version_info > (3, 6):

        # Test for urls that are incorrect.
        with pytest.raises(Exception) as e_info:
            url2 = "ha ha ha ha"
            waybackpy.Url(url2, user_agent)
        time.sleep(5)
        # Test for urls that robots.txt does not allow to be archived.
        with pytest.raises(Exception) as e_info:
            url3 = "http://www.archive.is/faq.html"
            target = waybackpy.Url(
                url3,
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0"
            )
            target.save()

        time.sleep(5)
        # Test for non-existent urls.
        with pytest.raises(Exception) as e_info:
            url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
            target = waybackpy.Url(
                url4,
                "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27"
            )
            target.save()

    else:
        pass
Example #8
def archive_helper(url):
    wayback = waybackpy.Url(url, user_agent)
    # loop = asyncio.get_event_loop()
    # archive = await loop.run_in_executor(_executor, helpsave(wayback))

    # archive = await wayback.save()
    #
    archive = wayback.save()
    return archive
Example #9
def update_archive(waybackpy_url_obj):
    if isinstance(waybackpy_url_obj, str):
        waybackpy_url_obj = waybackpy.Url(waybackpy_url_obj, USER_AGENT)
    try:
        archive_obj = waybackpy_url_obj.save()
    except waybackpy.exceptions.WaybackError as e:
        print(e)
        return None
    return archive_obj
Example #10
 def _wayback_setup(self, url) -> waybackpy.Url:
     """
     Helper function for setting up the waybackpy object.
     :param url: URL to deal with
     :return: waybackpy object
     """
     try:
         return waybackpy.Url(url, self.USER_AGENT)
     except URLError as e:
         raise e
Example #11
def archive(url, targetdir=None):
    """Archives URL and saves information to data log in targetdir based on archive age limit.
    
            Parameters:
                url (str): String indicating where data exists.
                targetdir (str): String indicating where files' data log exists.
        
            Returns:
                archive_dict (dict): Dictionary with URL and timestamp of latest archive.
    """
    archive_dict = {'archive_url': None, 'archive_time': None}
    wayback_obj = waybackpy.Url(url=url, user_agent=user_agent)
    archive_age = len(wayback_obj.newest())
    print(archive_age)

    # Create new archive if age is greater than limit, else use most recent
    if archive_age > archive_age_limit:
        archive_dict['archive_url'] = wayback_obj.save().archive_url
        archive_dict['archive_time'] = datetime.utcnow()
        new_archive = 1
    else:
        archive_dict['archive_url'] = wayback_obj.newest().archive_url
        archive_dict['archive_time'] = wayback_obj.newest().timestamp
        new_archive = 0

    # Dict of data about archive
    d = {
        'URL': [url],
        'File': [url.rsplit('/', 1)[-1]],
        'Directory': [targetdir],
        'ArchiveURL': [archive_dict['archive_url']],
        'ArchiveTime': [archive_dict['archive_time']],
        'NewArchive': [new_archive]
    }

    # Add to or create data log
    try:
        data_log = pd.read_csv(f'{targetdir}_data_log.csv', index_col='LogID')
        data_log['ArchiveTime'] = [parse(x) for x in data_log['ArchiveTime']]
        d['LogID'] = data_log.index.values.max() + 1

        d = pd.DataFrame.from_dict(d)
        d.set_index('LogID', inplace=True)

        data_log = pd.concat([d, data_log]).drop_duplicates(keep='last')

    except Exception:  # no usable data log yet; start a new one
        d['LogID'] = 1
        data_log = pd.DataFrame(data=d)
        data_log.set_index('LogID', inplace=True)

    data_log.to_csv(f'{targetdir}_data_log.csv', index_label='LogID')

    return archive_dict
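archive() above likewise depends on names defined elsewhere in its module: a user_agent string, the archive_age_limit threshold, pandas imported as pd, and dateutil's parse. A minimal sketch of that assumed setup (values are illustrative, not from the original project):

# Hypothetical setup assumed by archive(); values are illustrative only.
from datetime import datetime

import pandas as pd
import waybackpy
from dateutil.parser import parse

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
archive_age_limit = 80  # compared against len(wayback_obj.newest()) above

info = archive("https://example.com/report.pdf", targetdir="data/reports")
print(info["archive_url"], info["archive_time"])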
Example #12
def test_near():
    time.sleep(10)
    url = "google.com"
    target = waybackpy.Url(url, "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4")
    archive_near_year = target.near(year=2010)
    assert "2010" in archive_near_year

    if sys.version_info > (3, 6):
        time.sleep(5)
        archive_near_month_year = target.near(year=2015, month=2)
        assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)
    
        target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
        archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
        assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)
    
        with pytest.raises(Exception) as e_info:
            NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
            target = waybackpy.Url(NeverArchivedUrl, user_agent)
            target.near(year=2010)
    else:
        pass
Example #13
def extract(url):
    log.info("extracting wayback data...")

    try:
        wayback = waybackpy.Url(url, USER_AGENT)
        oldest = wayback.oldest().timestamp.isoformat()
        newest = wayback.newest().timestamp.isoformat()
    except WaybackError:
        oldest = _get_oldest_wayback_timestamp(url)
        newest = _get_newest_wayback_timestamp(url)

    data = {}

    if oldest:
        data |= {"oldest-wayback-archive": oldest}

    if newest:
        data |= {"newest-wayback-archive": newest}

    return {"meta": data}
Example #14
def test_get():
    time.sleep(10)
    target = waybackpy.Url("google.com", user_agent)
    assert "Welcome to Google" in target.get(target.oldest())
                if "detalles" not in link:
                    index1 = link.index('//')
                    total_page = int(last_page_text[index2 + 1:])
                    University_Web.append(link)

        # Crawl rate (Avoiding saturating the web server)
        sleep(randint(3, 10))

# Scraping Historical data.
elif scraping_option == 1:
    print("Generating your csv file, please wait...")

    # Obtaining archive URLs from Wayback Machine.
    input_url = dict_url[int(url_option)]
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
    wayback = waybackpy.Url(input_url, user_agent)
    now = datetime.now()
    year_limit = now.year
    wayback_urls = []

    for i in range(2012, year_limit):
        archive = wayback.near(year=i, month=12)
        wayback_urls.append(archive.archive_url)

    # Variable Year.
    for i in range(2012, year_limit):
        for j in range(1, 101):
            Year.append(i)

    # Start of scraping
    for url in wayback_urls:
Example #16
def save_archive_of_webpage_in_wbm(url, idx, total, dry_run=False):
    '''
    saves a copy of the given URL in the Internet Archive wayback machine
    and returns the archive URL

    @param url - the url as a string
    @param idx - the download # we are on
    @param total - the total number of downloads
    @param dry_run - whether we should actually save it, or just log what we would do

    '''

    if dry_run:

        logger.info("[`%s/%s`] DRY RUN: - would have saved the url `%s` in the wayback machine", idx, total, url)
        return

    logger.info("[`%s/%s`] - saving an archive of the url `%s` in the wayback machine", idx, total, url)
    error_list = []
    archive_url = None

    # loop until we get a valid result, since the save often fails to return one,
    # probably because the request takes too long and is queued
    # also, do +1 because we start at index 1 here
    for iter_try_idx in range(1, constants.WAYBACK_ATTEMPT_MAX + 1):
        try:
            logger.debug("try `%s/%s` on url `%s`", iter_try_idx, constants.WAYBACK_ATTEMPT_MAX, url)
            wayback_handle_for_url = waybackpy.Url(url, constants.HTTP_USER_AGENT)

            logger.debug("calling save() on wayback handle for url: `%s`", repr(wayback_handle_for_url))

            archive = wayback_handle_for_url.save()
            archive_url = archive.archive_url
            logger.info("archive of url `%s` complete, url: `%s`", url, archive_url)
        except WaybackError as e:
            logger.debug(f"Got WaybackError when trying to save url `{url}`: `{e}`")
            error_list.append(e)

            logger.info("Got WaybackError, sleeping for `%s` seconds before trying again", constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            time.sleep(constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            continue
        except URLError as e:
            # don't continue here, this means we screwed up when providing
            # the url
            logger.debug(f"Got URLError when trying to save url `{url}`")
            raise e

        # check to see its not a garbage 'hashflags' url
        # for some reason, the wayback machine when given a twitter url has a chance to return a result that
        # isn't the page that is requested, but instead is a url of the form
        # `https://web.archive.org/web/20210115010042/https://pbs.twimg.com/hashflag/config-2021-01-15-01.json`
        # if we get this, we should throw an exception so we don't accidentally use this wayback machine URL as a real one
        logger.debug("checking the returned archive url `%s` against the hashflags JSON regex: `%s`", archive_url, constants.TWITTER_HASHFLAGS_REGEX)
        hashflags_re_result = constants.TWITTER_HASHFLAGS_REGEX.search(archive_url)
        logger.debug("regex result: `%s`", hashflags_re_result)

        if hashflags_re_result:
            logger.error("Hashflasgs regex `%s matched the returned archive url `%s` ," +
                " this means that the archive was corrupted and shouldn't be used, sleeping for 30 minutes",
                constants.TWITTER_HASHFLAGS_REGEX, archive_url)
            archive_url = None
            time.sleep(30 * 60)

            continue

        else:
            logger.debug("archive_url `%s` did not match the regex `%s`, returning",
                archive_url, constants.TWITTER_HASHFLAGS_REGEX)
            return archive_url

    # if we get here, then we ran out of tries
    raise Exception(f"did not get a good result when trying to save the url `{url}` in the wayback machine, errors: `{error_list}`")
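save_archive_of_webpage_in_wbm() assumes a local constants module; only four attributes of it are actually used. A minimal hypothetical stand-in, with guessed values:

# constants.py -- hypothetical stand-in for the module the snippet imports.
import re

HTTP_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
WAYBACK_ATTEMPT_MAX = 5                      # retries per URL
WAYBACK_MACHINE_BACKOFF_TIME_SECONDS = 60    # wait between retries
# Matches the stray 'hashflag' JSON archives the snippet rejects.
TWITTER_HASHFLAGS_REGEX = re.compile(r"pbs\.twimg\.com/hashflag/")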
Example #17
def review_file(inpage: pywikibot.page.BasePage,
                throttle: Optional[utils.Throttle] = None) -> Optional[bool]:
    """Performs a license review on the input page

    inpage must be in the file namespace.

    Returns None if the file was skipped
    Returns False if there was an error during review
    Returns True if the file was successfully reviewed (pass or fail)
    """
    try:
        page = pywikibot.FilePage(inpage)
    except ValueError:
        return None
    logger.info(f"Checking {page.title(as_link=True)}")

    utils.check_runpage(site, run_override)
    if not check_can_run(page):
        return None

    raw_obs_id, raw_photo_id = find_ina_id(page)
    logger.info(f"ID found in wikitext: {raw_obs_id} {raw_photo_id}")
    if raw_photo_id and not raw_obs_id:
        raw_obs_id = get_observation_from_photo(raw_photo_id)

    if not raw_obs_id:
        logger.info("No observation ID could be found")
        update_review(page, status="error", reason="url", throttle=throttle)
        return False

    ina_throttle = utils.Throttle(10)
    ina_data = get_ina_data(raw_obs_id, ina_throttle)

    if not ina_data:
        logger.warning("No data retrieved from iNaturalist!")
        update_review(page, status="error", reason="nodata", throttle=throttle)
        return False

    photo_id, found = find_photo_in_obs(page, raw_obs_id, ina_data,
                                        raw_photo_id, ina_throttle)
    if photo_id is None:
        logger.info(f"Images did not match: {found}")
        update_review(page, status="error", reason=found, throttle=throttle)
        return False
    else:
        assert isinstance(photo_id, iNaturalistID)

    ina_license = find_ina_license(ina_data, photo_id)
    logger.debug(f"iNaturalist License: {ina_license}")
    ina_author = find_ina_author(ina_data)
    logger.debug(f"Author: {ina_author}")

    com_license = find_com_license(page)
    logger.debug(f"Commons License: {com_license}")
    status = check_licenses(ina_license, com_license)

    if status == "fail":
        is_old = file_is_old(page)
    else:
        is_old = False

    if config["use_wayback"] and status in ("pass", "pass-change"):
        archive = waybackpy.Url(str(photo_id), user_agent).save()
    else:
        archive = ""

    reviewed = update_review(
        page,
        photo_id,
        status=status,
        author=ina_author,
        review_license=ina_license,
        upload_license=com_license,
        reason=found,
        is_old=is_old,
        throttle=throttle,
        archive=archive,
    )
    if status == "fail" and reviewed:
        fail_warning(page, ina_license, is_old)

    return reviewed
Example #18
 def Alljson(row):
     name = row['url']
     return waybackpy.Url(name, user_agent).JSON
Example #19
 def newestCache(row):
     name = row['url']
     return waybackpy.Url(name, user_agent).newest()
Example #20
 def near(row):
     name = row['url']
     return waybackpy.Url(name, user_agent).near(year=year,
                                                 month=month,
                                                 day=day)
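Examples #18-#20 (and #25 below) are small per-row helpers intended to be applied to a pandas DataFrame with a 'url' column; the user_agent, year, month and day globals they reference must already be defined. An illustrative use, with assumed values:

# Hypothetical usage of the row helpers above; column name and values are assumptions.
import pandas as pd
import waybackpy

user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
year, month, day = 2020, 1, 15

df = pd.DataFrame({"url": ["https://www.python.org", "https://www.wikipedia.org"]})
df["near_archive"] = df.apply(near, axis=1)    # uses near() from Example #20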
Example #21
def test_oldest():
    time.sleep(10)
    url = "github.com/akamhy/waybackpy"
    target = waybackpy.Url(url, user_agent)
    assert "20200504141153" in target.oldest()
Example #22
def get_wayback_item(url) -> Union[waybackpy.Url, None]:
    try:
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
        return waybackpy.Url(url, user_agent)
    except Exception:
        return None
Example #23
def test_url_check():
    time.sleep(10)
    broken_url = "http://wwwgooglecom/"
    with pytest.raises(Exception) as e_info:
        waybackpy.Url(broken_url, user_agent)
Example #24
spamwriter = csv.writer(csvfile,
                        dialect='excel',
                        delimiter=';',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)

urllist = loadtxt(INFILE,
                  dtype=str,
                  comments="#",
                  delimiter="\n",
                  encoding='utf-8')
for url in urllist:
    time.sleep(1)  # pause for 1 second so as not to over-request the server.
    # Choose a longer pause when archiving PDF and DOC files, as the archiving
    # software needs more time to read these than standard HTML pages.

    wayback = waybackpy.Url(url, USER_AGENT)

    # MODE 1: Code for saving url/page
    try:
        archive = wayback.save()
        print(url, archive.archive_url)
        spamwriter.writerow([url, archive.archive_url])
    except WaybackError as e:
        print(url, "WaybackError - could not save page to WBM")
        spamwriter.writerow([url, "WaybackError - could not save page to IA"])
    except AttributeError as d:
        print(url, "AttributeError - could not save page to IA")
        spamwriter.writerow(
            [url, "AttributeError - could not save page to WBM"])

# MODE 2: Code for retrieving latest/newest version of page
Example #25
 def totalArchivesCount(row):
     name = row['url']
     return waybackpy.Url(name, user_agent).total_archives()
Example #26
def test_newest():
    time.sleep(10)
    url = "github.com/akamhy/waybackpy"
    target = waybackpy.Url(url, user_agent)
    assert url in target.newest()
Example #27
                        if len(raw) > 0:
                            html = str(raw[0]['content'])
                            res['outline_html'] = html
                        att = soup.find_all('img', {'class': 'logo'})
                        if len(att) > 0:
                            img = str(att[0]['src'])
                            res['outline_img'] = img

                    except Exception as e:
                        print(f"OUTLINE ERROR: {str(e)}")
                        # print(driver.save_screenshot("ERROR.png"))
                        pass

                    # Wayback Machine
                    print("WAYBACK MACHINE")
                    wayback = waybackpy.Url(res['external_link'])
                    try:
                        res['archived_url'] = wayback.newest().archive_url
                    except:
                        print("SAVING ON WAYBACK")
                        wayback.save()
                        res['archived_url'] = wayback.newest().archive_url

                    # Newspaper Metadata
                    print("NEWSFETCH")
                    n = newspaper(res['external_link']).get_dict
                    res['headline'] = n['headline']
                    res['summary'] = n['summary']
                    res['article'] = n['article']
                    res['description'] = n['description']
                    res['publication'] = n['publication']