def get_archived(page_url, update_old=False):
    """Return a Wayback Machine URL for page_url near YEAR/MONTH/DAY.

    :param page_url: URL to look up in the Wayback Machine.
    :param update_old: when True, re-save the page if the snapshot found
        is older than OLD_DATE.
    :return: archive URL string (with the first ':80' port artifact
        stripped), or page_url itself if no snapshot could be retrieved.
    """
    waybackpy_url_obj = None
    archive_url_near = None
    # The availability API is flaky: try twice, pausing between attempts.
    # (The original duplicated this whole try/except block verbatim.)
    for attempt in range(2):
        try:
            waybackpy_url_obj = waybackpy.Url(page_url, USER_AGENT)
            archive_url_near = waybackpy_url_obj.near(year=YEAR, month=MONTH, day=DAY)
            break
        except waybackpy.exceptions.WaybackError:
            if attempt == 0:
                sleep(5)
    if archive_url_near is None:
        print(' error in retrieving {} , using original url '.format(page_url))
        return page_url

    url_str = archive_url_near.archive_url
    if update_old:
        date = archive_url_near.timestamp
        if date < OLD_DATE:
            # BUGFIX: the original format string had one placeholder but was
            # passed two arguments; the snapshot date was silently dropped.
            print('updating {} (snapshot from {})'.format(url_str, date))
            archive_url_near = update_archive(waybackpy_url_obj)
            if archive_url_near is None:
                print(' could not save page {}'.format(page_url))
            else:
                url_str = archive_url_near.archive_url
                print(' updated to {}'.format(url_str))
    # Some archive URLs carry a spurious ':80' port; remove the first occurrence.
    url_str = url_str.replace(':80', '', 1)
    return url_str
def test_total_archives():
    """Google has a huge snapshot count; a garbage domain has none."""
    time.sleep(10)
    if sys.version_info > (3, 6):
        popular = waybackpy.Url(" https://google.com ", user_agent)
        assert popular.total_archives() > 500000
    time.sleep(5)
    nonexistent = waybackpy.Url(
        " https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ",
        user_agent,
    )
    assert nonexistent.total_archives() == 0
def test_clean_url():
    """clean_url() strips surrounding whitespace and replaces inner spaces."""
    time.sleep(10)
    raw = " https://en.wikipedia.org/wiki/Network security "
    expected = "https://en.wikipedia.org/wiki/Network_security"
    cleaned = waybackpy.Url(raw, user_agent).clean_url()
    assert cleaned == expected
def pagehandler(pageurl, pageresponse, soup):
    """Archive the crawled page in the Wayback Machine and log progress.

    :param pageurl: URL of the page just crawled.
    :param pageresponse: HTTP response object (must expose .text).
    :param soup: parsed document (currently unused here).
    :return: True so the crawler continues.
    """
    # Save a snapshot; the returned archive URL is not needed here, so the
    # unused local `archived_url` from the original was dropped.
    waybackpy.Url(pageurl, "Any-User-Agent").save()
    print(pageurl + ' Crawled')
    print("Crawling:" + pageurl + " ({0} bytes)".format(len(pageresponse.text)))
    # wordcount(soup)  # display unique word counts
    return True
def test_wayback_timestamp():
    """wayback_timestamp() packs the date parts into YYYYMMDDHHMM form."""
    stamp = waybackpy.Url("https://www.google.com", "UA").wayback_timestamp(
        year=2020, month=1, day=2, hour=3, minute=4
    )
    assert "202001020304" in str(stamp)
def test_get_response():
    """get_response() returns an HTTP 200 response for a reachable URL."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
    }
    request = Request("https://www.google.com", headers=headers)  # nosec
    result = waybackpy.Url("https://www.google.com", "UA").get_response(request)
    assert result.code == 200
def test_save():
    """Exercise save(): good URLs archive; malformed, robots-blocked and
    non-existent URLs raise."""
    # Test for urls that exist and can be archived.
    time.sleep(10)
    url_list = [
        "en.wikipedia.org",
        "www.wikidata.org",
        "commons.wikimedia.org",
        "www.wiktionary.org",
        "www.w3schools.com",
        "www.youtube.com",
    ]
    url1 = random.choice(url_list)
    target = waybackpy.Url(
        url1,
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    )
    archived_url1 = target.save()
    assert url1 in archived_url1

    if sys.version_info > (3, 6):
        # Test for urls that are incorrect.
        with pytest.raises(Exception):
            waybackpy.Url("ha ha ha ha", user_agent)
        time.sleep(5)
        # Test for urls not allowed to archive by robot.txt.
        with pytest.raises(Exception):
            url3 = "http://www.archive.is/faq.html"
            waybackpy.Url(
                url3,
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
            ).save()
        time.sleep(5)
        # Non existent urls, test
        with pytest.raises(Exception):
            # BUGFIX: the original passed url3 (the robots.txt case) to
            # waybackpy.Url here instead of url4, so the non-existent-domain
            # branch never actually tested the intended URL.
            url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
            waybackpy.Url(
                url4,
                "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
            ).save()
def archive_helper(url):
    """Save url in the Wayback Machine and return the archive result.

    BUGFIX: the original returned `archive` without ever assigning it — every
    candidate assignment (asyncio executor variants and the plain save) was
    commented out — so calling it raised NameError. Restored the plain
    synchronous save, which the commented-out code clearly intended.
    """
    wayback = waybackpy.Url(url, user_agent)
    archive = wayback.save()
    return archive
def update_archive(waybackpy_url_obj):
    """Save a fresh Wayback Machine snapshot for the given Url object.

    Accepts either a waybackpy.Url or a plain URL string (which is wrapped
    first). Returns the new archive object, or None when the save fails.
    """
    if isinstance(waybackpy_url_obj, str):
        waybackpy_url_obj = waybackpy.Url(waybackpy_url_obj, USER_AGENT)
    try:
        return waybackpy_url_obj.save()
    except waybackpy.exceptions.WaybackError as err:
        print(err)
        return None
def _wayback_setup(self, url) -> waybackpy.Url:
    """
    Helper function for setting up the waybackpy object.

    :param url: URL to deal with
    :return: waybackpy object
    :raises URLError: propagated unchanged from waybackpy.Url
    """
    # The original wrapped this call in `except URLError as e: raise e`,
    # a no-op catch-and-reraise; letting the exception propagate is identical.
    return waybackpy.Url(url, self.USER_AGENT)
def archive(url, targetdir=None):
    """Archives URL and saves information to data log in targetdir based on
    archive age limit.

    Parameters:
        url (str): String indicating where data exists.
        targetdir (str): String indicating where files' data log exists.

    Returns:
        archive_dict (dict): Dictionary with URL and timestamp of latest
            archive.
    """
    archive_dict = {'archive_url': None, 'archive_time': None}
    wayback_obj = waybackpy.Url(url=url, user_agent=user_agent)
    # Hoisted: newest() was called up to three times in the original.
    # NOTE(review): len() on the newest() result is assumed to measure
    # archive age — confirm against the waybackpy version in use.
    newest = wayback_obj.newest()
    archive_age = len(newest)
    print(archive_age)

    # Create new archive if age is greater than limit, else use most recent
    if archive_age > archive_age_limit:
        archive_dict['archive_url'] = wayback_obj.save().archive_url
        archive_dict['archive_time'] = datetime.utcnow()
        new_archive = 1
    else:
        archive_dict['archive_url'] = newest.archive_url
        archive_dict['archive_time'] = newest.timestamp
        new_archive = 0

    # Dict of data about archive
    d = {
        'URL': [url],
        'File': [url.rsplit('/', 1)[-1]],
        'Directory': [targetdir],
        'ArchiveURL': [archive_dict['archive_url']],
        'ArchiveTime': [archive_dict['archive_time']],
        'NewArchive': [new_archive],
    }

    # BUGFIX: the original used f'{targetdir}+_data_log.csv', leaving a
    # literal '+' in the filename — an artifact of converting string
    # concatenation to an f-string.
    log_path = f'{targetdir}_data_log.csv'

    # Add to or create data log
    try:
        data_log = pd.read_csv(log_path, index_col='LogID')
        data_log['ArchiveTime'] = [parse(x) for x in data_log['ArchiveTime']]
        d['LogID'] = data_log.index.values.max() + 1
        d = pd.DataFrame.from_dict(d)
        d.set_index('LogID', inplace=True)
        data_log = pd.concat([d, data_log]).drop_duplicates(keep='last')
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt are
        # not swallowed. Any read/parse failure starts a fresh log.
        d['LogID'] = 1
        data_log = pd.DataFrame(data=d)
        data_log.set_index('LogID', inplace=True)
    data_log.to_csv(log_path, index_label='LogID')

    return archive_dict
def test_near():
    """near() should land within a small window of the requested date."""
    time.sleep(10)
    safari_ua = "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    target = waybackpy.Url("google.com", safari_ua)
    assert "2010" in target.near(year=2010)
    if sys.version_info > (3, 6):
        time.sleep(5)
        near_month = target.near(year=2015, month=2)
        assert any(ts in near_month for ts in ("201502", "201501", "201503"))
        edge_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
        target = waybackpy.Url("www.python.org", edge_ua)
        near_hour = target.near(year=2008, month=5, day=9, hour=15)
        assert any(ts in near_hour for ts in ("2008050915", "2008050914", "2008050913"))
        with pytest.raises(Exception):
            never_archived = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
            waybackpy.Url(never_archived, user_agent).near(year=2010)
def extract(url):
    """Collect the oldest and newest Wayback Machine timestamps for url.

    Falls back to the module's helper lookups when the waybackpy call
    raises WaybackError. Only non-empty values are included in the result.
    """
    log.info("extracting wayback data...")
    try:
        wb = waybackpy.Url(url, USER_AGENT)
        oldest = wb.oldest().timestamp.isoformat()
        newest = wb.newest().timestamp.isoformat()
    except WaybackError:
        oldest = _get_oldest_wayback_timestamp(url)
        newest = _get_newest_wayback_timestamp(url)
    meta = {}
    if oldest:
        meta["oldest-wayback-archive"] = oldest
    if newest:
        meta["newest-wayback-archive"] = newest
    return {"meta": meta}
def test_get():
    """get() on the oldest snapshot of google.com should contain the greeting."""
    time.sleep(10)
    wb = waybackpy.Url("google.com", user_agent)
    oldest_page = wb.get(wb.oldest())
    assert "Welcome to Google" in oldest_page
if "detalles" not in link: index1 = link.index('//') total_page = int(last_page_text[index2 + 1:]) University_Web.append(link) # Crawl rate (Avoiding saturating the web server) sleep(randint(3, 10)) # Scraping Historical data. elif scraping_option == 1: print("Generating your csv file, please wait...") # Obtaining archive URLs from Wayback Machine. input_url = dict_url[int(url_option)] user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" wayback = waybackpy.Url(input_url, user_agent) now = datetime.now() year_limit = now.year wayback_urls = [] for i in range(2012, year_limit): archive = wayback.near(year=i, month=12) wayback_urls.append(archive.archive_url) # Variable Year. for i in range(2012, year_limit): for j in range(1, 101): Year.append(i) # Start of scraping for url in wayback_urls:
def save_archive_of_webpage_in_wbm(url, idx, total, dry_run=False):
    '''
    saves a copy of the given URL in the Internet Archive wayback machine
    and returns the archive URL (or None is never returned: on total failure
    an Exception is raised instead)

    @param url - the url as a string
    @param idx - the download # we are on (for log messages)
    @param total - the total downloads (for log messages)
    @param dry_run - whether we should actually save it, or just log what we would do
    '''
    if dry_run:
        logger.info("[`%s/%s`] DRY RUN: - would have saved the url `%s` in the wayback machine", idx, total, url)
        return
    logger.info("[`%s/%s`] - saving an archive of the url `%s` in the wayback machine", idx, total, url)
    error_list = []
    archive_url = None
    # loop until we get a valid result since it seems to not return a result a lot of
    # the time , probably because it takes too long and is queued?
    # also, do +1 because we start at index 1 here
    for iter_try_idx in range(1, constants.WAYBACK_ATTEMPT_MAX + 1):
        try:
            logger.debug("try `%s/%s` on url `%s`", iter_try_idx, constants.WAYBACK_ATTEMPT_MAX, url)
            wayback_handle_for_url = waybackpy.Url(url, constants.HTTP_USER_AGENT)
            logger.debug("calling save() on wayback handle for url: `%s`", repr(wayback_handle_for_url))
            archive = wayback_handle_for_url.save()
            archive_url = archive.archive_url
            logger.info("archive of url `%s` complete, url: `%s`", url, archive_url)
        except WaybackError as e:
            # transient failure: remember the error, back off, and retry
            logger.debug(f"Got WaybackError when trying to save url `{url}`: `{e}`")
            error_list.append(e)
            logger.info("Got WaybackError, sleeping for `%s` seconds before trying again",
                constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            time.sleep(constants.WAYBACK_MACHINE_BACKOFF_TIME_SECONDS)
            continue
        except URLError as e:
            # don't continue here, this means we screwed up when providing
            # the url
            logger.debug(f"Got URLError when trying to save url `{url}`")
            raise e
        # check to see its not a garbage 'hashflags' url
        # for some reason, the wayback machine when given a twitter url has a chance to return a result that
        # isn't the page that is requested, but instead is a url of the form
        # `https://web.archive.org/web/20210115010042/https://pbs.twimg.com/hashflag/config-2021-01-15-01.json`
        # if we get this, we should throw an exception so we don't accidentally use this wayback machine URL as a real one
        logger.debug("checking the returned archive url `%s` against the hashflags JSON regex: `%s`",
            archive_url, constants.TWITTER_HASHFLAGS_REGEX)
        hashflags_re_result = constants.TWITTER_HASHFLAGS_REGEX.search(archive_url)
        logger.debug("regex result: `%s`", hashflags_re_result)
        if hashflags_re_result:
            # corrupted archive: discard it, wait a long time, and retry
            logger.error("Hashflasgs regex `%s matched the returned archive url `%s` ," +
                " this means that the archive was corrupted and shouldn't be used, sleeping for 30 minutes",
                constants.TWITTER_HASHFLAGS_REGEX, archive_url)
            archive_url = None
            time.sleep(30 * 60)
            continue
        else:
            logger.debug("archive_url `%s` did not match the regex `%s`, returning",
                archive_url, constants.TWITTER_HASHFLAGS_REGEX)
            return archive_url
    # if we get here, then we ran out of tries
    raise Exception(f"did not get a good result when trying to save the url `{url}` in the wayback machine, errors: `{error_list}`")
def review_file(inpage: pywikibot.page.BasePage,
                throttle: Optional[utils.Throttle] = None) -> Optional[bool]:
    """Performs a license review on the input page

    inpage must be in the file namespace.

    Returns None if the file was skipped
    Returns False if there was an error during review
    Returns True if the file was successfully reviewed (pass or fail)
    """
    try:
        page = pywikibot.FilePage(inpage)
    except ValueError:
        # not a file page — skip
        return None
    logger.info(f"Checking {page.title(as_link=True)}")
    utils.check_runpage(site, run_override)
    if not check_can_run(page):
        return None
    raw_obs_id, raw_photo_id = find_ina_id(page)
    logger.info(f"ID found in wikitext: {raw_obs_id} {raw_photo_id}")
    if raw_photo_id and not raw_obs_id:
        # only a photo ID present: resolve it to its observation
        raw_obs_id = get_observation_from_photo(raw_photo_id)
    if not raw_obs_id:
        logger.info("No observation ID could be found")
        update_review(page, status="error", reason="url", throttle=throttle)
        return False
    ina_throttle = utils.Throttle(10)
    ina_data = get_ina_data(raw_obs_id, ina_throttle)
    if not ina_data:
        logger.warning("No data retrieved from iNaturalist!")
        update_review(page, status="error", reason="nodata", throttle=throttle)
        return False
    photo_id, found = find_photo_in_obs(page, raw_obs_id, ina_data,
                                        raw_photo_id, ina_throttle)
    if photo_id is None:
        logger.info(f"Images did not match: {found}")
        update_review(page, status="error", reason=found, throttle=throttle)
        return False
    else:
        assert isinstance(photo_id, iNaturalistID)
    ina_license = find_ina_license(ina_data, photo_id)
    logger.debug(f"iNaturalist License: {ina_license}")
    ina_author = find_ina_author(ina_data)
    logger.debug(f"Author: {ina_author}")
    com_license = find_com_license(page)
    logger.debug(f"Commons License: {com_license}")
    status = check_licenses(ina_license, com_license)
    # file age only matters for failed reviews
    if status == "fail":
        is_old = file_is_old(page)
    else:
        is_old = False
    # archive the source photo page only for passing reviews, when enabled
    if config["use_wayback"] and status in ("pass", "pass-change"):
        archive = waybackpy.Url(str(photo_id), user_agent).save()
    else:
        archive = ""
    reviewed = update_review(
        page,
        photo_id,
        status=status,
        author=ina_author,
        review_license=ina_license,
        upload_license=com_license,
        reason=found,
        is_old=is_old,
        throttle=throttle,
        archive=archive,
    )
    if status == "fail" and reviewed:
        fail_warning(page, ina_license, is_old)
    return reviewed
def Alljson(row):
    """Return the waybackpy JSON metadata for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).JSON
def newestCache(row):
    """Return the newest Wayback Machine snapshot for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).newest()
def near(row):
    """Return the snapshot closest to the module-level year/month/day for this row's URL."""
    return waybackpy.Url(row['url'], user_agent).near(year=year, month=month, day=day)
def test_oldest():
    """oldest() should report the repository's first-ever snapshot timestamp."""
    time.sleep(10)
    repo = waybackpy.Url("github.com/akamhy/waybackpy", user_agent)
    assert "20200504141153" in repo.oldest()
def get_wayback_item(url) -> Union[waybackpy.Url, None]:
    """Build a waybackpy.Url handle for url, or None if construction fails.

    :param url: URL string to wrap.
    :return: waybackpy.Url instance, or None on error.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
    try:
        return waybackpy.Url(url, user_agent)
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; the best-effort None return is kept for real errors.
        return None
def test_url_check():
    """Constructing a Url from a malformed address should raise."""
    time.sleep(10)
    with pytest.raises(Exception):
        waybackpy.Url("http://wwwgooglecom/", user_agent)
# Write results as a semicolon-separated CSV (Excel dialect).
spamwriter = csv.writer(csvfile, dialect='excel', delimiter=';',
                        quotechar='"', quoting=csv.QUOTE_MINIMAL)
# One URL per line in INFILE; lines starting with '#' are comments.
urllist = loadtxt(INFILE, dtype=str, comments="#", delimiter="\n", encoding='utf-8')
for url in urllist:
    # pause for 1 sec, not to overrequest the server.
    # Choose longer times for archiving pdf and doc files, as the archiving
    # software needs more time to read these docs than standard html pages
    time.sleep(1)
    wayback = waybackpy.Url(url, USER_AGENT)
    # MODE 1: Code for saving url/page
    try:
        archive = wayback.save()
        print(url, archive.archive_url)
        spamwriter.writerow([url, archive.archive_url])
    except WaybackError as e:
        # Wayback Machine refused or failed the save; record the failure.
        print(url, "WaybackError - could not save page to WBM")
        spamwriter.writerow([url, "WaybackError - could not save page to IA"])
    except AttributeError as d:
        # save() returned an object without the expected attribute.
        print(url, "AttributeError - could not save page to IA")
        spamwriter.writerow(
            [url, "AttributeError - could not save page to WBM"])
# MODE 2: Code for retrieving latest/newest version of page
def totalArchivesCount(row):
    """Return the total number of Wayback snapshots for the URL in this row."""
    return waybackpy.Url(row['url'], user_agent).total_archives()
def test_newest():
    """newest() result should reference the original URL."""
    time.sleep(10)
    repo_url = "github.com/akamhy/waybackpy"
    assert repo_url in waybackpy.Url(repo_url, user_agent).newest()
if len(raw) > 0: html = str(raw[0]['content']) res['outline_html'] = html att = soup.find_all('img', {'class': 'logo'}) if len(att) > 0: img = str(att[0]['src']) res['outline_img'] = img except Exception as e: print(f"OUTLINE ERROR: {str(e)}") # print(driver.save_screenshot("ERROR.png")) pass # Wayback Machine print("WAYBACK MACHINE") wayback = waybackpy.Url(res['external_link']) try: res['archived_url'] = wayback.newest().archive_url except: print("SAVING ON WAYBACK") wayback.save() res['archived_url'] = wayback.newest().archive_url # Newspape Metadata print("NEWSFETCH") n = newspaper(res['external_link']).get_dict res['headline'] = n['headline'] res['summary'] = n['summary'] res['article'] = n['article'] res['description'] = n['description'] res['publication'] = n['publication']