def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """Check the closest memento URI returned by a default ``MementoClient``.

    The first entry of the ``closest`` memento's ``uri`` list must equal
    ``expected_uri_m``.
    """
    client = MementoClient()
    info = client.get_memento_info(input_uri_r, input_datetime)
    closest = info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
def test_get_memento_uri_specified_timegate_direct_timegate_query(
        input_uri_r, input_datetime, input_timegate, expected_uri_m):
    """Check the closest memento URI when querying an explicit timegate.

    The client is built with ``check_native_timegate=False`` and the lookup
    is made with ``include_uri_checks=False``; the first ``closest`` URI must
    equal ``expected_uri_m``.
    """
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)
    info = client.get_memento_info(input_uri_r, input_datetime,
                                   include_uri_checks=False)
    closest = info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
def url_list(request):
    """List stored URLs and handle submission of a new one.

    On a valid POST the submitted URL is fetched, its title, final
    destination and status code are recorded, and the closest memento
    reported by ``MementoClient.get_memento_info`` is stored.  The record is
    then saved and the client redirected to ``url_detail``.  If anything in
    that lookup fails, placeholder values are stored instead so the record
    is still saved.

    For any other request (or an invalid form) the list page is rendered.
    """
    urls = URL.objects.all()

    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                # NOTE(review): the unsaved model instance itself is handed to
                # requests.get -- presumably its string form is the URL;
                # confirm against the URL model.
                response = requests.get(post)
                temp = BeautifulSoup(response.content, "lxml")
                # A page without a <title> tag should not discard the status
                # and destination we already know.
                if temp.title is not None:
                    post.title = temp.title.string
                else:
                    post.title = "No title"
                post.finalDestination = response.url
                post.statusCode = response.status_code

                dt = datetime.datetime.now()
                mc = MementoClient()
                closest = mc.get_memento_info(
                    post.finalDestination, dt).get("mementos").get("closest")
                post.uri = closest.get('uri')[0]
                post.datetime = str(closest.get('datetime'))
            except Exception:
                # Deliberate best-effort: any fetch/parse/lookup failure is
                # recorded as placeholders.  Only ``Exception`` is caught so
                # KeyboardInterrupt/SystemExit still propagate.
                post.statusCode = "None"
                post.finalDestination = "Does not exit"
                post.title = "No title"

            # Saving and redirecting happen after the try block rather than
            # in ``finally``: a ``return`` inside ``finally`` would silently
            # discard any exception still in flight.
            post.save()
            return redirect('url_detail', pk=post.pk)

    return render(request, 'lab1/url_list.html',
                  {'urls': urls, 'form': URLForm})
def url_list(request):
    """Render the URL list and process the "add URL" form.

    A valid POST fetches the submitted URL, stores its title, final
    destination, status code and the closest memento (URI and datetime)
    returned by ``MementoClient.get_memento_info``, saves the record and
    redirects to ``url_detail``.  When the fetch or lookup fails the record
    is saved with placeholder values instead.

    GET requests and invalid forms fall through to rendering the list.
    """
    urls = URL.objects.all()

    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                # NOTE(review): requests.get receives the model instance, not
                # a string -- presumably str(post) yields the URL; verify.
                response = requests.get(post)
                temp = BeautifulSoup(response.content, "lxml")
                # Pages lacking a <title> keep their real status/destination
                # and only get the placeholder title.
                title_tag = temp.title
                post.title = (title_tag.string
                              if title_tag is not None else "No title")
                post.finalDestination = response.url
                post.statusCode = response.status_code

                dt = datetime.datetime.now()
                mc = MementoClient()
                uri = post.finalDestination
                memento_uri = mc.get_memento_info(
                    uri, dt).get("mementos").get("closest")
                post.uri = memento_uri.get('uri')[0]
                post.datetime = str(memento_uri.get('datetime'))
            except Exception:
                # Best-effort fallback; narrower than a bare ``except`` so
                # interpreter-exit signals are not swallowed.
                post.statusCode = "None"
                post.finalDestination = "Does not exit"
                post.title = "No title"

            # Kept out of ``finally`` so that a ``return`` there cannot mask
            # an exception that the handler above does not catch.
            post.save()
            return redirect('url_detail', pk=post.pk)

    return render(request, 'lab1/url_list.html', {
        'urls': urls,
        'form': URLForm
    })
def url_list(request):
    """Show the list of expanded URLs and handle the search form.

    On a valid POST the submitted URL is fetched and the record is filled
    with its status code, final URL and title, the closest memento (URI and
    datetime) from ``MementoClient.get_memento_info``, and a link to a JPEG
    capture that is rendered through the PhantomJScloud API and uploaded to
    the ``lab3pics`` S3 bucket.  If any step fails, placeholder values are
    stored and the error text is kept in ``archive_link``.  The record is
    saved and the client is redirected to ``url_detail``.

    Every other request -- including a POST whose form does not validate --
    renders the list of URLs dated up to now, newest first.
    """
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            new_url = form.save(commit=False)
            new_url.date = timezone.now()

            try:
                # NOTE(review): the model instance is passed as the URL --
                # presumably its string form is the address; confirm.
                response = requests.get(new_url)
                page = BeautifulSoup(response.content, "lxml")
                if page.title is not None:
                    title = page.title.string
                else:
                    title = "No Title Available"

                new_url.status = response.status_code
                new_url.final_url = response.url
                new_url.title = title

                # Wayback storing
                current_date = datetime.datetime.now()
                memento = MementoClient()
                wayback_res = memento.get_memento_info(
                    response.url, current_date).get("mementos").get("closest")
                new_url.wayback = wayback_res.get("uri")[0]
                wayback_dt = wayback_res.get("datetime")
                if wayback_dt is not None:
                    new_url.wayback_date = str(wayback_dt)
                else:
                    new_url.wayback_date = str(current_date)

                # Picture archiving: render the page with PhantomJScloud ...
                s3_connection = boto3.resource("s3")
                data = json.dumps({"url": response.url,
                                   "renderType": "jpeg"}).encode("utf-8")
                headers = {"content-type": "application/json"}
                api_url = ("http://PhantomJScloud.com/api/browser/v2/"
                           + api_key + "/")
                req = urllibreq.Request(url=api_url, data=data,
                                        headers=headers)
                res = urllibreq.urlopen(req)
                result = res.read()

                # ... store the image on S3 under a timestamp-derived key ...
                image_key = str(current_date) + ".jpg"
                s3_connection.Bucket("lab3pics").put_object(
                    Key=image_key, Body=result, ACL="public-read",
                    ContentType="image/jpeg")

                # ... and record its public link.
                new_url.archive_link = ("http://s3.amazonaws.com/lab3pics/"
                                        + image_key)
            except Exception as e:
                # Best-effort: any failure is recorded on the row itself.
                new_url.status = "None"
                new_url.final_url = "Does not exist"
                new_url.title = "This webpage does not exist"
                new_url.wayback = "Not available"
                new_url.wayback_date = "Not available"
                # Store the message text rather than the exception object.
                new_url.archive_link = str(e)

            # Done after the try block, not in ``finally``: returning from
            # ``finally`` would swallow exceptions the handler does not catch
            # (e.g. KeyboardInterrupt).
            new_url.save()
            return redirect('url_detail', pk=new_url.pk)

    # Reached for GET *and* for an invalid POST, so ``urls`` is always bound
    # before rendering.
    urls = URL.objects.filter(date__lte=timezone.now()).order_by('-date')
    return render(request, 'urlexpander/url_list.html',
                  {'urls': urls, 'form': SearchForm})
def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """A default ``MementoClient`` returns the expected closest memento URI."""
    memento_info = MementoClient().get_memento_info(input_uri_r,
                                                    input_datetime)
    closest_uris = memento_info.get("mementos").get("closest").get("uri")
    actual_uri_m = closest_uris[0]

    assert expected_uri_m == actual_uri_m
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """The reported ``original_uri`` equals the requested URI.

    The lookup goes through the timegate ``input_uri_g`` with an
    accept-datetime of 1 Jan 1970 00:00:00 GMT.
    """
    client = MementoClient(timegate_uri=input_uri_g)
    epoch = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                       "%a, %d %b %Y %H:%M:%S GMT")

    memento_info = client.get_memento_info(input_uri_r, epoch)
    original_uri = memento_info.get("original_uri")

    assert input_uri_r == original_uri
def test_bad_timegate_linux():
    """Expect ``requests.ConnectionError`` when ``http://www.example.com``
    is configured as the timegate."""
    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.com"
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    with pytest.raises(requests.ConnectionError):
        client.get_memento_info(input_uri_r,
                                accept_datetime).get("original_uri")
def good_url_slash_at_end():
    """Check the closest memento of ``http://www.cnn.com/`` for 11 Sep 2001.

    NOTE(review): the name lacks the ``test_`` prefix, so pytest's default
    collection will not pick this up -- presumably disabled on purpose;
    confirm.
    """
    input_uri_r = "http://www.cnn.com/"
    expected_uri_m = ('http://webarchive.loc.gov/all/20010911181528/'
                      'http://www2.cnn.com/')

    client = MementoClient()
    when = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    closest = client.get_memento_info(
        input_uri_r, when).get("mementos").get("closest")
    uri_m = closest.get("uri")[0]

    assert uri_m == expected_uri_m
def test_bad_timegate_osx():
    """With ``http://www.example.com`` as timegate the lookup still reports
    the requested URI as ``original_uri``."""
    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.com"
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    client = MementoClient(timegate_uri=bad_uri_g)
    memento_info = client.get_memento_info(input_uri_r, accept_datetime)
    original_uri = memento_info.get("original_uri")

    assert input_uri_r == original_uri
def test_get_memento_uri_specified_timegate(
        input_uri_r, input_datetime, input_timegate, expected_uri_m):
    """Check the closest memento URI when an explicit timegate is supplied.

    The client is created with ``check_native_timegate=False``.
    """
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)
    memento_info = client.get_memento_info(input_uri_r, input_datetime)
    closest = memento_info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """``original_uri`` in the memento info must echo the requested URI.

    Uses the timegate ``input_uri_g`` and an accept-datetime of
    1 Jan 1970 00:00:00 GMT.
    """
    mc = MementoClient(timegate_uri=input_uri_g)
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    info = mc.get_memento_info(input_uri_r, accept_datetime)

    assert input_uri_r == info.get("original_uri")
def test_bad_timegate():
    """Expect ``requests.ConnectionError`` when ``http://www.example.moc``
    is configured as the timegate.

    Also prints whether ``HTTP_PROXY`` is present in the environment.
    """
    print("'HTTP_PROXY' in os.environ: {}".format('HTTP_PROXY' in os.environ))

    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.moc"
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    with pytest.raises(requests.ConnectionError):
        client.get_memento_info(input_uri_r,
                                accept_datetime).get("original_uri")
def test_good_url_slash_at_end():
    """The closest memento of ``http://www.cnn.com/`` for 11 Sep 2001
    08:45:45 GMT is the expected Library of Congress capture."""
    input_uri_r = "http://www.cnn.com/"
    expected_uri_m = ('http://webarchive.loc.gov/all/20010911181528/'
                      'http://www2.cnn.com/')

    client = MementoClient()
    when = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    memento_info = client.get_memento_info(input_uri_r, when)
    uri_m = memento_info.get("mementos").get("closest").get("uri")[0]

    assert uri_m == expected_uri_m
def main():
    """Entry function.

    Reads the input CSV, looks up memento information for the URL in the
    ``url_field`` column of each row (starting at row offset 9487) and
    appends one result row per URL -- original URI, closest/first/last
    memento URI and timegate URI -- to the output CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_csv_path_file',
                        help="specify the csv file to read")
    parser.add_argument('output_csv_path_file',
                        help="specify the csv file to write results")
    parser.add_argument('url_field',
                        help=" specify the field name to get the URL")
    parser.add_argument('datetime', help="Memento Datetime")
    args = parser.parse_args()

    df = pd.read_csv(args.input_csv_path_file)
    dt = datetime.datetime.strptime(args.datetime, '%Y%m%d')
    mc = MementoClient(check_native_timegate=False)

    def first_uri(mementos, rel):
        # First URI recorded for relation ``rel``, or '' when it is absent.
        entry = mementos.get(rel)
        if entry is None:
            return ''
        return entry.get("uri")[0]

    fieldnames = ['original-uri', 'memento-closest', 'memento-first',
                  'memento-last', 'timegate_uri']

    with open(args.output_csv_path_file, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for _, row in df[9487:].iterrows():
            memento = mc.get_memento_info(
                row[args.url_field], dt, include_uri_checks=False)
            mementos = memento.get("mementos")

            # Start from an "no mementos" record and fill in what exists.
            record = {
                'original-uri': memento.get("original_uri"),
                'memento-closest': '',
                'memento-first': '',
                'memento-last': '',
                'timegate_uri': memento.get("timegate_uri"),
            }
            if mementos is not None:
                record['memento-closest'] = first_uri(mementos, "closest")
                record['memento-first'] = first_uri(mementos, "first")
                record['memento-last'] = first_uri(mementos, "last")

            writer.writerow(record)
            csvfile.flush()
def test_get_memento_data_non_compliant(input_uri_m):
    """``original_uri`` in the memento info equals the URI that was queried.

    The lookup uses an accept-datetime of 1 Jan 1970 00:00:00 GMT.
    """
    # TODO: pytest did not seem to split this into arguments
    uri_m = input_uri_m[0]

    client = MementoClient()
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    memento_info = client.get_memento_info(uri_m, accept_datetime)
    original_uri = memento_info.get("original_uri")

    assert uri_m == original_uri
def test_nonexistent_urirs(input_uri_r):
    """Memento info echoes the requested URI and reports the
    ``timetravel.mementoweb.org`` timegate built from it."""
    uri_r = input_uri_r[0]
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    client = MementoClient()
    memento_info = client.get_memento_info(uri_r, accept_datetime)

    expected_timegate = (
        'http://timetravel.mementoweb.org/timegate/{}'.format(uri_r))

    assert memento_info.get("original_uri") == uri_r
    assert memento_info.get("timegate_uri") == expected_timegate
def test_get_memento_data_non_compliant(input_uri_m):
    """The queried URI must come back as ``original_uri``.

    Queries a default ``MementoClient`` with an accept-datetime of
    1 Jan 1970 00:00:00 GMT.
    """
    # TODO: pytest did not seem to split this into arguments
    queried_uri = input_uri_m[0]

    mc = MementoClient()
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT",
        "%a, %d %b %Y %H:%M:%S GMT")
    info = mc.get_memento_info(queried_uri, accept_datetime)

    assert queried_uri == info.get("original_uri")
def test_nonexistent_urirs(input_uri_r):
    """``original_uri`` echoes the request and ``timegate_uri`` is the
    ``timetravel.mementoweb.org`` timegate for it."""
    requested_uri = input_uri_r[0]
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT",
        "%a, %d %b %Y %H:%M:%S GMT")

    mc = MementoClient()
    info = mc.get_memento_info(requested_uri, accept_datetime)

    timegate_template = 'http://timetravel.mementoweb.org/timegate/{}'

    assert info.get("original_uri") == requested_uri
    assert info.get("timegate_uri") == timegate_template.format(requested_uri)
def test_bad_timegate():
    """A lookup through the timegate ``http://www.example.moc`` must raise.

    Either ``requests.exceptions.ConnectionError`` or
    ``MementoClientException`` is accepted.  Also prints whether
    ``HTTP_PROXY`` is present in the environment.
    """
    print("'HTTP_PROXY' in os.environ: {}".format('HTTP_PROXY' in os.environ))

    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.moc"
    timestamp_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime(
        "Thu, 01 Jan 1970 00:00:00 GMT", timestamp_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    acceptable_errors = (
        requests.exceptions.ConnectionError,
        memento_client.memento_client.MementoClientException,
    )
    with pytest.raises(acceptable_errors):
        client.get_memento_info(input_uri_r,
                                accept_datetime).get("original_uri")
def get_via_mementos(uri, dt):
    """Resolve ``uri`` to an archived copy near ``dt``, if one is known.

    Queries the module-level ``timegate`` through ``MementoClient``.  When
    mementos are returned, the first URI of the ``closest`` entry (or,
    failing that, of the ``memento`` entry) replaces ``uri``, and ``id_`` is
    appended to any 14-digit timestamp path segment.

    Lookup errors are printed and otherwise ignored, in which case the
    original ``uri`` is returned unchanged.

    :param uri: the URI to look up.
    :param dt: datetime passed to ``get_memento_info``.
    :return: the rewritten memento URI, or ``uri`` itself when nothing was
        found or the lookup failed.
    """
    mc = MementoClient(timegate_uri=timegate, check_native_timegate=False)
    print("Getting mementos for %s ..." % uri)
    try:
        mementos = mc.get_memento_info(uri, dt).get("mementos")
        if mementos:
            print("Got mementos for %s ..." % uri)
            # Read the entry whose presence was actually tested.  The
            # original 'memento' branch read "closest" again, which is
            # absent in that branch and therefore always failed.
            # NOTE(review): assumes the 'memento' entry has the same
            # {"uri": [...]} shape as 'closest' -- confirm.
            for rel in ('closest', 'memento'):
                if rel in mementos:
                    uri = mementos.get(rel).get("uri")[0]
                    break
            # Need to patch the id_ into the url:
            uri = re.sub(r"\/(\d{14})\/", r"/\1id_/", uri)
    except Exception as e:
        # Best-effort: report the problem and fall back to the input URI.
        print(e)
    return uri
def on_callback_query(msg):
    """Handle a callback query from the inline keyboard.

    * ``save``: if the module-level ``delay`` is set and has passed, submit
      the URL to archive.fo and start a new three-minute cooldown;
      otherwise (cooldown still running) tell the user to retry later.
    * anything else: look up the previous (``back``) or next (``next``)
      archive relative to the timestamp in the current message text.

    Afterwards the message is edited to show the resulting archive URI (when
    one was obtained) and the callback query is answered with
    ``callback_text``.

    NOTE(review): SOURCE's indentation was lost; the branch nesting here is
    the reconstruction that keeps every branch reachable -- confirm against
    the original file.
    """
    query_id, chat_id, query_data = telepot.glance(msg,
                                                   flavor='callback_query')
    print('Recieved query ' + query_id)
    url = msg['message']['reply_to_message']['text'].split(' ')[1]
    msg_idf = telepot.message_identifier(msg['message'])
    callback_text = ''
    global delay

    # Bound up front so the edit step below can test them explicitly instead
    # of relying on a bare ``except`` to hide unbound-name errors.
    archive_uri = None
    keyboard = None

    if query_data == 'save':
        if delay != '':
            if datetime.datetime.now() > delay:
                r = requests.get('https://archive.fo/')
                html = r.text
                soup = BeautifulSoup(html, 'lxml')
                submitid = soup.find('input').get('value')
                headers = {
                    'User-agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
                }
                values = {'submitid': submitid, 'url': url, 'anyway': '1'}
                r = requests.post('https://archive.fo/submit/',
                                  data=values, headers=headers)
                uri = r.text
                archive_uri = uri.split('"')[1]
                delay = datetime.datetime.now() + datetime.timedelta(minutes=3)
                if 'archive.fo' not in archive_uri:
                    callback_text = 'Something went wrong, let @raku_cat know'
            else:
                callback_text = 'Saving on cooldown, please try again in a few miniutes.'
    else:
        uri = msg['message']['text']
        # NOTE(review): link_handler returns a plain string on several
        # paths, in which case this unpacking raises -- verify callers only
        # reach here for messages that carry the keyboard.
        foo, keyboard = link_handler(url)
        dt = uri.split('/')[3]
        dt = datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')
        timegate = 'https://archive.fo/timegate/'
        mc = MementoClient(timegate_uri=timegate, check_native_timegate=False)
        if query_data == 'back':
            try:
                # NOTE(review): 'back' queries with ``url`` while 'next'
                # queries with ``uri`` -- looks inconsistent; confirm intent.
                archive_uri = mc.get_memento_info(
                    url, dt).get('mementos').get('prev').get('uri')[0]
            except AttributeError:
                callback_text = 'No older archives or something went wrong.'
        elif query_data == 'next':
            try:
                archive_uri = mc.get_memento_info(
                    uri, dt).get('mementos').get('next').get('uri')[0]
            except AttributeError:
                callback_text = 'No newer archives or something went wrong.'

    if archive_uri is not None:
        # Editing is best-effort, but only ordinary errors are suppressed
        # and they are reported instead of vanishing.
        try:
            bot.editMessageText(msg_idf, archive_uri)
        except Exception as e:
            print('editMessageText failed: ' + str(e))
        if keyboard is not None:
            try:
                bot.editMessageText(msg_idf, archive_uri,
                                    reply_markup=keyboard)
            except Exception as e:
                print('editMessageText failed: ' + str(e))

    bot.answerCallbackQuery(query_id, text=callback_text)
    print('Responding to callback ' + query_id)
def link_handler(link):
    """Find a URL in ``link`` and return its latest archive.fo/archive.is copy.

    If ``link`` contains a space, only the second space-separated token is
    examined.  Return values:

    * ``'No valid URL found'`` when the text does not match the URL pattern;
    * the result of ``archive_create(uri)`` when the memento lookup raises
      ``AttributeError``;
    * the archive URI (a string) when it contains ``archive.fo``;
    * an ``(archive_uri, keyboard)`` tuple when it contains ``archive.is``;
    * the timegate URI when the archive URI contains ``trans``;
    * otherwise a generic "something went wrong" message.
    """
    tokens = link.split(' ')
    if len(tokens) > 1:
        link = tokens[1]

    uri_regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)

    match = uri_regex.search(link)
    if not match:
        return 'No valid URL found'

    print('Url found')
    uri = match.group(0)
    print(uri)

    timegate = 'https://archive.fo/timegate/'
    mc = MementoClient(timegate_uri=timegate, check_native_timegate=False)
    try:
        last = mc.get_memento_info(uri).get("mementos").get("last")
        archive_uri = last.get("uri")[0]
        print('Archive is ' + archive_uri)
    except AttributeError:
        # Some level of the lookup was missing: create an archive instead.
        return archive_create(uri)
    except NameError:
        print('Sum happen')
        return 'Something went wrong, let @raku_cat know'

    if 'archive.fo' in archive_uri:
        return archive_uri

    if 'archive.is' in archive_uri:
        save_row = [
            InlineKeyboardButton(text='Force save page',
                                 callback_data='save')
        ]
        navigation_row = [
            InlineKeyboardButton(text='← Prior', callback_data='back'),
            InlineKeyboardButton(text='Next →', callback_data='next')
        ]
        history_row = [
            InlineKeyboardButton(text='History',
                                 switch_inline_query_current_chat=uri)
        ]
        keyboard = InlineKeyboardMarkup(
            inline_keyboard=[save_row, navigation_row, history_row])
        return archive_uri, keyboard

    if 'trans' in archive_uri:
        timegate_uri = mc.get_memento_info(uri).get("timegate_uri")
        print('Sent weird api deal')
        return timegate_uri

    print('^No it wasn\'t')
    return 'Something went wrong, let @raku_cat know'