def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            if validate_upload_file(request.FILES['file']) and request.FILES['file'].size <= settings.MAX_ARCHIVE_FILE_SIZE:
                link = Link(submitted_url=form.cleaned_data['url'], submitted_title=form.cleaned_data['title'])
                if request.user.is_authenticated():
                    link.created_by = request.user
                link.save()

                now = datetime.now()
                time_tuple = now.timetuple()
                path_elements = [str(time_tuple.tm_year), str(time_tuple.tm_mon), str(time_tuple.tm_mday),
                                 str(time_tuple.tm_hour), str(time_tuple.tm_min), link.guid]

                linky_home_disk_path = settings.GENERATED_ASSETS_STORAGE + '/' + os.path.sep.join(path_elements)
                if not os.path.exists(linky_home_disk_path):
                    os.makedirs(linky_home_disk_path)

                asset, created = Asset.objects.get_or_create(link=link)
                asset.base_storage_path = os.path.sep.join(path_elements)
                asset.save()

                # Name the capture after the uploaded file's extension: cap.pdf for PDFs, cap.<ext> otherwise
                file_name = '/cap.' + request.FILES['file'].name.split('.')[-1]
                if request.FILES['file'].name.split('.')[-1] == 'pdf':
                    asset.pdf_capture = file_name
                else:
                    asset.image_capture = file_name
                asset.save()

                # Write the uploaded file to disk, flushing so it actually hits the disk
                request.FILES['file'].file.seek(0)
                f = open(linky_home_disk_path + file_name, 'wb')
                f.write(request.FILES['file'].file.read())
                f.flush()
                os.fsync(f.fileno())
                f.close()

                response_object = {'status': 'success', 'linky_id': link.guid, 'linky_hash': link.guid}
                return HttpResponse(json.dumps(response_object), 'application/json')
            else:
                return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Invalid file.'}), 'application/json')
        else:
            return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Missing file.'}), 'application/json')
def create_capture_job(user, human=True):
    link = Link(created_by=user, submitted_url="http://example.com")
    link.save()
    capture_job = CaptureJob(created_by=user, link=link, human=human, status='pending')
    capture_job.save()
    return capture_job
def test_link_count_period_equal_dates(self):
    '''
    If end date = start date, links are only counted once
    '''
    now = tz_datetime(timezone.now().year, 1, 1)

    user = LinkUser()
    user.save()
    link = Link(creation_timestamp=now, guid="AAAA-AAAA", created_by=user)
    link.save()

    links = Link.objects.filter(pk=link.pk)
    self.assertEqual(len(links), 1)
    self.assertEqual(link_count_in_time_period(links, now, now), len(links))
def test_delete_bonus_link(self):
    # make a bonus link here, rather than messing with the fixtures
    bonus_link = Link(created_by=self.regular_user, bonus_link=True)
    bonus_link.save()
    bonus_link_url = "{0}/{1}".format(self.list_url, bonus_link.pk)

    # establish baseline
    links_remaining, _, bonus_links = self.regular_user.get_links_remaining()
    self.assertEqual(links_remaining, 6)
    self.assertEqual(bonus_links, 0)

    # delete the bonus link
    self.successful_delete(bonus_link_url, user=self.regular_user)
    self.regular_user.refresh_from_db()

    # assertions
    links_remaining, links_remaining_period, bonus_links = self.regular_user.get_links_remaining()
    self.assertEqual(links_remaining, 6)
    self.assertEqual(bonus_links, 1)
def test_link_count_regular_user(self):
    """ We do some link count tallying on save """
    link_count = self.regular_user.link_count
    link = Link(created_by=self.regular_user, submitted_url="http://example.com")
    link.save()
    self.regular_user.refresh_from_db()
    self.assertEqual(link_count + 1, self.regular_user.link_count)

    link.safe_delete()
    link.save()
    self.regular_user.refresh_from_db()
    self.assertEqual(link_count, self.regular_user.link_count)
def test_link_count_for_orgs(self):
    """ We do some link count tallying on save. Let's make sure we're adjusting the counts on the orgs. """
    org_to_which_user_belongs = self.org_user.organizations.all().first()
    link_count = org_to_which_user_belongs.link_count
    link = Link(created_by=self.org_user, submitted_url="http://example.com", organization=org_to_which_user_belongs)
    link.save()
    org_to_which_user_belongs.refresh_from_db()
    self.assertEqual(link_count + 1, org_to_which_user_belongs.link_count)

    link.safe_delete()
    link.save()
    org_to_which_user_belongs.refresh_from_db()
    self.assertEqual(link_count, org_to_which_user_belongs.link_count)
def test_link_count_for_registrars(self):
    """ We do some link count tallying on save. Let's make sure we're adjusting the counts on the registrars. """
    registrar_to_which_user_belongs = self.registrar_user.registrar
    link_count = registrar_to_which_user_belongs.link_count
    org_managed_by_registrar = registrar_to_which_user_belongs.organizations.all().first()
    link = Link(created_by=self.registrar_user, submitted_url="http://example.com", organization=org_managed_by_registrar)
    link.save()
    registrar_to_which_user_belongs.refresh_from_db()
    self.assertEqual(link_count + 1, registrar_to_which_user_belongs.link_count)

    link.safe_delete()
    link.save()
    registrar_to_which_user_belongs.refresh_from_db()
    self.assertEqual(link_count, registrar_to_which_user_belongs.link_count)
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            mime = MimeTypes()
            mime_type = mime.guess_type(request.FILES['file'].name)

            # Get mime type string from tuple
            if mime_type[0]:
                mime_type = mime_type[0]
            else:
                return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Invalid file.'}), 'application/json')

            if validate_upload_file(request.FILES['file'], mime_type) and request.FILES['file'].size <= settings.MAX_ARCHIVE_FILE_SIZE:
                link = Link(submitted_url=form.cleaned_data['url'], submitted_title=form.cleaned_data['title'])
                if request.user.is_authenticated():
                    link.created_by = request.user
                link.save()

                now = datetime.now()
                time_tuple = now.timetuple()
                path_elements = [str(time_tuple.tm_year), str(time_tuple.tm_mon), str(time_tuple.tm_mday),
                                 str(time_tuple.tm_hour), str(time_tuple.tm_min), link.guid]

                linky_home_disk_path = settings.GENERATED_ASSETS_STORAGE + '/' + os.path.sep.join(path_elements)
                if not os.path.exists(linky_home_disk_path):
                    os.makedirs(linky_home_disk_path)

                asset, created = Asset.objects.get_or_create(link=link)
                asset.base_storage_path = os.path.sep.join(path_elements)
                asset.save()

                # Name the capture after its mime type: cap.pdf for PDFs, cap.<ext> for images
                file_name = '/cap' + mime.guess_extension(mime_type)
                if mime_type == 'application/pdf':
                    asset.pdf_capture = file_name
                else:
                    asset.image_capture = file_name
                asset.save()

                # Write the uploaded file to disk, flushing so it actually hits the disk
                request.FILES['file'].file.seek(0)
                f = open(linky_home_disk_path + file_name, 'wb')
                f.write(request.FILES['file'].file.read())
                f.flush()
                os.fsync(f.fileno())
                f.close()

                response_object = {'status': 'success', 'linky_id': link.guid, 'linky_hash': link.guid}
                return HttpResponse(json.dumps(response_object), 'application/json')
            else:
                return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Invalid file.'}), 'application/json')
        else:
            return HttpResponseBadRequest(json.dumps({'status': 'failed', 'reason': 'Missing file.'}), 'application/json')
def single_linky(request, linky_guid):
    """
    Given a Perma ID, serve it up. Vesting also takes place here.
    """
    if request.method == 'POST' and request.user.is_authenticated():
        Link.objects.filter(guid=linky_guid).update(vested=True,
                                                    vested_by_editor=request.user,
                                                    vested_timestamp=datetime.now())
        return HttpResponseRedirect(reverse('single_linky', args=[linky_guid]))
    else:
        canonical_guid = Link.get_canonical_guid(linky_guid)
        if canonical_guid != linky_guid:
            return HttpResponsePermanentRedirect(reverse('single_linky', args=[canonical_guid]))

        link = get_object_or_404(Link, guid=linky_guid)

        # Increment the view count if we're not the referrer
        parsed_url = urlparse(request.META.get('HTTP_REFERER', ''))
        current_site = Site.objects.get_current()
        if current_site.domain not in parsed_url.netloc:
            link.view_count += 1
            link.save()

        asset = Asset.objects.get(link=link)
        text_capture = None

        # User requested archive type
        serve_type = 'live'
        if 'type' in request.REQUEST:
            requested_type = request.REQUEST['type']
            if requested_type == 'image':
                serve_type = 'image'
            elif requested_type == 'pdf':
                serve_type = 'pdf'
            elif requested_type == 'source':
                serve_type = 'source'
            elif requested_type == 'text':
                serve_type = 'text'
                if asset.text_capture and asset.text_capture != 'pending':
                    path_elements = [settings.GENERATED_ASSETS_STORAGE, asset.base_storage_path, asset.text_capture]
                    file_path = os.path.sep.join(path_elements)
                    with open(file_path, 'r') as f:
                        text_capture = f.read()

        # If we are going to serve up the live version of the site, let's make sure it's iframe-able
        display_iframe = False
        if serve_type == 'live':
            try:
                response = urllib2.urlopen(link.submitted_url)
                if 'X-Frame-Options' in response.headers:
                    # TODO actually check if X-Frame-Options specifically allows requests from us
                    display_iframe = False
                else:
                    display_iframe = True
            except urllib2.URLError:
                # Something is broken with the site, so we might as well display it in an iframe so the user knows
                display_iframe = True

        created_datestamp = link.creation_timestamp
        pretty_date = created_datestamp.strftime("%B %d, %Y %I:%M GMT")

        context = RequestContext(request, {'linky': link,
                                           'asset': asset,
                                           'pretty_date': pretty_date,
                                           'user': request.user,
                                           'next': request.get_full_path(),
                                           'display_iframe': display_iframe,
                                           'serve_type': serve_type,
                                           'text_capture': text_capture})

        return render_to_response('single-link.html', context)
def validate(self, data):
    user = self.context['request'].user
    errors = {}

    # since 'file' is not a field on the model, we have to access it through request.data rather than data
    uploaded_file = self.context['request'].data.get('file')

    # handle is_private and private_reason:
    if self.instance:
        if not user.is_staff:
            # only staff can manually change private_reason in all cases
            data.pop('private_reason', None)

            # if updating privacy, make sure user is allowed to change private status
            if 'is_private' in data and self.instance.is_private != bool(data['is_private']):
                if self.instance.private_reason and self.instance.private_reason not in ['user', 'old_policy']:
                    errors['is_private'] = 'Cannot change link privacy.'
                else:
                    data['private_reason'] = 'user' if data['is_private'] else None
    else:
        # for new links, set private_reason based on is_private
        data['private_reason'] = 'user' if data.get('is_private') else None

    # check submitted URL for new link
    if not self.instance:
        if not data.get('submitted_url'):
            errors['url'] = "URL cannot be empty."
        else:
            try:
                validate = URLValidator()
                temp_link = Link(submitted_url=data['submitted_url'])
                validate(temp_link.ascii_safe_url)

                # Don't force URL resolution validation if a file is provided
                if not uploaded_file:
                    if not temp_link.ip:
                        errors['url'] = "Couldn't resolve domain."
                    elif not ip_in_allowed_ip_range(temp_link.ip):
                        errors['url'] = "Not a valid IP."
                    elif not temp_link.headers:
                        errors['url'] = "Couldn't load URL."
                    else:
                        # preemptively reject URLs that report a size over settings.MAX_ARCHIVE_FILE_SIZE
                        try:
                            if int(temp_link.headers.get('content-length', 0)) > settings.MAX_ARCHIVE_FILE_SIZE:
                                errors['url'] = "Target page is too large (max size %sMB)." % (settings.MAX_ARCHIVE_FILE_SIZE / 1024 / 1024)
                        except ValueError:
                            # content-length header wasn't an integer. Carry on.
                            pass
            except DjangoValidationError:
                errors['url'] = "Not a valid URL."
            except TooManyRedirects:
                errors['url'] = "URL caused a redirect loop."

    # check uploaded file
    if uploaded_file == '':
        errors['file'] = "File cannot be blank."
    elif uploaded_file:
        if self.instance and self.instance.is_permanent():
            errors['file'] = "Archive contents cannot be replaced after 24 hours"
        else:
            # Get mime type string from tuple
            mime_type = get_mime_type(uploaded_file.name)
            if not mime_type or not mime_type_lookup[mime_type]['valid_file'](uploaded_file):
                errors['file'] = "Invalid file."
            elif uploaded_file.size > settings.MAX_ARCHIVE_FILE_SIZE:
                errors['file'] = "File is too large."

    if errors:
        raise serializers.ValidationError(errors)

    return data
def test_link_count_valid_period(self):
    '''
    Should include links created only in the target year
    '''
    now = tz_datetime(timezone.now().year, 1, 1)
    two_years_ago = tz_datetime(now.year - 2, 1, 1)
    three_years_ago = tz_datetime(now.year - 3, 1, 1)
    user = LinkUser()
    user.save()
    link_pks = ["AAAA-AAAA", "BBBB-BBBB", "CCCC-CCCC", "DDDD-DDDD", "EEEE-EEEE"]

    older = Link(creation_timestamp=three_years_ago, guid=link_pks[0], created_by=user)
    older.save()
    old = Link(creation_timestamp=two_years_ago, guid=link_pks[1], created_by=user)
    old.save()
    now1 = Link(creation_timestamp=now, guid=link_pks[2], created_by=user)
    now1.save()
    now2 = Link(creation_timestamp=now, guid=link_pks[3], created_by=user)
    now2.save()
    now3 = Link(creation_timestamp=now, guid=link_pks[4], created_by=user)
    now3.save()

    links = Link.objects.filter(pk__in=link_pks)
    self.assertEqual(len(links), 5)
    self.assertEqual(link_count_in_time_period(links, three_years_ago, two_years_ago), 2)
url_details = urlparse(target_url)
target_title = url_details.netloc

# Get the markup. We get the mime-type and the title from this.
parsed_html = ''
try:
    r = requests.get(target_url)
    parsed_html = lxml.html.fromstring(r.content)
except IOError:
    logger.debug("Title capture from markup failed for %s, using the hostname" % target_url)

if len(parsed_html):
    if parsed_html.find(".//title") is not None and parsed_html.find(".//title").text:
        target_title = parsed_html.find(".//title").text.strip()

# We have some markup and a title. Let's create a linky from it
link = Link(submitted_url=target_url, submitted_title=target_title)
if request.user.is_authenticated():
    link.created_by = request.user
link.save()

# Assets get stored in /storage/path/year/month/day/hour/unique-id/*
# Build the path that we'll pass off to our workers to do the indexing. They'll store their results here
now = datetime.now()
time_tuple = now.timetuple()
path_elements = [str(time_tuple.tm_year), str(time_tuple.tm_mon), str(time_tuple.tm_mday),
                 str(time_tuple.tm_hour), str(time_tuple.tm_min), link.guid]

# Create a stub for our assets
asset, created = Asset.objects.get_or_create(link=link)
def update_perma(link_guid):
    """
    Update the vested/darchived status of a perma link, and download the assets if necessary
    """
    # N.B. This function has two instances of downloading stuff from
    # the root server using a scheme that looks something like
    # settings.SERVER + reverse("url_pattern")
    # This is nice because it means we don't have to repeat our URL
    # patterns from urls.py, but it hardcodes the fact that the root
    # server is another Perma instance. It's unclear to me which is a
    # better fact to abstract, but this is easier for now.

    ## First, let's get the metadata for this link. The metadata
    ## contains information about where we should place the assets (if
    ## we decide that we need them). This is also a fast check to make
    ## sure the link GUID is actually real.
    metadata_server = settings.UPSTREAM_SERVER['address']
    metadata_url = metadata_server + reverse("service_link_status", args=(link_guid,))
    metadata = requests.get(
        metadata_url,
        headers=settings.UPSTREAM_SERVER.get('headers', {})
    ).json()

    ## Next, let's see if we need to get the assets. If we have the
    ## Link object for this GUID, we're going to assume we already
    ## have what we need. It would make a little more sense to use the
    ## Asset object here instead, but we're definitely going to need
    ## to do stuff to the Link object so we might as well get that
    ## instead. In practice they should be ~one to one.
    try:
        link = Link.objects.get(guid=link_guid)
    except Link.DoesNotExist:
        ## We need to download the assets. We can download an archive
        ## from the assets server.
        assets_server = settings.UPSTREAM_SERVER['address']
        assets_url = assets_server + reverse("mirroring:link_assets", args=(link_guid,))

        # Temp paths can be relative because we're in run_in_tempdir()
        temp_zip_path = 'temp.zip'

        # Save remote zip file to disk, using streaming to avoid keeping large files in RAM.
        request = requests.get(
            assets_url,
            headers=settings.UPSTREAM_SERVER.get('headers', {}),
            stream=True)
        with open(temp_zip_path, 'wb') as f:
            for chunk in request.iter_content(1024):
                f.write(chunk)

        ## Extract the archive and change into the extracted folder.
        with zipfile.ZipFile(temp_zip_path, "r") as zipfh:
            #assets_path = os.path.dirname(os.path.join(settings.MEDIA_ROOT, metadata["path"]))
            zipfh.extractall()  # creates folder named [guid] in current temp dir

        temp_extracted_path = os.path.basename(metadata['path'])  # e.g. "1234-ABCD"

        # Save all extracted files to default_storage, using the path in metadata.
        for root, dirs, files in os.walk(temp_extracted_path):
            for file in files:
                source_file_path = os.path.join(root, file)  # e.g. "1234-ABCD/cap.png"
                dest_file_path = os.path.join(os.path.dirname(metadata['path']), source_file_path)  # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                with open(source_file_path, 'rb') as source_file:
                    default_storage.store_file(source_file, dest_file_path)

        ## We can now get some additional metadata that we'll need to
        ## create the Link object.
        with open(os.path.join(temp_extracted_path, "metadata.json"), "r") as fh:
            link_metadata = json.load(fh)

        ## We now have everything we need to initialize the Link object.
        link = Link(guid=link_guid)
        link.submitted_url = link_metadata["submitted_url"]
        link.submitted_title = link_metadata["submitted_title"]
        link.created_by = None  # XXX maybe we should do something with FakeUser here
        link.save(pregenerated_guid=True)  # We need to save this so that we can create an Asset object

        # This is a stupid hack to overcome the fact that the Link has
        # auto_now_add=True, so it's always going to be saved to the
        # current time on first creation.
        link.creation_timestamp = unserialize_datetime(link_metadata["creation_timestamp"])
        link.save()

        ## Lastly, let's create an Asset object for this Link.
        asset = Asset(link=link)
        asset.base_storage_path = metadata["path"]
        asset.image_capture = metadata["image_capture"]
        asset.warc_capture = metadata["source_capture"]
        asset.pdf_capture = metadata["pdf_capture"]
        asset.text_capture = metadata["text_capture"]
        asset.save()

    ## We can now add some of the data we got from the metadata to the Link object
    link.dark_archived = metadata["dark_archived"]
    link.vested = metadata["vested"]
    link.save()

    # If we have sub-mirrors, poke them to get a copy from us.
    if settings.DOWNSTREAM_SERVERS:
        run_task(poke_mirrors, link_guid=link_guid)
def test_org_link_count_this_year(self):
    '''
    Should include links created this year and exclude links older than that.
    '''
    r = Registrar()
    r.save()
    o = Organization(registrar=r)
    o.save()
    self.assertEqual(o.link_count_this_year(), 0)

    now = tz_datetime(timezone.now().year, 1, 1)
    two_years_ago = tz_datetime(now.year - 2, 1, 1)
    user = LinkUser()
    user.save()
    link_pks = ["AAAA-AAAA", "BBBB-BBBB", "CCCC-CCCC"]

    too_early = Link(creation_timestamp=two_years_ago, guid=link_pks[0], created_by=user, organization=o)
    too_early.save()
    now1 = Link(creation_timestamp=now, guid=link_pks[1], created_by=user, organization=o)
    now1.save()
    now2 = Link(creation_timestamp=now, guid=link_pks[2], created_by=user, organization=o)
    now2.save()

    links = Link.objects.filter(pk__in=link_pks)
    self.assertEqual(len(links), 3)
    self.assertEqual(o.link_count_this_year(), 2)
def test_registrar_link_count_this_year(self):
    '''
    Should include links created this year and exclude links older than that.
    Should work across all its orgs.
    '''
    r = Registrar()
    r.save()
    o1 = Organization(registrar=r)
    o1.save()
    o2 = Organization(registrar=r)
    o2.save()

    now = tz_datetime(timezone.now().year, 1, 1)
    two_years_ago = tz_datetime(now.year - 2, 1, 1)
    user = LinkUser()
    user.save()
    link_pks = ["AAAA-AAAA", "BBBB-BBBB", "CCCC-CCCC", "DDDD-DDDD"]

    too_early = Link(creation_timestamp=two_years_ago, guid=link_pks[0], created_by=user, organization=o1)
    too_early.save()
    now1 = Link(creation_timestamp=now, guid=link_pks[1], created_by=user, organization=o1)
    now1.save()
    now2 = Link(creation_timestamp=now, guid=link_pks[2], created_by=user, organization=o1)
    now2.save()
    now3 = Link(creation_timestamp=now, guid=link_pks[3], created_by=user, organization=o2)
    now3.save()

    links = Link.objects.filter(pk__in=link_pks)
    self.assertEqual(len(links), 4)
    self.assertEqual(r.link_count_this_year(), 3)
def test_most_active_org_in_time_period_valid_period(self):
    '''
    Should include links created only in the target year
    '''
    now = tz_datetime(timezone.now().year, 1, 1)
    two_years_ago = tz_datetime(now.year - 2, 1, 1)
    three_years_ago = tz_datetime(now.year - 3, 1, 1)

    r = Registrar()
    r.save()
    o1 = Organization(registrar=r)
    o1.save()
    o2 = Organization(registrar=r)
    o2.save()
    user = LinkUser()
    user.save()
    link_pks = ["AAAA-AAAA", "BBBB-BBBB", "CCCC-CCCC", "DDDD-DDDD", "EEEE-EEEE"]

    too_early1 = Link(creation_timestamp=three_years_ago, guid=link_pks[0], organization=o1, created_by=user)
    too_early1.save()
    too_early2 = Link(creation_timestamp=three_years_ago, guid=link_pks[1], organization=o1, created_by=user)
    too_early2.save()
    now1 = Link(creation_timestamp=now, guid=link_pks[2], organization=o1, created_by=user)
    now1.save()
    now2 = Link(creation_timestamp=now, guid=link_pks[3], organization=o2, created_by=user)
    now2.save()
    now3 = Link(creation_timestamp=now, guid=link_pks[4], organization=o2, created_by=user)
    now3.save()

    # organization 1 was more active in the past
    self.assertEqual(most_active_org_in_time_period(r.organizations, three_years_ago, two_years_ago), o1)
    # but organization 2 was more active during the period in question
    self.assertEqual(most_active_org_in_time_period(r.organizations, two_years_ago), o2)
    # with a total of three links, organization 1 has been more active over all
    self.assertEqual(most_active_org_in_time_period(r.organizations), o1)
def test_registrar_most_active_org_this_year(self):
    '''
    Should return the org (whole object) with the most links created this year,
    or None if it has no orgs with links created this year.
    '''
    r = Registrar()
    r.save()
    self.assertEqual(type(r.most_active_org_this_year()), type(None))

    o1 = Organization(registrar=r)
    o1.save()
    o2 = Organization(registrar=r)
    o2.save()
    now = tz_datetime(timezone.now().year, 1, 1)
    two_years_ago = tz_datetime(now.year - 2, 1, 1)
    user = LinkUser()
    user.save()
    link_pks = ["AAAA-AAAA", "BBBB-BBBB", "CCCC-CCCC", "DDDD-DDDD", "EEEE-EEEE", "FFFF-FFFF"]

    too_early = Link(creation_timestamp=two_years_ago, guid=link_pks[0], created_by=user, organization=o1)
    too_early.save()
    self.assertEqual(type(r.most_active_org_this_year()), type(None))

    now1 = Link(creation_timestamp=now, guid=link_pks[1], created_by=user, organization=o1)
    now1.save()
    now2 = Link(creation_timestamp=now, guid=link_pks[2], created_by=user, organization=o1)
    now2.save()
    now3 = Link(creation_timestamp=now, guid=link_pks[3], created_by=user, organization=o2)
    now3.save()
    self.assertEqual(r.most_active_org_this_year(), o1)

    now4 = Link(creation_timestamp=now, guid=link_pks[4], created_by=user, organization=o2)
    now4.save()
    now5 = Link(creation_timestamp=now, guid=link_pks[5], created_by=user, organization=o2)
    now5.save()
    self.assertEqual(r.most_active_org_this_year(), o2)