def test_1_http_upload_fail(self):
    """http_upload must mark the FileUpload as failed both when the server
    responds with an error status and when the request itself raises."""
    # patch requests so both HEAD and GET report failure
    requests.head = ResponseMockFactory.head_fail
    requests.get = ResponseMockFactory.get_fail

    fu = models.FileUpload()
    fu.set_id()
    fu.upload("testuser", "http://fail", status="exists")

    target = os.path.join(app.config.get("UPLOAD_DIR"), fu.local_filename)
    self.cleanup_paths.append(target)
    job = models.BackgroundJob()

    def assert_upload_failed():
        # the call reports failure and the record reflects it
        assert ingestarticles.http_upload(job, target, fu) is False
        assert fu.status == "failed"
        assert fu.error is not None and fu.error != ""
        assert fu.error_details is None
        assert list(fu.failure_reasons.keys()) == []

    # first pass: the mocked responses carry failure status codes
    assert_upload_failed()

    # now try it with an actual exception raised by the mock
    fu.upload("testuser", "http://except", status="exists")
    assert_upload_failed()
def test_3_submit_retry(self):
    """With retries configured to 1, submit() should retry once (leaving the
    job 'processing') and then error the job on the second attempt."""
    app.config["HUEY_TASKS"]["ingest_articles"]["retries"] = 1

    fu = models.FileUpload()
    fu.validated("doaj")
    fu.save()

    job = models.BackgroundJob()
    job.params = {
        "ingest_articles__file_upload_id": fu.id,
        "ingest_articles__attempts": 0,
    }
    job.save(blocking=True)

    # this assumes that huey is in always eager mode, and thus this immediately calls
    # the async task, which in turn calls execute, which ultimately calls run.
    # First attempt bumps the counter and leaves the job in flight; the second
    # attempt exhausts the configured retries and errors the job.
    for expected_attempts, expected_status in ((1, "processing"), (2, "error")):
        with self.assertRaises(RetryException):
            ingestarticles.IngestArticlesBackgroundTask.submit(job)
        job = models.BackgroundJob.pull(job.id)
        assert job.params.get("ingest_articles__attempts") == expected_attempts
        assert job.status == expected_status
# NOTE(review): Python 2 code (print statement below); fragment of a larger
# migration/ingest script - the except block appears to continue past this chunk.
# Walk the collected last-modified timestamps oldest-first, creating a
# FileUpload record for each file registered under each timestamp.
lastmods = list(set(lastmods))  # de-duplicate before sorting
lastmods.sort()
for lm in lastmods:
    for obj in lookup[lm]:
        attempted += 1
        publisher = obj["publisher"]
        filename = obj["filename"]
        id = obj["id"]  # NOTE(review): shadows the builtin `id`
        f = id + ".xml"
        xml_file = os.path.join(xml_dir, f)
        # render the file's timestamp as ISO8601 with a trailing "Z";
        # fromtimestamp gives local time, so this assumes the host runs UTC - confirm
        uploaded = datetime.fromtimestamp(lm).strftime("%Y-%m-%dT%H:%M:%SZ")
        upload = models.FileUpload()
        upload.set_schema(xwalk.format_name)
        upload.upload(publisher, filename)
        upload.set_created(uploaded)
        upload.set_id()
        # now try and parse the file
        doc = None
        try:
            doc = etree.parse(open(xml_file))
        except:
            # unparseable XML: count it, log it to the malformed-files CSV,
            # and mark the upload record as failed
            failed += 1
            print f, "Malformed XML"
            malformed_writer.writerow(
                [f, publisher, filename, uploaded, acc.email])
            upload.failed("Unable to parse file")
def _url_upload(cls, username, url, schema, previous):
    """Record an upload-by-reference and verify the referenced file is retrievable.

    Creates a FileUpload record for the given URL, then checks the URL is
    reachable over HTTP(S) or FTP.  On success the record is marked as
    existing, saved, inserted at the front of `previous`, and its id returned.
    On any failure the record is marked failed, saved, inserted into
    `previous`, and a BackgroundException is raised.

    :param username: owner of the upload
    :param url: remote location of the file (http, https or ftp)
    :param schema: claimed article schema; correctness is checked later
    :param previous: list of prior FileUpload records (mutated in place)
    :return: the id of the new FileUpload record
    :raises BackgroundException: if the URL cannot be accessed
    """
    # first define a few functions
    def __http_upload(record, previous, url):
        # first thing to try is a head request, supporting redirects
        head = requests.head(url, allow_redirects=True)
        if head.status_code == requests.codes.ok:
            return __ok(record, previous)

        # if we get to here, the head request failed. This might be because the file
        # isn't there, but it might also be that the server doesn't support HEAD (a lot
        # of webapps [including this one] don't implement it)
        #
        # so we do an interruptable get request instead, so we don't download too much
        # unnecessary content
        get = requests.get(url, stream=True)
        get.close()  # status/headers are available without reading the body
        if get.status_code == requests.codes.ok:
            return __ok(record, previous)
        return __fail(
            record, previous,
            error='error while checking submitted file reference: {0}'.format(get.status_code))

    def __ftp_upload(record, previous, parsed_url):
        # 1. find out whether the file exists
        # 2. that's it, return OK
        # We might as well check if the file exists using the SIZE command.
        # If the FTP server does not support SIZE, our article ingestion
        # script is going to refuse to process the file anyway, so might as
        # well get a failure now.
        # Also it's more of a faff to check file existence using LIST commands.
        f = None
        try:
            f = ftplib.FTP(parsed_url.hostname, parsed_url.username, parsed_url.password)
            # SIZE is not usually allowed in ASCII mode, so set to binary mode
            r = f.sendcmd('TYPE I')
            if not r.startswith('2'):
                return __fail(
                    record, previous, error='could not set binary '
                    'mode in target FTP server while checking file exists')
            if f.size(parsed_url.path) < 0:
                # this will either raise an error which will get caught below
                # or, very rarely, will return an invalid size
                return __fail(
                    record, previous,
                    error='file does not seem to exist on FTP server')
        except BackgroundException:
            # FIX: __fail raises BackgroundException; previously the generic
            # handler below swallowed it and failed/saved the record twice
            raise
        except Exception as e:
            return __fail(record, previous,
                          error='error during FTP file existence check: ' + str(e.args))
        finally:
            # FIX: the original never closed the FTP connection, leaking it
            if f is not None:
                try:
                    f.quit()
                except Exception:
                    pass
        return __ok(record, previous)

    def __ok(record, previous):
        # mark the record as existing, persist it, expose it to the caller
        record.exists()
        record.save()
        previous.insert(0, record)
        return record.id

    def __fail(record, previous, error):
        # mark the record as failed, persist it, then abort
        message = 'The URL could not be accessed; ' + error
        record.failed(message)
        record.save()
        previous.insert(0, record)
        raise BackgroundException(message)

    # prep a record to go into the index, to record this upload. The filename is the url
    record = models.FileUpload()
    record.upload(username, url)
    record.set_id()
    record.set_schema(schema)  # although it could be wrong, this will get checked later

    # now we attempt to verify that the file is retrievable
    try:
        # first, determine if ftp or http
        parsed_url = urlparse(url)
        if parsed_url.scheme in ['http', "https"]:
            return __http_upload(record, previous, url)
        elif parsed_url.scheme == 'ftp':
            return __ftp_upload(record, previous, parsed_url)
        else:
            return __fail(
                record, previous,
                error='unsupported URL scheme "{0}". Only HTTP(s) and FTP are supported.'.format(parsed_url.scheme))
    except BackgroundException:
        raise
    except Exception as e:
        # FIX: e.message does not exist on Python 3 exceptions; use str(e)
        return __fail(record, previous,
                      error="please check it before submitting again; " + str(e))
def _file_upload(cls, username, f, schema, previous):
    """Store an uploaded file on disk, record it in the index, and validate it.

    Writes the incoming file to UPLOAD_DIR, saves a FileUpload index record,
    then validates the file against the crosswalk for the claimed schema.
    On success the record is marked validated and its id returned; on any
    failure the file is moved/cleaned up via file_failed, the record is
    marked failed, and a BackgroundException is raised.

    :param username: owner of the upload
    :param f: incoming file object (must provide .filename and .save(path))
    :param schema: claimed schema name, used to pick the crosswalk
    :param previous: list of prior FileUpload records (mutated in place)
    :return: the id of the new FileUpload record
    :raises BackgroundException: on write or validation failure
    """
    # prep a record to go into the index, to record this upload
    record = models.FileUpload()
    record.upload(username, f.filename)
    record.set_id()

    # the file path that we are going to write to
    xml = os.path.join(app.config.get("UPLOAD_DIR", "."), record.local_filename)

    # it's critical here that no errors cause files to get left behind unrecorded
    try:
        # write the incoming file out to the XML file
        f.save(xml)
        # save the index entry
        record.save()
    except Exception:  # FIX: bare except also trapped SystemExit/KeyboardInterrupt
        # if we can't record either of these things, we need to back right off
        try:
            file_failed(xml)
        except Exception:
            pass
        try:
            record.delete()
        except Exception:
            pass
        raise BackgroundException(
            "Failed to upload file - please contact an administrator")

    # load the crosswalk registered for the claimed schema; the claim itself
    # is checked by validate_file below
    xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(schema)
    xwalk = plugin.load_class(xwalk_name)()

    # now we have the record in the index and on disk, we can attempt to
    # validate it
    try:
        with open(xml) as handle:
            xwalk.validate_file(handle)
        record.validated(schema)
        record.save()
        previous.insert(0, record)
        return record.id
    except IngestException as e:
        # NOTE(review): assumes IngestException defines .message/.inner_message - confirm
        record.failed(e.message, e.inner_message)
        try:
            file_failed(xml)
        except Exception:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException("Failed to upload file: " + e.message +
                                  "; " + str(e.inner_message))
    except Exception:
        record.failed("File system error when reading file")
        try:
            file_failed(xml)
        except Exception:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException(
            "Failed to upload file - please contact an administrator")
def _file_upload(f, schema, previous):
    """Handle a publisher file upload in the web UI.

    Writes the incoming file to UPLOAD_DIR, records it as a FileUpload, and
    attempts to detect/validate its schema.  Always renders the publisher
    uploadfile template, flashing a success or error message; on failure the
    file is removed and the record marked failed.

    :param f: incoming file object (must provide .filename and .save(path))
    :param schema: hint for schema detection, passed to article.check_schema
    :param previous: list of prior FileUpload records shown on the page
    :return: the rendered 'publisher/uploadfile.html' response
    """
    # prep a record to go into the index, to record this upload
    record = models.FileUpload()
    record.upload(current_user.id, f.filename)
    record.set_id()

    # the file path that we are going to write to
    xml = os.path.join(app.config.get("UPLOAD_DIR", "."), record.local_filename)

    # it's critical here that no errors cause files to get left behind unrecorded
    try:
        # write the incoming file out to the XML file
        f.save(xml)
        # save the index entry
        record.save()
    except Exception:  # FIX: bare except also trapped SystemExit/KeyboardInterrupt
        # if we can't record either of these things, we need to back right off
        try:
            os.remove(xml)
        except Exception:
            pass
        try:
            record.delete()
        except Exception:
            pass
        flash("Failed to upload file - please contact an administrator", "error")
        return render_template('publisher/uploadfile.html', previous=previous)

    # now we have the record in the index and on disk, we can attempt to
    # validate it
    try:
        actual_schema = None
        with open(xml) as handle:
            actual_schema = article.check_schema(handle, schema)
    except Exception:
        # file is a dud, so remove it
        try:
            os.remove(xml)
        except Exception:
            pass
        # if we're unable to validate the file, we should record this as
        # a file error.
        record.failed("Unable to parse file")
        record.save()
        previous = [record] + previous
        flash("Failed to parse file - it is invalid XML; please fix it before attempting to upload again.", "error")
        return render_template('publisher/uploadfile.html', previous=previous)

    if actual_schema:
        record.validated(actual_schema)
        record.save()
        previous = [record] + previous  # add the new record to the previous records
        flash("File successfully uploaded - it will be processed shortly", "success")
        return render_template('publisher/uploadfile.html', previous=previous)
    else:
        record.failed("File could not be validated against a known schema")
        record.save()
        # FIX: this removal was unprotected, unlike every other os.remove in
        # this function - an OS error here would raise instead of rendering
        # the error page to the user
        try:
            os.remove(xml)
        except Exception:
            pass
        previous = [record] + previous
        flash("File could not be validated against a known schema; please fix this before attempting to upload again", "error")
        return render_template('publisher/uploadfile.html', previous=previous)