# Shared imports for the functions below. These functions come from
# several modules; GeneralAPI and LinkEntityGeneration are project
# internal imports (module paths omitted here).
import os
import csv
import json
import requests
from time import sleep
from unidecode import unidecode
from django.http import HttpResponse


def proxy_header(request, target_url):
    """ Proxy a HEAD request so as to get around CORS issues
        for displaying PDFs with javascript and other needs
    """
    gapi = GeneralAPI()
    ok = True
    status_code = 404
    try:
        r = requests.head(target_url, headers=gapi.client_headers)
        status_code = r.status_code
        r.raise_for_status()
        output = {'status': status_code, 'url': target_url}
        if 'Content-Length' in r.headers:
            output['Content-Length'] = int(float(r.headers['Content-Length']))
        if 'Content-Type' in r.headers:
            output['Content-Type'] = r.headers['Content-Type']
    except Exception:
        ok = False
        content = target_url + ' ' + str(status_code)
    if ok:
        json_output = json.dumps(output, indent=4, ensure_ascii=False)
        return HttpResponse(json_output,
                            content_type='application/json; charset=utf8')
    else:
        return HttpResponse('Fail with HTTP status: ' + str(content),
                            status=status_code,
                            content_type='text/plain')
def update_metadata(self, deposition_id, metadata_dict):
    """ updates metadata for a deposition """
    output = None
    gapi = GeneralAPI()
    headers = gapi.client_headers
    headers['Content-Type'] = 'application/json'
    deposition_id = str(deposition_id)
    url = self.url_prefix + '/api/deposit/depositions/%s' % deposition_id
    data = {'metadata': metadata_dict}
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    r = None
    try:
        r = requests.put(url,
                         timeout=240,
                         headers=headers,
                         params={'access_token': self.ACCESS_TOKEN},
                         data=json.dumps(data))
        r.raise_for_status()
        output = r.status_code
    except Exception:
        output = False
        # guard against r being unbound if the request itself raised
        if r is not None:
            print('FAIL to update metadata with status code: ' + str(r.status_code))
            print(str(r.json()))
    return output
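# Example (hypothetical values) of a metadata_dict that could be passed
# to update_metadata() above. The field names follow Zenodo's documented
# deposit metadata; treat this as a sketch, not a complete schema:
#
# metadata_dict = {
#     'title': 'Example Archaeological Dataset',
#     'upload_type': 'dataset',
#     'description': 'An example description of the deposited files.',
#     'creators': [
#         {'name': 'Doe, Jane', 'affiliation': 'Example University'},
#     ],
#     'keywords': ['archaeology', 'open data'],
# }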
def create_empty_deposition(self):
    """ makes a new empty deposition container to receive
        files and metadata
    """
    output = None
    gapi = GeneralAPI()
    headers = gapi.client_headers
    headers['Content-Type'] = 'application/json'
    url = self.url_prefix + '/api/deposit/depositions'
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    r = None
    try:
        r = requests.post(url,
                          timeout=240,
                          headers=headers,
                          params={'access_token': self.ACCESS_TOKEN},
                          json={})
        r.raise_for_status()
        print('Status code: ' + str(r.status_code))
        output = r.json()
    except Exception:
        output = False
        # guard against r being unbound if the request itself raised
        if r is not None:
            print('FAIL with Status code: ' + str(r.status_code))
            print(str(r.json()))
    return output
def get_deposition_meta_by_id(self, deposition_id):
    """ gets a deposition metadata object via a request
        for a JSON object from Zenodo
    """
    gapi = GeneralAPI()
    headers = gapi.client_headers
    headers['Content-Type'] = 'application/json'
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    deposition_id = str(deposition_id)
    url = self.url_prefix + '/api/deposit/depositions/%s' % deposition_id
    r = None
    try:
        r = requests.get(url,
                         timeout=240,
                         headers=headers,
                         params={'access_token': self.ACCESS_TOKEN})
        r.raise_for_status()
        output = r.json()
    except Exception:
        output = False
        # guard against r being unbound if the request itself raised
        if r is not None:
            print('FAIL with Status code: ' + str(r.status_code))
            print(str(r.json()))
        print('URL: ' + url)
    return output
def proxy(request, target_url):
    """ Proxy request so as to get around CORS issues
        for displaying PDFs with javascript and other needs
    """
    gapi = GeneralAPI()
    if 'https:' in target_url:
        target_url = target_url.replace('https:', 'http:')
    if 'http://' not in target_url:
        # restore the double slash that gets collapsed to a single
        # slash when the URL is passed as a path parameter
        target_url = target_url.replace('http:/', 'http://')
    ok = True
    status_code = 404
    print('Try to see: ' + target_url)
    try:
        r = requests.get(target_url,
                         timeout=240,
                         headers=gapi.client_headers)
        status_code = r.status_code
        r.raise_for_status()
    except Exception:
        ok = False
        content = target_url + ' ' + str(status_code)
    if ok:
        status_code = r.status_code
        mimetype = r.headers['Content-Type']
        content = r.content
        return HttpResponse(content,
                            status=status_code,
                            content_type=mimetype)
    else:
        return HttpResponse('Fail with HTTP status: ' + str(content),
                            status=status_code,
                            content_type='text/plain')
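# A minimal sketch of how these proxy views might be wired up in a
# Django urls.py. The URL patterns and module path are assumptions,
# not taken from this project's actual configuration:
#
# from django.urls import re_path
# from entities import views  # hypothetical module path
#
# urlpatterns = [
#     # '.+' keeps slashes inside the captured target_url; proxy() then
#     # repairs the '//' that intermediaries often collapse to '/'
#     re_path(r'^proxy/(?P<target_url>.+)$', views.proxy),
#     re_path(r'^proxy-header/(?P<target_url>.+)$', views.proxy_header),
# ]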
def get_json_for_geonames_uri(self, geonames_uri):
    """ gets json data from a geonames_uri """
    le_gen = LinkEntityGeneration()
    # strip off any cruft in the URI
    geonames_uri = le_gen.make_clean_uri(geonames_uri)
    geo_ex = geonames_uri.split('/')
    geonames_id = geo_ex[-1]
    url = self.json_base_url + str(geonames_id)
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        self.request_url = r.url
        json_r = r.json()
    except Exception:
        json_r = False
    self.json_data = json_r
    return self.json_data
def upload_file_by_put(self, bucket_url, filename, full_path_file):
    """ uploads a file of filename, stored at full_path_file
        into a Zenodo deposit at location bucket_url
    """
    output = None
    if not os.path.exists(full_path_file):
        # can't find the file to upload!
        output = False
    else:
        # we found the file to upload
        if self.delay_before_request > 0:
            # default to sleep BEFORE a request is sent, to
            # give the remote service a break.
            sleep(self.delay_before_request)
        url = bucket_url + '/' + filename
        r = None
        try:
            # for bigger files, use this PUT method
            # Adapted from: https://github.com/zenodo/zenodo/issues/833#issuecomment-324760423
            gapi = GeneralAPI()
            headers = gapi.client_headers
            headers['Accept'] = 'application/json'
            headers['Authorization'] = 'Bearer ' + self.ACCESS_TOKEN
            headers['Content-Type'] = 'application/octet-stream'
            # open the file once, in a with block so it always closes
            with open(full_path_file, 'rb') as f:
                r = requests.put(url, headers=headers, data=f)
            r.raise_for_status()
            output = r.json()
        except Exception:
            output = False
            # guard against r being unbound if the request itself raised
            if r is not None:
                print('FAIL with Status code: ' + str(r.status_code))
                print(str(r.json()))
            print('URL: ' + url)
    return output
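# A sketch of the full deposit workflow built from the methods above.
# The instance and class name ('za', 'ZenodoClient') are hypothetical;
# the 'links'/'bucket' key comes from the deposition JSON that Zenodo
# returns (see the GitHub issue cited above):
#
# za = ZenodoClient()  # hypothetical class defining these methods
# deposition = za.create_empty_deposition()
# if deposition:
#     bucket_url = deposition['links']['bucket']
#     za.upload_file_by_put(bucket_url, 'data.csv', '/path/to/data.csv')
#     za.update_metadata(deposition['id'], metadata_dict)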
def get_list_records(self, url, resumption_token=None):
    """ gets OAI-PMH list records, with an optional resumption_token """
    params = None
    if 'verb=ListRecords' not in url:
        params = {'verb': 'ListRecords'}
    if isinstance(resumption_token, str):
        # append directly to the URL to avoid URL encoding
        # the resumption token
        if '?' in url:
            url += '&resumptionToken=' + resumption_token
        else:
            url += '?resumptionToken=' + resumption_token
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    url_content = None
    try:
        gapi = GeneralAPI()
        r = requests.get(url,
                         params=params,  # requests ignores params=None
                         timeout=240,
                         headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        url_content = r.content
    except Exception:
        self.request_error = True
        url_content = None
    return url_content
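# A sketch of a harvesting loop built on get_list_records(), paging
# through a repository with resumption tokens. The endpoint URL and the
# 'harvester' instance are hypothetical; the namespace is the standard
# OAI-PMH 2.0 namespace:
#
# from xml.etree import ElementTree
#
# OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
# url = 'https://example.org/oai'  # hypothetical endpoint
# token = None
# while True:
#     xml_content = harvester.get_list_records(url, token)
#     if not xml_content:
#         break  # request failed; get_list_records set request_error
#     tree = ElementTree.fromstring(xml_content)
#     # ... process tree.iter(OAI_NS + 'record') here ...
#     token_el = tree.find('.//' + OAI_NS + 'resumptionToken')
#     token = token_el.text if token_el is not None else None
#     if not token:
#         break  # an empty or missing token ends the list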
def cache_page_locally(self, url, payload, act_dir, filename):
    """ caches content of a page locally if successfully downloaded """
    ok = False
    if url not in self.fail_urls:
        if self.delay_before_request > 0:
            # default to sleep BEFORE a request is sent, to
            # give the remote service a break.
            sleep(self.delay_before_request)
        file_path = self.define_import_directory_file(act_dir, filename)
        try:
            gapi = GeneralAPI()
            r = requests.get(url,
                             params=payload,
                             timeout=240,
                             headers=gapi.client_headers)
            self.request_url = r.url
            r.encoding = 'utf-8'
            r.raise_for_status()
            content = r.text
            saved = False
            print('Working in: ' + self.current_location)
            print('Attempting to save: ' + url)
            try:
                # first attempt: save the content as utf-8 text
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                saved = True
            except Exception as e:
                print('Save fail: ' + str(e))
                saved = False
            if saved is False:
                # second attempt: transliterate to ASCII with unidecode
                # to drop characters that blocked the first save
                content = unidecode(content)
                try:
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(content)
                    saved = True
                except Exception as e:
                    print('Save fail attempt 2: ' + str(e))
                    saved = False
            if saved:
                ok = True
            else:
                print('CANNOT SAVE: ' + file_path)
                self.save_fails.append(url)
                self.save_as_json_file(act_dir, self.save_fail_file, self.save_fails)
                ok = False
        except Exception:
            ok = False
            self.fail_urls.append(url)
            self.save_as_json_file(act_dir, self.fail_url_file, self.fail_urls)
    return ok
def upload_file_by_post(self, deposition_id, filename, full_path_file,
                        ok_if_exists=True):
    """ uploads a file of filename, stored at full_path_file
        into a Zenodo deposit with deposition_id
        will respond with an OK if it already exists

        This works by POST and is NOT the preferred method
    """
    output = None
    gapi = GeneralAPI()
    headers = gapi.client_headers
    # note: do NOT set a JSON Content-Type here; requests needs to set
    # the multipart form boundary itself for the file upload
    if not os.path.exists(full_path_file):
        # can't find the file to upload!
        output = False
    else:
        # we found the file to upload
        if self.delay_before_request > 0:
            # default to sleep BEFORE a request is sent, to
            # give the remote service a break.
            sleep(self.delay_before_request)
        data = {'filename': filename}
        deposition_id = str(deposition_id)
        url = self.url_prefix + '/api/deposit/depositions/%s/files' % deposition_id
        r = None
        try:
            # for bigger files, this will not work routinely
            # See fix at: https://github.com/zenodo/zenodo/issues/833
            with open(full_path_file, 'rb') as f:
                # stream the upload of the files, which can be really big!
                files = {'file': f}
                r = requests.post(url,
                                  timeout=240,
                                  headers=headers,
                                  params={'access_token': self.ACCESS_TOKEN},
                                  data=data,
                                  files=files)
            r.raise_for_status()
            output = r.json()
        except Exception:
            output = False
            # guard against r being unbound if the request itself raised
            if r is not None:
                r_json = r.json()
                if ok_if_exists and r_json.get('message') == 'Filename already exists.':
                    print('File already exists, with status code: ' + str(r.status_code))
                    output = True
                if output is False:
                    # some other reason for failure
                    print('FAIL with Status code: ' + str(r.status_code))
                    print(str(r_json))
                    print('URL: ' + url)
    return output
def search_admin_entity(self,
                        q_str,
                        admin_level=0,
                        username='******',
                        lat=None,
                        lon=None,
                        degree_dif=.5):
    """ searches for an entity of a given administrative
        type associated with a given q_str
    """
    output = None
    all_params = {}
    all_params['q'] = q_str
    all_params['username'] = username
    all_params['maxRows'] = 1
    if isinstance(lat, float) \
       and isinstance(lon, float) \
       and isinstance(degree_dif, float):
        # limit the search to a bounding box around the point;
        # east is the larger longitude bound, west the smaller
        all_params['east'] = lon + degree_dif
        all_params['west'] = lon - degree_dif
        all_params['south'] = lat - degree_dif
        all_params['north'] = lat + degree_dif
    if admin_level == 0:
        fcodes = ['PCLI', 'OCN']
    elif admin_level == 1:
        fcodes = ['ADM1']
    elif admin_level == 2:
        fcodes = ['ADM2']
    else:
        fcodes = [None]
    for fcode in fcodes:
        # copy, so an fcode from one iteration does not leak
        # into the next
        params = all_params.copy()
        if isinstance(fcode, str):
            params['fcode'] = fcode
        if self.delay_before_request > 0:
            # default to sleep BEFORE a request is sent, to
            # give the remote service a break.
            sleep(self.delay_before_request)
        try:
            gapi = GeneralAPI()
            r = requests.get(self.SEARCH_BASE_URL,
                             params=params,
                             timeout=10,
                             headers=gapi.client_headers)
            r.raise_for_status()
            # print('Checking: ' + r.url)
            json_r = r.json()
        except Exception:
            json_r = False
        if json_r is not False:
            output = json_r
            break
    return output
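# Usage sketch for search_admin_entity(). The 'geo_api' instance, the
# username, and the coordinates are hypothetical; the default
# degree_dif of 0.5 yields a roughly 1-degree bounding box around the
# supplied point:
#
# result = geo_api.search_admin_entity(
#     'Tuscany',
#     admin_level=1,         # ADM1: first-order administrative region
#     username='demo_user',  # hypothetical GeoNames account
#     lat=43.35,
#     lon=11.0,
# )
# if result and result.get('geonames'):
#     best_match = result['geonames'][0]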
def get_turtle_text(self, url):
    """ gets the turtle manifest as a string """
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        turtle = r.text
    except Exception:
        print('Failed to get ' + url)
        turtle = False
    return turtle
def request_json_str(self, url):
    """ requests JSON as a string from a URL """
    json_output = None
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        json_obj = r.json()
    except Exception:
        json_obj = False
    if isinstance(json_obj, dict):
        json_output = json.dumps(json_obj, indent=4, ensure_ascii=False)
    return json_output
def get_periodo_data(self):
    """ gets json-ld data from Periodo """
    url = self.data_url
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        json_r = False
    self.periodo_data = json_r
    return json_r
def get_unit_json(self, unit_id):
    """ gets json data for a record identified by unit_id """
    url = self.base_json_url + unit_id
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        self.request_error = True
        json_r = False
    return json_r
def get_remote_text_from_url(self, url):
    """ gets remote text content from a URL """
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        text = r.text
    except Exception:
        self.request_error = True
        text = False
    return text
def get_read_csv(self, url):
    """ gets and parses CSV data from a URL """
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        csvfile = r.text.split('\n')
        self.csv_data = csv.reader(csvfile)
    except Exception:
        self.csv_data = False
    return self.csv_data
def get_search_json(self, url):
    """ gets json data from Open Context
        in response to a keyword search
    """
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        self.request_error = True
        json_r = False
    return json_r
def get_search_html(self, url):
    """ Gets HTML from Open Context at a URL, only to check the
        response status; the content itself is discarded.
    """
    gapi = GeneralAPI()
    headers = gapi.client_headers
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        r = requests.get(url, timeout=240, headers=headers)
        r.raise_for_status()
        ok = True
    except Exception:
        ok = False
    return ok
def get_save_legacy_csv(self, table_id):
    """ gets and saves the legacy csv files from open context """
    sleep(self.delay_before_request)
    dir_file = self.set_check_directory(self.table_dir) + table_id + '.csv'
    url = self.LEGACY_TAB_BASE_URI + table_id + '.csv'
    print('Working on: ' + url)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=480, headers=gapi.client_headers)
        r.raise_for_status()
        text = r.text
    except Exception:
        text = False
    if text is not False:
        with open(dir_file, 'w', encoding='utf-8') as f:
            f.write(text)
def get_search_json(self, url):
    """ Gets json data from the Open Context search API """
    gapi = GeneralAPI()
    headers = gapi.client_headers
    headers['accept'] = 'application/json'
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        r = requests.get(url, timeout=240, headers=headers)
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        self.request_errors.append(url)
        json_r = False
    return json_r
def get_keyword_search_json(self, keyword):
    """ gets json data from Arachne in response to a keyword search """
    payload = {'q': keyword}
    if self.filter_by_images:
        payload['fq'] = 'facet_image:ja'
    url = self.DEFAULT_API_BASE_URL
    try:
        gapi = GeneralAPI()
        r = requests.get(url,
                         params=payload,
                         timeout=240,
                         headers=gapi.client_headers)
        self.set_arachne_search_urls(r.url)
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        json_r = False
    self.arachne_json_r = json_r
    return json_r
def get_basic_json_from_uri(self, orcid_uri):
    """ gets json data from the ORCID URI """
    url = self.make_orcid_api_url(orcid_uri)
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        headers = gapi.client_headers
        headers['Accept'] = 'application/json'
        r = requests.get(url, timeout=240, headers=headers)
        r.raise_for_status()
        self.response_headers = r.headers
        self.request_url = r.url
        json_r = r.json()
    except Exception:
        json_r = False
    self.json_data = json_r
    return self.json_data
def get_jsonld_for_uri(self, uri):
    """ gets json-ld data from the OCHRE URI """
    le_gen = LinkEntityGeneration()
    # strip off any cruft in the URI
    uri = le_gen.make_clean_uri(uri)
    url = uri + '.jsonld'
    self.request_url = url
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    try:
        gapi = GeneralAPI()
        r = requests.get(url, timeout=240, headers=gapi.client_headers)
        r.raise_for_status()
        self.request_url = r.url
        json_r = r.json()
    except Exception:
        json_r = False
    self.json_data = json_r
    return self.json_data
def get_keyword_search_json(self, keyword, keyword_type):
    """ gets json data from tDAR in response to a keyword search """
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    payload = {'term': keyword, 'keywordType': keyword_type}
    url = self.KEYWORD_API_BASE_URL
    try:
        gapi = GeneralAPI()
        r = requests.get(url,
                         params=payload,
                         timeout=240,
                         headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        json_r = r.json()
    except Exception:
        self.request_error = True
        json_r = False
    return json_r
def get_trench_book_index(self):
    """ gets the trench book index """
    content = None
    if self.delay_before_request > 0:
        # default to sleep BEFORE a request is sent, to
        # give the remote service a break.
        sleep(self.delay_before_request)
    payload = {'oc': True}
    url = self.trench_book_index_url
    try:
        gapi = GeneralAPI()
        r = requests.get(url,
                         params=payload,
                         timeout=240,
                         headers=gapi.client_headers)
        self.request_url = r.url
        r.raise_for_status()
        content = r.text
    except Exception:
        self.request_error = True
        # fall back to a locally cached copy of the index
        content = self.get_trench_book_index_from_file()
    return content
def get_arachne_json(self, payload):
    """ executes a search for json data from arachne """
    if isinstance(payload, dict):
        if self.filter_by_images:
            payload['fq'] = 'facet_image:"ja"'
        url = self.DEFAULT_API_BASE_URL
        try:
            gapi = GeneralAPI()
            r = requests.get(url,
                             params=payload,
                             timeout=240,
                             headers=gapi.client_headers)
            print('r url: ' + r.url)
            self.set_arachne_search_urls(r.url)
            r.raise_for_status()
            json_r = r.json()
        except Exception:
            json_r = False
    else:
        json_r = False
    self.arachne_json_r = json_r
    return json_r
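# A sketch of a payload for get_arachne_json(). Only 'q' and 'fq' are
# used elsewhere in this code; 'offset' and 'limit' are assumptions
# about Arachne's search API, not confirmed here:
#
# payload = {
#     'q': 'Pompeii',
#     'offset': 0,   # assumption: paging parameter
#     'limit': 50,   # assumption: page-size parameter
# }
# json_r = arachne_api.get_arachne_json(payload)  # hypothetical instance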