def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open a tabular file of unknown type and return a matching table set.

    The format is selected from the MIME type (e.g. ``mimetype='text/csv'``)
    or from the file extension (e.g. ``extension='tsv'``). When no MIME type
    is supplied it is detected with ``magic`` from the start of the stream.
    The recognized MIME types and extensions are the ones listed in the
    branches below (ZIP, CSV, TSV, XLS and XLSX).

    :param fileobj: file-like object holding the data.
    :param mimetype: MIME type of the data, or ``None`` to autodetect it.
    :param extension: file extension without a leading dot; compared
        case-insensitively.
    :returns: the result of the selected ``*TableSet.from_fileobj`` call.
    :raises ValueError: if neither the MIME type nor the extension is
        recognized.
    """
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Normalize once instead of lower-casing the extension in every branch.
    ext = extension.lower() if extension else None

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or ext == 'zip':
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or ext == 'csv':
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or ext == 'tsv':
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or ext == 'xls':
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    ) or ext == 'xlsx':
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report the most specific piece of information we have.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)

    if extension:
        raise ValueError("Could not determine MIME type and "
                         + "unrecognized extension: " + extension)

    raise ValueError("Could not determine MIME type and no extension given.")
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open a tabular file of unknown type and return a matching table set.

    The format is selected from the MIME type (e.g. ``mimetype='text/csv'``)
    or from the file extension (e.g. ``extension='tsv'``). When no MIME type
    is supplied it is detected with ``magic`` from the start of the stream.
    The recognized MIME types and extensions are the ones listed in the
    branches below (ZIP, CSV, TSV, XLS and XLSX).

    :param fileobj: file-like object holding the data.
    :param mimetype: MIME type of the data, or ``None`` to autodetect it.
    :param extension: file extension without a leading dot; compared
        case-insensitively.
    :returns: the result of the selected ``*TableSet.from_fileobj`` call.
    :raises ValueError: if neither the MIME type nor the extension is
        recognized.
    """
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Normalize once instead of lower-casing the extension in every branch.
    ext = extension.lower() if extension else None

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or ext == 'zip':
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or ext == 'csv':
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or ext == 'tsv':
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') or ext == 'xls':
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    ) or ext == 'xlsx':
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report the most specific piece of information we have.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)

    if extension:
        raise ValueError("Could not determine MIME type and "
                         + "unrecognized extension: " + extension)

    raise ValueError("Could not determine MIME type and no extension given.")
def webstorer_upload(context, data):
    """Download a resource, parse it as a table and upload it to the webstore.

    Steps, in order: download the resource, parse the first table of the
    saved file (XLS or CSV), verify that the webstore has no tables for this
    resource yet, POST all rows as JSON, then update the CKAN resource with
    the webstore URL and a timestamp.

    :param context: JSON string; the decoded object is read for the keys
        ``webstore_url``, ``username``, ``apikey`` and ``site_url``.
    :param data: JSON string describing the resource; the decoded object is
        read for the keys ``format`` and ``id``.
    :raises WebstorerError: if the webstore already holds data for the
        resource, if the webstore POST does not answer 201, or if the CKAN
        update does not answer 200/201.
    """
    context = json.loads(context)
    resource = json.loads(data)

    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    # Read every row while the file is open; the with-block guarantees the
    # handle is closed afterwards, also when parsing raises.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            table_sets = CSVTableSet.from_fileobj(f)

        ##only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        types = guess_types(list(row_set.dicts(sample=True)))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(types_processor(types))

        rows = list(row_set.dicts())

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id'])

    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response,
                             webstore_request_url + '.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(
        webstore_request_url + '/data',
        data=json.dumps(rows),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        raise WebstorerError(
            'Websore bad response code (%s). Response was %s'
            % (response.status_code, response.content)
        )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )

    if response.status_code not in (201, 200):
        raise WebstorerError(
            'Ckan bad response code (%s). Response was %s'
            % (response.status_code, response.content)
        )
def _datastorer_upload(context, resource):
    """Download a resource, parse it as a table and bulk-load it into CKAN.

    The first table of the downloaded file (XLS or CSV) is streamed to
    ``<site_url>/api/data/<resource id>/_bulk`` as newline-separated JSON
    (an ``index`` action line followed by the row itself), after which the
    CKAN resource is updated with the data URL and a timestamp.

    :param context: dict read for the keys ``site_url`` and ``apikey``.
    :param resource: dict read for the keys ``format`` and ``id``.
    :raises WebstorerError: if the CKAN resource update does not answer
        200/201.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    # Rows are streamed from the file while uploading, so the whole upload
    # loop runs inside the with-block; the handle is closed afterwards even
    # when parsing or a request raises.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            table_sets = CSVTableSet.from_fileobj(f)

        ##only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        types = guess_types(list(row_set.dicts(sample=True)))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(types_processor(types))

        ckan_url = context['site_url'].rstrip('/')
        webstore_request_url = '%s/api/data/%s/' % (ckan_url, resource['id'])

        def send_request(data):
            # The bulk body is one JSON document per line, newline-terminated.
            return requests.post(
                webstore_request_url + '_bulk',
                data="%s%s" % ("\n".join(data), "\n"),
                headers={'Content-Type': 'application/json',
                         'Authorization': context['apikey']},
            )

        data = []
        for count, dict_ in enumerate(row_set.dicts()):
            data.append(json.dumps({"index": {"_id": count + 1}}))
            data.append(json.dumps(dict_))
            # NOTE(review): count starts at 0, so the first flush carries a
            # single row and later flushes carry 100 - confirm whether that
            # cadence is intended before changing it.
            if (count % 100) == 0:
                response = send_request(data)
                check_response_and_retry(response,
                                         webstore_request_url + '_mapping')
                data[:] = []

        if data:
            # Check the response of *this* request (the original assigned it
            # to a misspelled name and re-checked the previous response).
            response = send_request(data)
            check_response_and_retry(response,
                                     webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )

    if response.status_code not in (201, 200):
        raise WebstorerError(
            'Ckan bad response code (%s). Response was %s'
            % (response.status_code, response.content)
        )
def _datastorer_upload(context, resource, logger):
    """Download a resource, parse it as a table and load it into the datastore.

    The first table of the downloaded file (XLS, TSV or CSV) is type-guessed
    and sent to ``<site_url>/api/action/datastore_create`` in chunks of at
    most 100 records; afterwards the CKAN resource is updated via
    ``resource_update``.

    :param context: dict read for the keys ``site_url`` and ``apikey``.
    :param resource: dict read for the keys ``format``, ``id`` and ``url``.
    :param logger: object whose ``info`` method receives progress messages;
        it is also passed on to ``check_response_and_retry``.
    :raises DatastorerException: if the CKAN resource update does not answer
        200/201.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    # Rows are streamed from the file while uploading, so the whole upload
    # loop runs inside the with-block; the handle is closed afterwards even
    # when parsing or a request raises.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            is_tsv = (content_type in tsv_types or
                      resource['format'] in tsv_types)
            delimiter = '\t' if is_tsv else ','
            table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

        ##only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        ckan_url = context['site_url'].rstrip('/')

        datastore_create_request_url = \
            '%s/api/action/datastore_create' % (ckan_url)

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

        # The field list is the same for every chunk, so build it once.
        fields = [dict(id=name, type=typename)
                  for name, typename in zip(headers, guessed_type_names)]

        def send_request(data):
            request = {'resource_id': resource['id'],
                       'fields': fields,
                       'records': data}
            response = requests.post(
                datastore_create_request_url,
                data=json.dumps(request),
                headers={'Content-Type': 'application/json',
                         'Authorization': context['apikey']},
            )
            check_response_and_retry(response, datastore_create_request_url,
                                     logger)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                # A list comprehension instead of itertools.imap, which only
                # exists on Python 2; the resulting chunk is identical.
                chunk = [dict(row) for row in itertools.islice(it, n)]
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s'
            % (response.status_code, response.content))