def d3m_load_resource(self, path):
    """Open a tabular data file (csv/tsv/xls) and return a pandas DataFrame.

    Guarantees a 'd3mIndex' column, inserting one (0..n-1) if absent.
    Returns ok_resp(DataFrame) or err_resp(message).
    """
    print('-- d3m_load_resource --', path)
    path_ext = splitext(path.lower())[1]

    try:
        if path_ext == dp_static.EXT_CSV:
            # comma-delimited
            print('csv file')
            data = pd.read_csv(path, low_memory=False)
        elif path_ext in (dp_static.EXT_TSV, dp_static.EXT_TAB):
            # tab-delimited
            print('Tab-delimited')
            data = pd.read_csv(path, delimiter='\t', low_memory=False)
        elif path_ext in (dp_static.EXT_XLS, dp_static.EXT_XLSX):
            # Excel workbook
            print('Excel file')
            data = pd.read_excel(path)
        else:
            return err_resp('File extension not valid: %s' % path_ext)
    except FileNotFoundError as err_obj:
        return err_resp('File not found: %s' % err_obj)
    except pd.errors.ParserError as err_obj:
        return err_resp('Failed to open file: %s' % err_obj)

    # Make sure a d3mIndex column exists, as the first column
    if 'd3mIndex' not in data:
        data.insert(0, 'd3mIndex', range(len(data)))

    return ok_resp(data)
def write_and_clear_behavioral_logs(user, user_workspace):
    """Write out any behavioral log files and delete the user's log
    entries from the database.

    :param user: User whose log entries are cleared (required)
    :param user_workspace: UserWorkspace used for the log write; may be None,
        in which case only the database clearing happens
    :returns: ok_resp/err_resp (err_resp on bad arguments)
    """
    if not isinstance(user, User):
        return err_resp('user was not a User object')

    if user_workspace and not isinstance(user_workspace, UserWorkspace):
        return err_resp('user_workspace was not a UserWorkspace object')

    # Write out any behavioral logs for the workspace
    if user_workspace:
        log_info = LogEntryMaker.write_user_log(user_workspace)
        if log_info.success:
            print('log written: ', log_info.result_obj)
        else:
            # best-effort: log failures are reported but don't abort clearing
            print('log writing failed: ', log_info.err_msg)

    # Clear behavioral logs for the current user
    log_clear = BehavioralLogFormatter.delete_logs_for_user(user)
    if log_clear.success:
        print('\n'.join(log_clear.result_obj))
    else:
        print(log_clear.err_msg)

    # Consistency fix: previously fell off the end (implicit None) on the
    # success path while returning err_resp on failure; return ok_resp
    # like the rest of this module.
    return ok_resp('behavioral logs written and cleared')
def make_search_by_dataset_call(datamart_name, user_workspace_id,
                                dataset_path, query=None, **kwargs):
    """Kick off an async search of the NYU datamart using a dataset file.

    Returns ok_resp immediately; the search runs in a background task.
    """
    # Guard clauses: all three identifiers are required
    required = ((datamart_name, 'datamart_name must be set'),
                (user_workspace_id, 'user_workspace_id must be set'),
                (dataset_path, 'dataset_path must be set'))
    for value, err_msg in required:
        if not value:
            return err_resp(err_msg)

    # Async task to run the search-by-dataset process
    kick_off_search_by_dataset.delay(
        datamart_name, user_workspace_id,
        dataset_path, query=query, **kwargs)

    return ok_resp('augment process started')
def make_new_dataset(user_id, source_dir, writable_output_dir, **kwargs):
    """Build a new dataset from the data files found in source_dir.

    :param user_id: id of the requesting user (required)
    :param source_dir: existing directory holding source data files
    :param writable_output_dir: existing directory where output is written
    :returns: ok_resp/err_resp
    """
    if not user_id:
        # bug fix: message previously said 'user_workspace_id must be set'
        return err_resp('user_id must be set')

    if not isdir(source_dir):
        return err_resp('source_dir not found: %s' % source_dir)

    if not isdir(writable_output_dir):
        return err_resp('writable_output_dir not found: %s' % writable_output_dir)

    # Gather files with recognized data extensions
    source_files = [join(source_dir, x)
                    for x in os.listdir(source_dir)
                    if splitext(x.lower())[1] in dp_static.VALID_EXTENSIONS]

    if not source_files:
        return err_resp(
            f'No source files found in directory: {source_dir}')

    # bug fix: user_id was validated above but a hard-coded `1` was
    # being passed to UserDatasetUtil
    udu = UserDatasetUtil(user_id, source_files, writable_output_dir, **kwargs)
    if udu.has_error():
        return err_resp(udu.error_message)

    return ok_resp('it worked')
def get_d3m_filepath(d3m_config, file_attr):
    """Look up a file path stored on the D3MConfiguration and verify it exists.

    Example config attributes (see D3M_FILE_ATTRIBUTES):
    https://datadrivendiscovery.org/wiki/display/gov/TA2+Configuration+file+syntax
    {
        "problem_schema": "/baseball/problemSchema.json",
        "dataset_schema": "/baseball/data/dataSchema.json",
        "training_data_root": "/baseball/data",
        "pipeline_logs_root": "/outputs/logs",
        "executables_root": "/outputs/executables",
    }

    :returns: ok_resp(filepath) on success, err_resp(err_msg) on failure
    """
    if not d3m_config:
        return err_resp('No D3MConfiguration specified.')

    if file_attr not in D3M_FILE_ATTRIBUTES:
        user_msg = 'unknown file attribute. Use one of %s' % D3M_FILE_ATTRIBUTES
        return err_resp(user_msg)

    # Pull the attribute straight off the model instance
    filepath = d3m_config.__dict__.get(file_attr, '')

    if isfile(filepath):
        return ok_resp(filepath)

    return err_resp('file not found: %s' % filepath)
def get_config_file_contents(d3m_config, config_key, as_dict=True):
    """Read the contents of a file referenced by the config.

    :param d3m_config: D3MConfiguration holding the file path
    :param config_key: one of D3M_FILE_ATTRIBUTES
    :param as_dict: when True (default), parse the contents as JSON
    :returns: ok_resp(dict or str) / err_resp(msg)
    """
    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('d3m_config must be a D3MConfiguration object')

    if config_key not in D3M_FILE_ATTRIBUTES:
        return err_resp('config_key not found!')

    # Resolve the path (also verifies the file exists)
    filepath_info = get_d3m_filepath(d3m_config, config_key)
    if not filepath_info.success:
        return err_resp(filepath_info.err_msg)
    fpath = filepath_info.result_obj

    # Slurp the file
    try:
        with open(fpath, "r") as fh:
            contents = fh.read()
    except IOError as err_obj:
        user_msg = 'Failed to read file: %s\n%s' % \
                   (fpath, err_obj)
        return err_resp(user_msg)

    if not as_dict:
        return ok_resp(contents)

    # Parse the raw text as JSON
    doc_info = json_loads(contents)
    if doc_info.success:
        return ok_resp(doc_info.result_obj)

    return err_resp(doc_info.err_msg)
def create_log_entry(user, entry_type, log_data):
    """Create and save a BehavioralLogEntry object.

    :param user: User the entry is attached to
    :param entry_type: stored as log_data['type'] before validation
    :param log_data: dict of BehavioralLogEntryForm fields
    :returns: ok_resp(BehavioralLogEntry) / err_resp(msg)
    """
    if not isinstance(user, User):
        return err_resp("user must be a User object")

    if not isinstance(log_data, dict):
        return err_resp("log_data must be a dict object")

    # bug fix: copy before adding 'type' so the caller's dict isn't mutated
    log_data = dict(log_data)
    log_data['type'] = entry_type

    # Validate params through the form
    f = BehavioralLogEntryForm(log_data)
    if not f.is_valid():
        err_msg = 'Log entry params are not valid: %s' % \
                  (dict(f.errors))
        print(f'ERROR!: {err_msg}')
        return err_resp(err_msg)

    new_entry = BehavioralLogEntry(**f.cleaned_data)
    new_entry.user = user
    new_entry.save()

    # Write the entry to the Log File
    # #LogEntryMaker.write_to_log_file(user_workspace, new_entry)

    return ok_resp(new_entry)
def get_mkdoc_data_as_json(self, indent=None, **kwargs):
    """Return the preprocess data as a JSON string.

    kwargs may select a sub-document: pass problemDoc=True or
    datasetDoc=True to serialize only that portion of the data.
    """
    assert not self.has_error(), \
        'Make sure "has_error()" is False before calling this method'

    print('type(self.mkdoc_data) (get_mkdoc_data_as_json)',
          type(self.mkdoc_data))

    # Choose which portion of the rook data to serialize;
    # problemDoc takes precedence over datasetDoc, default is everything
    core_data = self.mkdoc_data
    for doc_key in ('problemDoc', 'datasetDoc'):
        if kwargs.get(doc_key, None) is True:
            if doc_key not in self.mkdoc_data:
                return err_resp('Error: "%s" not found in rook data' % doc_key)
            core_data = self.mkdoc_data[doc_key]
            break

    json_str_info = json_dumps(core_data, indent=indent)
    if not json_str_info.success:
        # only happens if not serializable
        return err_resp(json_str_info.err_msg)

    return ok_resp(json_str_info.result_obj)
def create_new_user_workspace(user, d3m_config, **kwargs):
    """Create and save a new UserWorkspace, marking it as current.

    If kwargs includes 'previous_workspace', the new workspace is linked
    into the chain (previous/original); otherwise it becomes its own
    original workspace.
    """
    if not isinstance(user, User):
        return err_resp('"user" is not a User object')

    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('"d3m_config" is not a D3MConfiguration object')

    previous_workspace = kwargs.get('previous_workspace')

    ws_params = get_default_workspace_params(
        user=user,
        is_current_workspace=True,
        d3m_config=d3m_config)

    new_workspace = UserWorkspace(**ws_params)
    new_workspace.save()

    if previous_workspace:
        # At least the 2nd workspace; link back to previous and original
        new_workspace.previous_workspace = previous_workspace
        new_workspace.original_workspace = previous_workspace.original_workspace
    else:
        # Brand new; the original points back to itself
        new_workspace.original_workspace = new_workspace

    new_workspace.save()
    return ok_resp(new_workspace)
def search_with_dataset(dataset_path, query=None, **kwargs):
    """Search the NYU datamart using a dataset file.

    :param dataset_path: path to the dataset file POSTed to the datamart
    :param query: optional query sent alongside the data
    :param kwargs: may include 'user_workspace' for behavioral logging
    :returns: ok_resp(list of search results) / err_resp(msg)
    """
    if not isfile(dataset_path):
        user_msg = ('The dataset file could not be found.')
        return err_resp(user_msg)

    search_url = get_nyu_url() + '/search'

    # --------------------------------
    # Behavioral logging
    # --------------------------------
    if 'user_workspace' in kwargs:
        log_data = dict(feature_id=f'POST|by-dataset|{search_url}',
                        activity_l1=bl_static.L1_DATA_PREPARATION,
                        activity_l2=bl_static.L2_DATA_SEARCH,
                        path=search_url)
        LogEntryMaker.create_datamart_entry(kwargs['user_workspace'], log_data)

    # --------------------------------
    # Query the datamart
    # --------------------------------
    try:
        with open(dataset_path, 'rb') as dataset_p:
            search_files = dict(data=dataset_p)
            if query:
                search_files['query'] = query

            try:
                response = requests.post(
                    search_url,
                    files=search_files,
                    timeout=settings.DATAMART_LONG_TIMEOUT)
            except requests.exceptions.Timeout as err_obj:
                return err_resp('Request timed out. responded with: %s' % err_obj)

    except IOError as err_obj:
        user_msg = (f'Failed to search with the dataset file.'
                    f' Technical: {err_obj}')
        return err_resp(user_msg)

    if response.status_code != 200:
        print(str(response))
        print(response.text)
        return err_resp(('NYU Datamart internal server error.'
                         ' status_code: %s') % response.status_code)

    # bug fix: use .get() so a response without a 'results' key is reported
    # as "no datasets found" instead of raising an uncaught KeyError
    json_results = response.json().get('results')

    if not json_results:
        return err_resp('No datasets found. (%s)' % \
            (get_timestamp_string_readable(time_only=True),))

    print('num results: ', len(json_results))

    return ok_resp(json_results)
def export_csv(user_workspace, collection, data):
    """Export the dataset using the 'BasicProblemWriter'.

    :param data: list or generator of rows to write as CSV
    :returns: ok_resp(new file path) / err_resp(msg)
    """
    if not isinstance(user_workspace, UserWorkspace):
        user_msg = ('The user_workspace was not set correctly.'
                    ' (export_dataset)')
        return err_resp(user_msg)

    # "data" must be a list or a generator of rows
    is_list = isinstance(data, list)
    is_generator = isinstance(data, types.GeneratorType)
    if not (is_list or is_generator):
        user_msg = 'export_dataset failed. "data" must be a list or generator'
        LOGGER.error(user_msg)
        return err_resp(user_msg)

    # Destination path within the workspace's manipulation area
    filename = os.path.join('manipulation_data', collection,
                            'TRAIN', 'tables', 'learningData.csv')

    writer_params = {
        BasicProblemWriter.IS_CSV_DATA: True,
        BasicProblemWriter.INCREMENT_FILENAME: True,
        BasicProblemWriter.QUOTING: csv.QUOTE_NONNUMERIC}

    bpw = BasicProblemWriter(user_workspace, filename, data, **writer_params)
    if bpw.has_error():
        return err_resp(bpw.get_error_message())

    return ok_resp(bpw.new_filepath)
def get_dict_value(data_dict, *keys):
    '''Fetch a nested value from `data_dict`, walking each key in turn.

    Example:
        data = {"spam": {"egg": {"bacon": "Well.."}}}
        get_dict_value(data, "spam", "egg", "bacon") -> ok_resp("Well..")

    ref: https://stackoverflow.com/questions/43491287/elegant-way-to-check-if-a-nested-key-exists-in-a-python-dict
    '''
    if not isinstance(data_dict, dict):
        return err_resp('keys_exists() expects dict as first argument.')

    if not keys:
        return err_resp(
            ('get_dict_value(data_dict, *keys) expects at least two'
             ' arguments, one given.'))

    current = data_dict
    for key in keys:
        try:
            current = current[key]
        except KeyError:
            return err_resp('Key not found: %s' % ', '.join(keys))

    return ok_resp(current)
def add_to_user_model(user_name, query_id, message):
    """Add a UserNotification entry for the given user.

    :param user_name: username used to look up the User
    :param query_id: id of the EventDataSavedQuery to attach
    :param message: notification message text
    :returns: ok_resp(dict) on save / err_resp otherwise
    """
    # bug fix: .get() raises User.DoesNotExist rather than returning None,
    # so the previous `if not user_object` check could never trigger
    try:
        user_object = User.objects.get(username=user_name)
    except User.DoesNotExist:
        return err_resp('could not find user with name %s' % user_name)

    # bug fix: also catch DoesNotExist; ValueError only covers a
    # non-integer query_id
    try:
        saved_query = EventDataSavedQuery.objects.get(id=query_id)
    except (EventDataSavedQuery.DoesNotExist, ValueError):
        return err_resp('Could not retrieve query for id %s' % query_id)

    query = saved_query.as_dict()['query']

    input_data = dict(user=user_object,
                      message=message,
                      read=False,
                      archived_query=query)

    user_notify = UserNotification(**input_data)
    user_notify.save()

    if user_notify.id:
        # no error
        usr_dict = dict(success=True,
                        message="query saved",
                        data=user_notify.as_dict())
        return ok_resp(usr_dict)

    # error: save did not produce an id
    usr_dict = dict(success=False,
                    message="failed to save query",
                    id=user_notify.id)
    return err_resp(usr_dict)
def publish_dataset(dataset_id):
    """ publish dataset
        might be using dataset_id later according to actual API request

    NOTE(review): `dataset_id` is currently unused.  As written, the loop
    below returns after processing only the FIRST file in the dataset --
    confirm whether every file was meant to be archived before returning.
    """
    # Publish job + file-info job for the dataset
    job = DataversePublishDataset()
    job2 = GetDataSetFileInfo()

    # Publish the dataset; (success flag, result) pair
    succ, res = job.return_status()
    if succ:
        # Retrieve info on the dataset's files
        success, res_info = job2.return_status()
        print("Res : ********* : ", res_info)
        if success:
            for d in res_info['data']['latestVersion']['files']:
                print("*******")
                file_id = d['dataFile']['id']
                file_url = d['dataFile']['pidURL']
                # Attach the published file url to the archived query job
                success, archive_job = ArchiveQueryJob.get_objects_by_id(file_id)
                if success:
                    archive_job.archive_url = file_url
                    archive_job.save()
                    # NOTE(review): returns after the first file (see above)
                    return ok_resp(res)
                else:
                    return err_resp(archive_job)
        else:
            return err_resp(res_info)
    else:
        return err_resp(res)
def make_augment_call(user_workspace, augment_params, **kwargs):
    """Validate augment params and kick off the async augment process."""
    if not user_workspace:
        return err_resp('user_workspace must be set')

    if not augment_params:
        return err_resp('augment_params must be set')

    print('augment_params', json.dumps(augment_params, indent=4))
    print('augment keys', augment_params.keys())

    # Validate the incoming data via the form
    form = DatamartAugmentForm(augment_params)
    if not form.is_valid():
        return err_resp('Invalid augment params: %s' % form.errors)

    # Async task to run the augment process
    kick_off_augment_steps.delay(
        form.cleaned_data['source'],
        user_workspace.id,
        augment_params,
        **dict(websocket_id=user_workspace.user.username))

    return ok_resp('augment process started')
def check_build_output_directories(d3m_config):
    """Used when setting a new d3m_config: verify the output directories
    exist and create any that are missing."""
    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('d3m_config must be a D3MConfiguration object')

    # Derive the candidate paths from the config's environment values
    output_path = d3m_config.env_values.get(d3m_static.KEY_D3MOUTPUTDIR)
    temp_path = join(output_path, 'temp') if output_path else None

    candidates = [output_path,
                  temp_path,
                  d3m_config.env_values.get(d3m_static.KEY_D3MLOCALDIR),
                  d3m_config.env_values.get(d3m_static.KEY_D3MSTATICDIR)]

    fail_info = []
    # Only build paths that are set and don't already exist
    for build_path in (p for p in candidates if p and not isdir(p)):
        path_info = create_directory(build_path)
        if path_info.success:
            print('directory created: ', build_path)
        else:
            fail_info.append('Failed to build directory: %s' % (path_info.err_msg))

    if fail_info:
        return err_resp('\n'.join(fail_info))

    return ok_resp('looks good')
def get_saved_workspace_by_request_and_id(request, user_workspace_id):
    """Retrieve a specific workspace, verifying that the requesting user
    owns it (superusers may access any workspace)."""
    # Get the User
    user_info = get_authenticated_user(request)
    if not user_info.success:
        return err_resp(user_info.err_msg)
    user = user_info.result_obj

    # Get the workspace
    ws_info = get_user_workspace_config(user, user_workspace_id)
    if not ws_info.success:
        return err_resp(ws_info.err_msg)
    user_workspace = ws_info.result_obj

    # Does the user in the request match the one in the workspace?
    # - Later add additional permissions here for sharing
    if user.is_superuser or user == user_workspace.user:
        return ok_resp(user_workspace)

    err_msg = (f'Sorry! User {user} does not have permission for '
               f' workspace id: {user_workspace_id}.')
    return err_resp(err_msg)
def set_shared_workspace_by_hash_id(request, hash_id):
    """Retrieve a shared (public) workspace by its hash id.

    Basic sequence:
      - The workspace must exist and be public.
      - If the shared workspace belongs to the logged-in user, simply make
        it the current workspace and return it.
      - Otherwise create a new workspace for this user that copies the
        shared workspace's data and links back to it.
    """
    # Get the User
    user_info = get_authenticated_user(request)
    if not user_info.success:
        return err_resp(user_info.err_msg)
    user = user_info.result_obj

    try:
        workspace = UserWorkspace.objects.get(hash_id=hash_id)
    except UserWorkspace.DoesNotExist:
        user_msg = ('No public workspaces were found for this shared link.'
                    ' <br /><br />(id: hash_id: %s)') % \
                    (hash_id)
        return err_resp(user_msg)

    if not workspace.is_public:
        user_msg = ('No public workspaces were found for this shared link.'
                    '<br /><br />Note: The workspace may have been made private.'
                    ' <br /><br />(id: hash_id: %s)') % \
                    (hash_id)
        return err_resp(user_msg)

    if workspace.user == user:
        # Already owned by this user; just make it the current workspace
        workspace.is_current_workspace = True
        workspace.save()
        return ok_resp(workspace)

    # Create a new workspace for this user, based on the shared workspace
    new_params = get_default_workspace_params(
        user=user,
        name=workspace.name,
        is_current_workspace=True,
        is_public=False,
        d3m_config=workspace.d3m_config,
        raven_config=workspace.raven_config,
        original_workspace=workspace.original_workspace,
        previous_workspace=workspace)

    new_workspace = UserWorkspace(**new_params)
    new_workspace.save()

    return ok_resp(new_workspace)
def get_authenticated_user(request):
    """Return the authenticated user attached to the request."""
    if not request:
        return err_resp('request is None')

    if request.user.is_authenticated:
        return ok_resp(request.user)

    return err_resp('user is not authenticated')
def run_process(self):
    """POST the file content to Dataverse, attaching it to the dataset
    identified by settings.DATASET_PERSISTENT_ID.

    Side effects: sets self.status_code and self.res from the HTTP reply.
    Returns err_resp on a pre-request error; otherwise None.
    """
    dataverse_server = settings.DATAVERSE_SERVER  # no trailing slash
    api_key = settings.DATAVERSE_API_KEY  # generated from kripanshu's account
    persistentId = settings.DATASET_PERSISTENT_ID  # doi or hdl of the dataset

    # --------------------------------------------------
    # Using a "jsonData" parameter, add optional description + file tags
    # --------------------------------------------------
    params = dict(description='Testing file upload',
                  categories=['Test', 'Two Ravens', 'EventData'])
    payload = dict(jsonData=json.dumps(params))

    # --------------------------------------------------
    # Add file using the Dataset's persistentId (e.g. doi, hdl, etc)
    # --------------------------------------------------
    url_persistent_id = \
        '%s/api/datasets/:persistentId/add?persistentId=%s&key=%s' % \
        (dataverse_server, persistentId, api_key)

    # -------------------
    # Update the file content to avoid a duplicate file error
    # -------------------
    self.get_file_content()
    if self.has_error():
        return err_resp(self.error_message)

    success, files = self.update_file_content(self.input_type)
    if not success:
        # on failure, `files` carries the error message
        self.add_err_msg(files)
        return err_resp(self.error_message)

    # -------------------
    # Make the request
    # -------------------
    print('-' * 40)
    print('making request: %s' % url_persistent_id)
    r = requests.post(url_persistent_id, data=payload, files=files)

    # -------------------
    # Print the response
    # -------------------
    print('-' * 40)
    print(r.json())
    print(r.status_code)

    self.status_code = r.status_code
    # bug fix: the original if/else on status_code assigned the identical
    # value in both branches; collapsed to one assignment
    self.res = r.json()
def get_value_by_key(self, key):
    """Pull a single value from the stored response by key."""
    if not self.response:
        return err_resp('No response available')

    if key not in self.response:
        return err_resp('Key not found in response')

    return ok_resp(self.response[key])
def fit_solution(raven_json_str=None):
    """Send a FitSolutionRequest to the TA2 FitSolution command.

    :param raven_json_str: the request serialized as a JSON string
    :returns: ok_resp(reply as JSON string) / err_resp(msg)
    """
    if raven_json_str is None:
        return err_resp('No data found for the FitSolutionRequest')

    # --------------------------------
    # Make sure it's valid JSON
    # --------------------------------
    raven_json_info = json_loads(raven_json_str)
    if not raven_json_info.success:
        return err_resp(raven_json_info.err_msg)

    # --------------------------------
    # Convert the JSON string to a gRPC request
    # --------------------------------
    try:
        req = Parse(raven_json_str, core_pb2.FitSolutionRequest())
    except ParseError as err_obj:
        err_msg = ('Failed to convert JSON to gRPC: %s'
                   ' (req_search_solutions)'
                   '\nraven_json_str: %s') % \
                  (err_obj, raven_json_str)
        print('-' * 40)
        print(err_msg)
        return err_resp(err_msg)

    # In test mode, return a canned response
    if settings.TA2_STATIC_TEST_MODE:
        resp = core_pb2.FitSolutionResponse(
            request_id='requestId_%s' % get_alphanumeric_string(6))
        return ok_resp(message_to_json(resp))

    # Get the gRPC connection
    core_stub, err_msg = TA2Connection.get_grpc_stub()
    if err_msg:
        return err_resp(err_msg)

    # --------------------------------
    # Send the gRPC request
    # --------------------------------
    try:
        reply = core_stub.FitSolution(
            req, timeout=settings.TA2_GRPC_SHORT_TIMEOUT)
    except Exception as err_obj:
        return err_resp(str(err_obj))

    # Convert the reply to JSON and send it back
    return ok_resp(message_to_json(reply))
def set_as_default(d3m_config):
    """Mark the given config as the default and persist it."""
    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('"d3m_config" is not a D3MConfiguration object')

    d3m_config.is_default = True
    d3m_config.save()

    # Re-check the flag after saving
    if not d3m_config.is_default:
        return err_resp('Default NOT set. Save failed.')

    return ok_resp('Default set!')
def json_loads(json_str):
    """Wrapper for json.loads that preserves key order via OrderedDict.

    :param json_str: JSON text to parse
    :returns: ok_resp(OrderedDict) / err_resp(msg) on invalid JSON or
        a non-string argument
    """
    try:
        json_dict = json.loads(json_str, object_pairs_hook=OrderedDict)
    # idiom fix: the two original except clauses had identical bodies
    except (json.decoder.JSONDecodeError, TypeError) as err_obj:
        err_msg = 'Failed to convert string to JSON: %s' % (err_obj)
        return err_resp(err_msg)

    return ok_resp(json_dict)
def ta2_hello():
    """Hello. This is a "heartbeat" request for the TA2"""
    # --------------------------------
    # Convert the JSON string to a gRPC request.  For this call the step
    # is un-needed; kept in case things change...
    # --------------------------------
    try:
        req = Parse("{}", core_pb2.HelloRequest())
    except ParseError as err_obj:
        return err_resp('Failed to convert JSON to gRPC: %s' % (err_obj))

    content = MessageToJson(req, including_default_value_fields=True)
    # print('content as JSON:\n', content)

    # In test mode, check if the incoming JSON is legit (in line above)
    # -- then return canned response
    if settings.TA2_STATIC_TEST_MODE:
        info = dict(TA3TA2_API_VERSION=TA3TA2Util.get_api_version(),
                    TA3_GRPC_USER_AGENT=settings.TA3_GRPC_USER_AGENT)
        return ok_resp(get_grpc_test_json(
            'test_responses/Hello_ok.json', info))

    # --------------------------------
    # Get the connection; return an error if there are channel issues
    # --------------------------------
    core_stub, err_msg = TA2Connection.get_grpc_stub()
    if err_msg:
        return err_resp(err_msg)

    # --------------------------------
    # Send the gRPC request
    # --------------------------------
    try:
        reply = core_stub.Hello(req, timeout=settings.TA2_GRPC_FAST_TIMEOUT)
    except Exception as ex:
        return err_resp(str(ex))

    # --------------------------------
    # Convert the reply to JSON and send it back
    # --------------------------------
    return ok_resp(MessageToJson(reply, including_default_value_fields=True))
def write_user_log(user_workspace):
    """Dump the existing behavioral log entries to a file or GCE Bucket
    (via user_workspace.behavioral_log's storage backend).

    :param user_workspace: UserWorkspace whose user's entries are written
    :returns: ok_resp(log file name) / err_resp(msg)
    """
    if not isinstance(user_workspace, UserWorkspace):
        return err_resp("user must be a UserWorkspace object")

    # Log file name...
    server_name = settings.RAVENS_SERVER_NAME
    if not server_name:
        server_name = 'unknown-server'
    server_name = server_name.replace('.', '-')

    log_name = slugify((f'{server_name}`_'
                        f'{get_timestamp_string()}_'
                        f'ws-{user_workspace.id}_'
                        f'u-{user_workspace.user.id}_'
                        f'{user_workspace.d3m_config.name}_'))
    log_name = f'{log_name}.csv'

    # Retrieve the BehavioralLogEntry objects
    log_entry_info = BehavioralLogFormatter.get_log_entries(
        user_workspace.user)
    if not log_entry_info.success:
        return err_resp(log_entry_info.err_msg)

    log_entries = log_entry_info.result_obj
    if log_entries.count() < 1:
        # bug fix: message said "Fewer than 2 entries" but the check is
        # for an empty queryset
        return err_resp('No entries, no need to write log')

    # Write the CSV content to a ContentFile object
    encoding = 'utf-8'
    csv_output = io.StringIO()
    blf = BehavioralLogFormatter(csv_output_object=csv_output,
                                 log_entries=log_entries)
    if blf.has_error():
        return err_resp('Error: %s' % blf.get_error_message())

    content = csv_output.getvalue().encode(encoding)
    user_workspace.behavioral_log.save(log_name, ContentFile(content))

    return ok_resp(log_name)
def remove_directory(dir_path):
    """Delete a directory; a non-directory path is treated as a no-op."""
    if not isdir(dir_path):
        return ok_resp(f'Not a directory {dir_path}')

    try:
        shutil.rmtree(dir_path)
    except TypeError as err_obj:
        return err_resp(f'Failed to remove directory. {err_obj}')
    except FileNotFoundError as err_obj:
        return err_resp(f'Directory not found: {err_obj}')

    return ok_resp(f'Directory removed {dir_path}')
def move_file(src_file, dest_file):
    """Move a file

    NOTE(review): despite the name and summary, this calls
    shutil.copyfile, which COPIES the file and leaves src_file in place;
    the messages below also say "copy"/"copied".  Confirm whether
    shutil.move was intended before changing behavior.

    :returns: ok_resp(msg) on success, err_resp(msg) on IOError or when
        the source and destination paths are identical
    """
    # Guard: copying a file onto itself is rejected
    if src_file == dest_file:
        return err_resp('The source and destination cannot be the same')

    try:
        shutil.copyfile(src_file, dest_file)
    except IOError as err_obj:
        user_msg = ('Failed to copy file: %s to %s\n%s') % \
                   (src_file, dest_file, err_obj)
        return err_resp(user_msg)

    return ok_resp('File copied to: %s' % dest_file)
def get_request_body(request):
    """Retrieve the request body decoded as utf-8.

    Returns ok_resp(body text) on success, or err_resp(error message)
    when the request or its body is missing.
    """
    if not request:
        return err_resp('request is None')

    body = request.body
    if not body:
        return err_resp('request.body not found')

    return ok_resp(body.decode('utf-8'))
def get_output_filepath(user_workspace, datamart_id, dir_type='materialize'):
    """Create the output filepath for materialize and augment operations."""
    if not isinstance(user_workspace, UserWorkspace):
        return err_resp('user_workspace must be a UserWorkspace')

    if not datamart_id:
        return err_resp('"datamart_id" must be set')

    # Path rooted at the workspace config's additional-inputs directory
    base_dir = user_workspace.d3m_config.additional_inputs
    output_path = join(base_dir,
                       dir_type,
                       str(datamart_id),
                       'learningData.csv')

    return ok_resp(output_path)