def action_info_curation(endpoint: str) -> Tuple[bool, Union[str, dict]]:
    """
    Load the curation statistics stored for an endpoint.

    The statistics are read from the ``statistics.pkl`` file found in the
    directory returned by ``utils.curation_tree_path`` for the endpoint.

    :param endpoint: curation endpoint to collect statistics from
    :return bool: True when the statistics file was loaded, False otherwise
    :return str | object: an error message on failure, otherwise the
        unpickled content of the statistics file
    """
    # resolve the endpoint directory inside the curation repository
    endpoint_dir = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_dir.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # locate the statistics file inside the endpoint directory
    pickle_path = os.path.join(endpoint_dir, 'statistics.pkl')
    if os.path.isfile(pickle_path):
        # NOTE: pickle is only safe for files produced by this tool itself
        with open(pickle_path, "rb") as handle:
            loaded_stats = pickle.load(handle)
        return True, loaded_stats

    return False, 'Statistics file does not exist.\n'
def action_header_curation(endpoint: str) -> Tuple[bool, Union[str, dict]]:
    """
    Load the stored header of the curation output for an endpoint.

    The header is read from the ``curated_data_head.pkl`` file found in the
    directory returned by ``utils.curation_tree_path`` for the endpoint.

    :param endpoint: curation endpoint
    :return bool: True when the header file was loaded, False otherwise
    :return str | object: an error message on failure, otherwise the
        unpickled content of the header file
    """
    # resolve the endpoint directory inside the curation repository
    endpoint_dir = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_dir.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # locate the header file inside the endpoint directory
    pickle_path = os.path.join(endpoint_dir, 'curated_data_head.pkl')
    if os.path.isfile(pickle_path):
        # NOTE: pickle is only safe for files produced by this tool itself
        with open(pickle_path, "rb") as handle:
            loaded_head = pickle.load(handle)
        return True, loaded_head

    return False, 'Curation header file does not exist.\n'
def action_kill(curation_endpoint: str) -> Tuple[bool, str]:
    """
    Remove the endpoint tree described by the argument.

    :param curation_endpoint: path to curation endpoint in the repo.
    :return bool: True when the tree was removed, False otherwise
    :return str: message describing the outcome
    """
    if not curation_endpoint:
        return False, 'Empty endpoint name'

    ndir = utils.curation_tree_path(curation_endpoint)
    if not os.path.isdir(ndir):
        return False, "Model {} not found".format(curation_endpoint)

    # rmtree must be allowed to raise: with ignore_errors=True the failure
    # branch below could never run and success was reported unconditionally
    try:
        shutil.rmtree(ndir)
    except OSError:
        return False, "Failed to remove model {}".format(curation_endpoint)

    sys.stderr.write("Model {} removed\n".format(curation_endpoint))
    return True, "Model {} removed".format(curation_endpoint)
def loadYaml_curation(self, curation_path: str) -> Tuple[bool, str]:
    """
    Load the parameters from the ``curation_parameters.yaml`` file present
    in the curation endpoint directory into ``self.p`` and add the keys
    ``endpoint`` and ``curation_path`` identifying the endpoint.

    ``self.p`` is only replaced when the file was read and contains a
    mapping; on any failure it is left untouched.

    :param curation_path: curation endpoint name in the repository
    :return bool: True when the parameters were loaded, False otherwise
    :return str: 'OK' on success, otherwise a message describing the error
    """
    # obtain the endpoint directory and the default parameters file name
    parameters_file_path = utils.curation_tree_path(curation_path)
    if not os.path.isdir(parameters_file_path):
        return False, 'Curation "{}" not found'.format(curation_path)

    parameters_file_name = os.path.join(parameters_file_path,
                                        'curation_parameters.yaml')
    if not os.path.isfile(parameters_file_name):
        return False, 'Parameters file not found'

    # load the main class dictionary (p) from this yaml file
    try:
        with open(parameters_file_name, 'r') as pfile:
            loaded = yaml.safe_load(pfile)
    except Exception as e:
        # return the text of the error so the second element is always a str
        return False, str(e)

    # safe_load returns None for an empty file and may return a scalar or a
    # list for malformed content; the keys below can only be added to a dict
    if not isinstance(loaded, dict):
        return False, 'Parameters file is empty or malformed'

    # add keys identifying the endpoint
    loaded['endpoint'] = curation_path
    loaded['curation_path'] = parameters_file_path
    self.p = loaded

    return True, 'OK'
def curation_cmd(commnad_dict: dict) -> Optional[bool]:
    """
    Instantiate a DataCuration object using commnad_dict from the argument
    parser, run the curation and write the output file.

    Errors are reported on stderr and the function returns None in every
    case.

    :param commnad_dict:
        - data_input: input file name to be processed
        - molecule_identifier: column name containing the molecule ID. Usually CAS is used
        - endpoint: curation endpoint name
        - structure_column: column name containing the SMILES string
        - metadata: comma separated column names for metadata processing
          (only for API); may be missing or None
        - separator: file separator if input file is a csv or a tsv
        - remove_problematic: boolean indicating the option of removing problematic structures or not
        - outfile_type: output file type: xlsx, csv, tsv, sdf or json
    """
    import curate.dataset_curation as datacur

    # safety check if curation endpoint exists
    output_dir = utils.curation_tree_path(commnad_dict['endpoint'])
    if not os.path.isdir(output_dir):
        sys.stderr.write("Endpoint name not found in model repository.\n")
        return

    # check of metadata: an argument parser typically provides the key with
    # a None value when the option is not used, so a None value is handled
    # exactly like a missing key
    raw_metadata = commnad_dict.get('metadata')
    if raw_metadata is not None:
        metadata_ = raw_metadata.split(',')
        reserved_columns = (commnad_dict['molecule_identifier'],
                            commnad_dict['structure_column'])
        if any(column in metadata_ for column in reserved_columns):
            sys.stderr.write(
                "datacur curate : metadata can't contain the ID nor the SMILES column names.\n"
            )
            return
    else:
        metadata_ = None

    # call of curation functions
    curating = datacur.DataCuration(
        data_input=commnad_dict['data_input'],
        molecule_identifier=commnad_dict['molecule_identifier'],
        structure_column=commnad_dict['structure_column'],
        output_dir=output_dir,
        endpoint=commnad_dict['endpoint'],
        metadata=metadata_,
        separator=commnad_dict['separator'],
        remove_problematic=commnad_dict['remove_problematic'],
        outfile_type=commnad_dict['outfile_type'])

    curating.curate_data()
    curating.get_output_file(smiles_column='structure_curated')
def action_curation_results(endpoint: str) -> Tuple[bool, Union[str, dict]]:
    """
    Return the path of the curation output file of an endpoint.

    The output file is any entry of the endpoint directory whose name starts
    with ``curated_data`` and does not contain ``head``. When several entries
    match, the first one in alphabetical order is returned.

    :param endpoint: curation endpoint
    :return bool: True when an output file was found, False otherwise
    :return str | dict: the path of the output file on success; an error
        message when the endpoint directory does not exist; a dictionary
        with the keys ``code`` and ``message`` when no output file is found
    """
    # get curation endpoint path
    endpoint_curation = pathlib.Path(utils.curation_tree_path(endpoint))
    if not endpoint_curation.is_dir():
        return False, 'Curation endpoint path does not exist.\n'

    # get curation files in the endpoint; os.listdir returns the names in
    # arbitrary order, so they are sorted to make the choice deterministic
    curation_files = sorted(
        f for f in os.listdir(endpoint_curation)
        if f.startswith('curated_data') and 'head' not in f
    )

    if not curation_files:
        return False, {
            'code': 0,
            'message': 'curations not found for {} directory'.format(endpoint)
        }

    return True, os.path.join(endpoint_curation, curation_files[0])
def delta_curation(self, curation: str, parameters: str, iformat: str = 'YAML') -> Tuple[bool, str]:
    """
    Load the parameters from the configuration file present at the curation
    directory, apply to them the keys provided in ``parameters`` and write
    the result back to the ``curation_parameters.yaml`` file of the endpoint.

    :param curation: curation endpoint name
    :param parameters: path to a parameters file when iformat is 'YAML' or
        'JSON'; the JSON text itself when iformat is 'JSONS'
    :param iformat: format of ``parameters``: 'YAML', 'JSON' or 'JSONS'
    :return bool: True when the parameters were updated, False otherwise
    :return str: 'OK' on success, otherwise a message describing the error
    """
    # loadYaml_curation returns a (bool, message) tuple; the tuple itself is
    # always truthy, so the flag has to be unpacked before testing it
    success, message = self.loadYaml_curation(curation)
    if not success:
        return False, message

    # parse the new parameters according to the declared format
    if iformat == 'JSONS':
        try:
            newp = json.loads(parameters)
        except Exception as e:
            return False, str(e)
    elif iformat in ('YAML', 'JSON'):
        try:
            with open(parameters, 'r') as pfile:
                if iformat == 'YAML':
                    newp = yaml.safe_load(pfile)
                else:
                    newp = json.load(pfile)
        except Exception as e:
            return False, str(e)
    else:
        # any other value would leave newp undefined
        return False, 'unsupported parameters format "{}"'.format(iformat)

    self.applyDelta_curation(newp)

    # dump internal dict to the parameters file
    parameters_file_path = utils.curation_tree_path(curation)
    parameters_file_name = os.path.join(parameters_file_path,
                                        'curation_parameters.yaml')
    try:
        with open(parameters_file_name, 'w') as pfile:
            yaml.dump(self.p, pfile)
    except Exception:
        return False, 'unable to write parameters'

    return True, 'OK'
def action_list(curation_dir: str) -> Tuple[bool, str]:
    """
    If no argument is provided, list all the endpoints present in the
    curation repository; otherwise list the files of the endpoint provided
    as argument. The listing itself is written to stderr.

    :param curation_dir: path to the endpoint in curation repo
    :return bool: True when the listing was produced, False otherwise
    :return str: summary of the listing or a message describing the error
    """
    # if no name is provided, just list the different curation dirs
    if not curation_dir:
        rdir = utils.curation_repository_path()
        if not os.path.isdir(rdir):
            return False, 'the curation repository path does not exist. Please run "datacur -c config".\n'

        num_curs = 0
        sys.stderr.write('Curation endpoints found in repository:\n')
        for x in os.listdir(rdir):
            xpath = os.path.join(rdir, x)
            # discard if the item is not a directory
            if not os.path.isdir(xpath):
                continue
            num_curs += 1
            creation_date = get_creation_date(xpath)
            sys.stderr.write("\n{} {}\n".format(x, creation_date))

        sys.stderr.write(
            "\nRetrieved list of curation endpoints from {}\n".format(rdir))
        return True, "{} endpoints found".format(num_curs)

    # if a path name is provided, list files
    base_path = utils.curation_tree_path(curation_dir)
    # report an unknown endpoint instead of letting os.listdir raise
    if not os.path.isdir(base_path):
        return False, 'Curation endpoint {} not found'.format(curation_dir)

    num_files = 0
    sys.stderr.write(
        'Files found in curation endpoint {}:\n'.format(curation_dir))
    for x in os.listdir(base_path):
        # files with the .json extension are neither listed nor counted
        if x.endswith('.json'):
            continue
        num_files += 1
        xpath = os.path.join(base_path, x)
        creation_date = get_creation_date(xpath)
        sys.stderr.write("\n{} {}\n".format(x, creation_date))

    return True, "Endpoint {} has {} files".format(curation_dir, num_files)
def action_new(curation_path: str) -> Tuple[bool, str]:
    """
    Create a new curation endpoint tree, using the given name, and copy the
    default ``curation_parameters.yaml`` file into it.

    :param curation_path: curation endpoint in curation repository where output will be saved
    :return bool: True when everything has worked, otherwise False.
    :return str: message that would be the equivalent to the standard error.
    """
    if not curation_path:
        return False, 'empty endpoint curation label\n'

    # importlib does not allow using 'test' and issues a mysterious error when we
    # try to use this name. This is a simple workaround to prevent creating paths
    # with this name
    if curation_path == 'test':
        return False, 'the name "test" is disallowed, please use any other name'

    # curation endpoint directory
    ndir = pathlib.Path(utils.curation_tree_path(curation_path))

    # check if there is already a tree for this endpoint
    if ndir.exists():
        return False, "Endpoint {} already exists\n".format(curation_path)

    try:
        ndir.mkdir(parents=True)
    except OSError:
        return False, "Unable to create path for {} endpoint".format(
            curation_path)
    sys.stderr.write("{} created\n".format(ndir))

    # the parameters template is stored in the 'children' directory located
    # next to this source file
    wkd = pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
    params_path = wkd / 'children' / 'curation_parameters.yaml'
    try:
        shutil.copy(params_path, ndir)
    except OSError:
        # do not leave a half initialised endpoint behind, otherwise a retry
        # would be rejected because the endpoint "already exists"
        shutil.rmtree(ndir, ignore_errors=True)
        return False, "Unable to copy parameters file for {} endpoint".format(
            curation_path)

    sys.stderr.write("New endpoint {} created\n".format(curation_path))
    return True, "new endpoint {} created".format(curation_path)
def action_remove(curation_endpoint: str) -> Tuple[bool, str]:
    """
    Remove the curation endpoint directory indicated as argument.

    :param curation_endpoint: curation endpoint to be removed
    :return bool: True when the directory was removed, False otherwise
    :return str: message describing the outcome
    """
    if not curation_endpoint:
        return False, 'Empty curation endpoint'

    rdir = utils.curation_tree_path(curation_endpoint)
    if not os.path.isdir(rdir):
        return False, '{} not found'.format(curation_endpoint)

    # let rmtree raise so a failed removal is reported instead of being
    # announced as a success
    try:
        shutil.rmtree(rdir)
    except OSError as e:
        return False, 'Unable to remove curation endpoint dir {}: {}'.format(
            curation_endpoint, e)

    sys.stderr.write("Curation endpoint dir {} has been removed\n".format(
        curation_endpoint))
    return True, "Curation endpoint dir {} has been removed".format(
        curation_endpoint)
def calculate_data_stats(self, dataframe: pd.DataFrame):
    """
    Collect the statistics of a curated dataframe and store them, pickled,
    in the ``statistics.pkl`` file of the endpoint.

    The stored dictionary has two keys: ``curation_stats``, with the result
    of ``get_number_of_processed_vs_unprocessed``, and ``substance_types``,
    with the result of ``get_total_of_smiles_per_type_of_substance``.

    :param dataframe: curated data dataframe
    """
    processed_counts = self.get_number_of_processed_vs_unprocessed(dataframe)
    substance_counts = self.get_total_of_smiles_per_type_of_substance(
        dataframe)

    summary = {
        'curation_stats': processed_counts,
        'substance_types': substance_counts,
    }

    # the file name is appended to the endpoint name before resolving the path
    relative_name = '/'.join((self.endpoint, 'statistics.pkl'))
    target_path = utils.curation_tree_path(relative_name)

    with open(target_path, 'wb') as sink:
        pickle.dump(summary, sink)
def update_file_curation(self, curation: str) -> Union[Tuple[bool, str], bool]:
    """
    Save the current parameter values modified at the object level
    (i.e: from an interactive python shell) to the
    ``curation_parameters.yaml`` file of the endpoint.

    NOTE: the return shape depends on the outcome. Failures return a
    (False, message) tuple, which is itself truthy, while success returns
    the bare value True; callers should compare the result with ``True``
    instead of testing its truthiness.

    :param curation: curation endpoint name
    :return: True on success, otherwise a tuple (False, error message)
    """
    p = self.p
    if not p:
        return False, 'No loaded parameters'

    parameters_file_path = utils.curation_tree_path(curation)
    parameters_file_name = os.path.join(parameters_file_path,
                                        'curation_parameters.yaml')
    try:
        with open(parameters_file_name, 'w') as pfile:
            yaml.dump(p, pfile)
    except Exception as e:
        # return the text of the error, as declared by the annotation
        return False, str(e)

    return True
def action_export(curation_endpoint: str) -> Tuple[bool, str]:
    """
    Export the curation endpoint tree indicated in the argument as a single
    tarball file with the same name, created in the current working
    directory.

    :param curation_endpoint: path to curation endpoint in the repo.
    :return bool: True when the tarball was written, False otherwise
    :return str: message describing the outcome
    """
    if not curation_endpoint:
        return False, 'Empty endpoint name'

    exportfile = os.path.join(os.getcwd(), curation_endpoint + '.tgz')

    base_path = utils.curation_tree_path(curation_endpoint)
    if not os.path.isdir(base_path):
        return False, 'Unable to export, endpoint directory not found'

    # Items are added through their full path with an explicit arcname, so
    # the archive keeps names relative to the endpoint directory without
    # changing the working directory of the process (which would not be
    # restored if writing the tarball failed).
    with tarfile.open(exportfile, 'w:gz') as tar:
        for item in sorted(os.listdir(base_path)):
            item_path = os.path.join(base_path, item)
            # NOTE(review): only sub-directories are archived; plain files
            # stored directly in the endpoint directory are skipped.
            # Confirm that this is the intended content of the export.
            if not os.path.isdir(item_path):
                continue
            tar.add(item_path, arcname=item)

    sys.stderr.write("Endpoint {} exported as {}.tgz\n".format(
        curation_endpoint, curation_endpoint))
    return True, "Endpoint {} exported as {}.tgz".format(
        curation_endpoint, curation_endpoint)