def load_folder(folder: str, file_type: str = "", logger=app_logger) -> List[str]:
    """
    Load all files under given folder, optionally filtered by file suffix.

    Args:
        folder: path of the folder.
        file_type: suffix of the file (case-insensitive), default "" for no filter.
        logger: logger for logging.

    Returns:
        file_abs_path_lst: the list of files under given folder in absolute path
    """
    abs_path = get_abs_path(folder)
    file_lst = os.listdir(abs_path)
    file_abs_path_lst = [os.path.join(abs_path, x) for x in file_lst]
    if file_type:
        # case-insensitive suffix match, e.g. ".json" also matches "FILE.JSON"
        file_abs_path_lst = [
            f for f in file_abs_path_lst if f.lower().endswith(file_type.lower())
        ]
    # keep regular files only; drop sub-folders returned by listdir
    file_abs_path_lst = [abs_f for abs_f in file_abs_path_lst if os.path.isfile(abs_f)]
    # fixed: log message previously read "Fund" instead of "Found"
    logger.debug(
        f"Found {file_type} files:\nunder folder: {folder}\nfiles:\n {file_abs_path_lst}"
    )
    return file_abs_path_lst
def __init__(
    self,
    db_info: dict,
    schema: str = "lynx_db",
    output_rules: dict = default_output_rules,
    nomenclature: str = "LipidLynxX",
    logger=app_logger,
):
    """
    Build a double-bond (DB) info object from parsed lipid data.

    Args:
        db_info: parsed dict; the "DB_INFO" -> "0.0_DB" sub-dict and the
            "DB_LEVEL" entry are read here.
        schema: key into lynx_schema_cfg used to load the JSON schema.
        output_rules: output rule definitions, keyed by nomenclature.
        nomenclature: name of the output nomenclature to export to.
        logger: logger for logging.

    Raises:
        ValueError: if the nomenclature's output rules lack "DB_SITES".
    """
    self.logger = logger
    self.nomenclature = nomenclature
    self.export_rule = load_output_rule(output_rules, nomenclature)
    self.db_sites_rule = self.export_rule.get("DB_SITES", None)
    self.db_separators = self.export_rule.get("SEPARATORS", [])
    if not self.db_sites_rule:
        # fixed: message previously said 'MODS' (copy/paste from the
        # modification encoder); this class requires the 'DB_SITES' rule
        raise ValueError(
            f"Cannot find output rule for 'DB_SITES' from nomenclature: {nomenclature}."
        )
    self.db_info = db_info.get("DB_INFO", {}).get("0.0_DB", {})
    self.schema = schema
    self.type = "DB"
    self.db_level = str(db_info.get("DB_LEVEL", 0))
    if self.db_level == "0":
        # normalize the shorthand "0" to the canonical "0.0" level string
        self.db_level = "0.0"
    with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
        self.validator = Draft7Validator(
            json.load(s_obj),
            resolver=RefResolver(f"file://{core_schema_path}", referrer=core_schema),
        )
    self.db_count = self.db_info.get("DB_COUNT", 0)
    self.db_site = self.to_db_site_list()
    self.db_site_info = self.to_db_site_info_list()
    self.sum_db_info = self.to_sum_info()
def get_json(file: str) -> dict:
    """
    Read a .json file and return its parsed content.

    Args:
        file: path to a JSON file (suffix checked case-insensitively).

    Returns:
        the parsed JSON object as a dict.

    Raises:
        IOError: if the file does not end with ".json".
    """
    abs_file = get_abs_path(file)
    if not abs_file.lower().endswith(".json"):
        raise IOError(f"Input file: {abs_file} is not json file")
    with open(abs_file) as f_obj:
        return json.load(f_obj)
def save_table(df: pd.DataFrame, file_name: str) -> (bool, str):
    """
    Export a DataFrame to an Excel file.

    Args:
        df: table to export; nothing is written when it is empty.
        file_name: target .xlsx path.

    Returns:
        (is_output, abs_output_path) — (False, None) when df is empty.
    """
    if df.empty:
        return False, None
    df.to_excel(file_name)
    return True, get_abs_path(file_name)
def test_input_rule(test_file):
    """
    Validate an input-rule JSON file against the InputRules parser.

    Args:
        test_file: path to an input rule file; falls back to the bundled
            LipidLynxX input rule file when empty/None.

    Raises:
        Exception: if the parsed rule fails validation.
    """
    app_logger.debug("SETUP TESTS...")
    app_logger.info(test_file)
    in_file = None
    if test_file:
        in_file = get_abs_path(test_file)
    if not in_file:
        in_file = get_abs_path(r"lynx/configurations/rules/input/LipidLynxX.json")
    app_logger.info(f"Test file {in_file}")
    # fixed: previously passed the raw test_file, ignoring the resolved
    # fallback path when test_file was empty/None
    rule = InputRules(in_file)
    app_logger.debug(f"Got infile {in_file}")
    app_logger.debug(f"test input rule {rule.sources}")
    if rule.is_validated is False:
        raise Exception(f"FAILED: test input rule {rule.sources}")
    else:
        app_logger.info(f"PASSED: test input rule {rule.sources}")
    app_logger.info("test PASSED")
def test_output_rule(test_file):
    """
    Validate an output-rule JSON file against the OutputRules parser.

    Args:
        test_file: path to an output rule file; falls back to the bundled
            LipidLynxX output rule file when empty/None.

    Raises:
        Exception: if the parsed rule structure fails validation.
    """
    app_logger.debug("SETUP TESTS...")
    app_logger.info(test_file)
    in_file = None
    if test_file:
        in_file = get_abs_path(test_file)
    if not in_file:
        in_file = get_abs_path(r"lynx/configurations/rules/output/LipidLynxX.json")
    app_logger.info(f"Test file {in_file}")
    # fixed: previously passed the raw test_file, ignoring the resolved
    # fallback path when test_file was empty/None
    rule = OutputRules(in_file)
    app_logger.debug(f"Got Output infile {in_file}")
    app_logger.debug(f"test Output rule {rule.nomenclature}")
    if rule.is_structure_valid is False:
        raise Exception(f"FAILED: test Rule {rule.nomenclature}")
    else:
        app_logger.info(f"PASSED: test Rule {rule.nomenclature}")
    app_logger.info(f"Supported LMSD classes: {rule.supported_lmsd_classes}")
    app_logger.info("test PASSED")
def __init__(
    self,
    mod_info: dict,
    db: int = 0,
    num_o: int = 0,
    schema: str = "lynx_mod",
    output_rules: dict = default_output_rules,
    nomenclature: str = "LipidLynxX",
    logger=app_logger,
):
    """
    Build a Modification info object from parsed lipid data.

    Args:
        mod_info: parsed dict; "MOD_INFO" and "MOD_LEVEL" entries are read.
        db: double bond count carried alongside the modification.
        num_o: additional oxygen count; a positive value promotes the
            modification level into the "2" family (2 / 2.1 / 2.2).
        schema: key into lynx_schema_cfg used to load the JSON schema.
        output_rules: output rule definitions, keyed by nomenclature.
        nomenclature: name of the output nomenclature to export to.
        logger: logger for logging.

    Raises:
        ValueError: if the nomenclature's output rules lack "MODS".
    """
    self.logger = logger
    self.nomenclature = nomenclature
    self.export_rule = load_output_rule(output_rules, nomenclature)
    self.mod_rule = self.export_rule.get("MODS", None)
    if not self.mod_rule:
        # fixed: check before dereferencing; previously self.mod_rule.get(...)
        # ran first and raised AttributeError instead of this ValueError
        raise ValueError(
            f"Cannot find output rule for 'MODS' from nomenclature: {nomenclature}."
        )
    self.mod_rule_orders = self.mod_rule.get("MOD", {}).get("ORDER", [])
    self.mod_separators = self.export_rule.get("SEPARATORS", [])
    self.mod_info = mod_info.get("MOD_INFO", {})
    self.schema = schema
    self.type = "Modification"
    self.mod_level = str(mod_info.get("MOD_LEVEL", 0))
    if num_o > 0:
        # extra oxygen promotes the level family: x -> 2, x.1 -> 2.1, x.2 -> 2.2
        if self.mod_level in ["0", "1"]:
            self.mod_level = "2"
        elif self.mod_level in ["0.1", "1.1"]:
            self.mod_level = "2.1"
        elif self.mod_level in ["0.2", "1.2"]:
            self.mod_level = "2.2"
    with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
        self.validator = Draft7Validator(
            json.load(s_obj),
            resolver=RefResolver(f"file://{core_schema_path}", referrer=core_schema),
        )
    self.db_count = db
    self.additional_o_count = num_o
    self.sum_mod_info = self.to_sum_info()
    self.mod_id = self.sum_mod_info.get("id", "")
    self.mod_linked_ids = self.sum_mod_info.get("linked_ids", {})
    self.mod_list = self.sum_mod_info.get("info", {})
def __init__(self, lipid_code: str, logger=app_logger):
    """
    Build a FattyAcid object from a lipid abbreviation string.

    Args:
        lipid_code: lipid abbreviation to parse (e.g. a FA string).
        logger: logger for logging.
    """
    # fixed: bind the logger before __post_init__ runs so any logging
    # during initialisation uses the supplied logger
    self.logger = logger
    self.lipid_code = lipid_code
    self.lynx_class_lv0 = ""
    self.schema = "lynx_core"
    with open(get_abs_path(lynx_schema_cfg[self.schema]), "r") as s_obj:
        self.validator = Draft7Validator(json.load(s_obj))
    # defaults; level is refined below from the parsed summary info
    self.level = "B0"
    self._lipid_level = "B"
    self._max_mod_level = 0
    self.is_modified = False
    self.sum_info = self.__post_init__()
    self.residues = self.sum_info.get("residues", [])
    self.level = self.sum_info.get("level", "")
    self.linked_ids = self.sum_info.get("linked_ids", {})
    self.logger.info(
        f"Level {self.level:4s} FattyAcid created from: {self.lipid_code}"
    )
def __init__(
    self,
    input_data: Union[str, dict, InputDictData],
    level: Union[str, List[str]],
    rule: str = "LipidLynxX",
    input_rules: dict = default_input_rules,
    output_rules: dict = default_output_rules,
    logger=app_logger,
):
    """
    Prepare tabular input data for conversion.

    Args:
        input_data: a .xlsx/.csv file path, or an already-loaded dict of
            {column: values}.
        level: target level, or list of target levels.
        rule: nomenclature style passed to the Encoder.
        input_rules: input rule definitions.
        output_rules: output rule definitions.
        logger: logger for logging.

    Raises:
        ValueError: if a path has an unsupported suffix or input_data has
            an unsupported type.
    """
    if isinstance(input_data, str):
        abs_path = get_abs_path(input_data)
        if abs_path.lower().endswith(".xlsx"):
            df = pd.read_excel(abs_path)
        elif abs_path.lower().endswith(".csv"):
            df = pd.read_csv(abs_path)
        else:
            raise ValueError(f"Cannot read file {abs_path}")
        # fixed: fillna is not in-place; the result was previously discarded,
        # so NaN cells leaked into self.data
        df = df.fillna("")
        self.data = df.to_dict(orient="list")
    elif isinstance(input_data, dict):
        self.data = input_data
    else:
        raise ValueError(f"Not supported input {type(input_data)}")
    if isinstance(level, str):
        self.levels = [level]
    else:
        self.levels = level
    self.encoder = Encoder(
        style=rule,
        input_rules=input_rules,
        output_rules=output_rules,
        logger=logger,
    )
    self.header_lst = self.data.keys()
    self.logger = logger
def load_cfg_info(cfg_path: str = None) -> Dict[str, str]:
    """
    Load LipidLynxX settings from an .ini configuration file.

    Args:
        cfg_path: explicit config path; falls back to "config.ini", then
            "configure.ini", next to the working directory.

    Returns:
        dict of option name -> value; path-like options are resolved to
        absolute paths and a few network defaults are filled in.

    Raises:
        ValueError: if neither a [settings] nor a [default] section exists.
    """
    cfg_dct = {}
    # fixed: "resource_lion" was listed twice in this field list
    default_fields = [
        "api_version",
        "app_log_level",
        "app_url",
        "app_port",
        "app_prefix",
        "cli_log_level",
        "controlled_vocabularies",
        "defined_alias",
        "input_rules",
        "output_rules",
        "resource_kegg",
        "resource_lion",
        "temp_folder",
        "temp_max_days",
        "temp_max_files",
        "zmq_client_port",
        "zmq_worker_port",
        "zmq_worker_runner",
    ]
    # options whose values are file paths and must be resolved to absolute
    path_fields = {
        "controlled_vocabularies",
        "defined_alias",
        "input_rules",
        "output_rules",
    }
    config = configparser.ConfigParser()
    if cfg_path and isinstance(cfg_path, str):
        config_path = get_abs_path(cfg_path)
    else:
        try:
            config_path = get_abs_path("config.ini")
        except FileNotFoundError:
            config_path = get_abs_path("configure.ini")
    config.read(config_path)
    if config.has_section("settings"):
        user_cfg = "settings"
    elif config.has_section("default"):
        user_cfg = "default"
    else:
        # fixed: previously assigned user_cfg = "" right before raising,
        # which was dead code
        raise ValueError(f"Cannot __load__ settings from file {config_path}")
    options = config.options(user_cfg)
    for field in default_fields:
        if field in options and field in path_fields:
            cfg_dct[field] = get_abs_path(config.get(user_cfg, field))
        else:
            try:
                cfg_dct[field] = config.get(user_cfg, field)
            except configparser.NoOptionError:
                # optional field missing from the config file: skip it
                pass
    # network-related fallbacks when absent from the config file
    cfg_dct.setdefault("app_url", "127.0.0.1")
    cfg_dct.setdefault("app_port", "1399")
    cfg_dct.setdefault("zmq_client_port", 5559)
    cfg_dct.setdefault("zmq_worker_port", 5560)
    cfg_dct.setdefault("zmq_worker_runner", 5)
    # normalize the URL prefix: strip slashes, treat literal "None" as empty,
    # and re-add a single leading slash otherwise
    usr_app_prefix = cfg_dct.get("app_prefix", "").strip("/")
    if usr_app_prefix:
        if re.match(r"^\s*None\s*$", usr_app_prefix, re.IGNORECASE):
            usr_app_prefix = ""
        else:
            usr_app_prefix = f"/{usr_app_prefix}"
    cfg_dct["app_prefix"] = usr_app_prefix
    return cfg_dct
def create_converter_output(
    data: dict,
    output_name: Union[str, Path] = None,
    file_type: str = ".xlsx",
    converted_only: bool = False,
) -> Union[BytesIO, str]:
    """
    Export converter results to a file on disk or to an in-memory buffer.

    Args:
        data: converter output. Supported shapes: {col: {"converted": [...],
            "skipped": [...]}}, {col: ConvertedListData}, or a flat
            {"converted": [...], "skipped": [...]} dict. With
            converted_only=True, a plain {column: values} mapping.
        output_name: target file path; when None a BytesIO buffer is returned.
        file_type: buffer format selector when output_name is None
            ("csv" suffix -> CSV, otherwise xlsx).
        converted_only: treat data as plain {column: values} without
            skipped info.

    Returns:
        absolute output path or an error message string when output_name is
        given; otherwise a BytesIO holding the encoded table. None when
        there is nothing to export.
    """
    file_info = None
    converted_df = pd.DataFrame()
    not_converted_df = pd.DataFrame()
    if data and not converted_only:
        not_converted_dct = {}
        df_lst = []
        for k in data:
            if isinstance(data[k], dict):
                k_pairs = data[k].get("converted", [])
                k_not_converted = data[k].get("skipped", [])
                if k_pairs and isinstance(k, str):
                    df_lst.append(pd.DataFrame(k_pairs, columns=[k, f"{k}_converted"]))
                if k_not_converted:
                    not_converted_dct[f"{k}_skipped"] = k_not_converted
            elif isinstance(data[k], ConvertedListData):
                k_pairs = data[k].converted
                if k_pairs and isinstance(k, str):
                    df_lst.append(pd.DataFrame(k_pairs, columns=[k, f"{k}_converted"]))
                k_not_converted = data[k].skipped
                if k_not_converted:
                    not_converted_dct[f"{k}_skipped"] = k_not_converted
            elif isinstance(data[k], list) and k == "converted":
                k_pairs = data.get("converted", [])
                if k_pairs:
                    df_lst.append(pd.DataFrame(k_pairs, columns=["input", "converted"]))
            elif isinstance(data[k], list) and k == "skipped":
                k_not_converted = data.get("skipped", [])
                if k_not_converted:
                    not_converted_dct["skipped"] = k_not_converted
        if df_lst:
            converted_df = pd.concat(df_lst, axis=1)
        if not_converted_dct:
            # transpose so each "*_skipped" key becomes a column
            not_converted_df = pd.DataFrame.from_dict(
                not_converted_dct, orient="index"
            ).T
    elif data and converted_only:
        converted_df = pd.DataFrame(
            {key: pd.Series(value) for key, value in data.items()}
        )
    if not converted_df.empty:
        if output_name:
            try:
                if isinstance(output_name, Path):
                    output_name = output_name.as_posix()
                if not isinstance(output_name, str):
                    # fixed: previously the error message was built but
                    # output_name.lower() still ran and raised AttributeError
                    return f"[Type error] Cannot create file: {output_name} as output."
                if output_name.lower().endswith("csv"):
                    converted_df.to_csv(output_name)
                else:
                    converted_df.to_excel(
                        output_name, sheet_name="converted", index=False
                    )
                file_info = get_abs_path(output_name)
            except IOError:
                file_info = f"[IO error] Cannot create file: {output_name} as output."
        else:
            file_info = BytesIO()
            if file_type.lower().endswith("csv"):
                file_info.write(converted_df.to_csv().encode("utf-8"))
            else:
                # write to BytesIO instead of a file path
                output_writer = pd.ExcelWriter(file_info, engine="openpyxl")
                converted_df.to_excel(
                    output_writer, sheet_name="converted", index=False
                )
                if not not_converted_df.empty:
                    not_converted_df.to_excel(
                        output_writer, sheet_name="skipped", index=False
                    )
                # close() also saves; ExcelWriter.save() was removed in pandas 2.0
                output_writer.close()
            file_info.seek(0)
    return file_info
def create_linker_output(
    data: dict,
    output_name: Union[str, Path] = None,
    file_type: str = ".xlsx",
    export_url: bool = True,
) -> Union[BytesIO, str]:
    """
    Export linker results (per-lipid cross-resource IDs and links) to a
    file on disk or to an in-memory buffer.

    Args:
        data: {sheet: {"export_file_data": {lipid: {...}}}} as produced by
            the linker; each lipid dict carries names and "resource_data".
        output_name: target file path; when None a BytesIO buffer is returned.
        file_type: buffer format selector when output_name is None
            ("csv" suffix -> CSV, otherwise xlsx).
        export_url: include the "Link_*" URL columns in the output.

    Returns:
        absolute output path or an error message string when output_name is
        given; otherwise a BytesIO holding the encoded table(s).
    """
    file_info = None
    file_linked_resources = {}
    if data:
        for sheet in data:
            sheet_linked_resources = {}
            sheet_data = data.get(sheet, {})
            sheet_export_data = sheet_data.get("export_file_data", {})
            idx = 1
            for lipid_name in sheet_export_data:
                lipid_resources = {}
                if isinstance(sheet_export_data[lipid_name], dict):
                    lipid_resources["Input_Name"] = sheet_export_data[lipid_name].get(
                        "lipid_name", ""
                    )
                    lipid_resources["Shorthand_Notation"] = sheet_export_data[
                        lipid_name
                    ].get("shorthand_name", "")
                    lipid_resources["LipidLynxX"] = sheet_export_data[lipid_name].get(
                        "lynx_name", ""
                    )
                    lipid_resources["BioPAN"] = sheet_export_data[lipid_name].get(
                        "biopan_name", ""
                    )
                    resource_data = sheet_export_data[lipid_name].get(
                        "resource_data", {}
                    )
                    for db_group in resource_data:
                        db_group_resources = resource_data[db_group]
                        for db in db_group_resources:
                            db_resources = db_group_resources.get(db)
                            if db_resources and isinstance(db_resources, dict):
                                # single ID: plain text; multiple IDs: JSON lists
                                if len(list(db_resources.keys())) < 2:
                                    lipid_resources[db] = ";".join(
                                        list(db_resources.keys())
                                    )
                                    lipid_resources[f"Link_{db}"] = ";".join(
                                        [db_resources.get(i) for i in db_resources]
                                    )
                                else:
                                    lipid_resources[db] = json.dumps(
                                        list(db_resources.keys())
                                    )
                                    lipid_resources[f"Link_{db}"] = json.dumps(
                                        [db_resources.get(i) for i in db_resources]
                                    )
                            else:
                                lipid_resources[db] = ""
                sheet_linked_resources[idx] = lipid_resources
                idx += 1
            file_linked_resources[sheet] = sheet_linked_resources
    default_col = ["Input_Name", "Shorthand_Notation", "LipidLynxX", "BioPAN"]
    file_linked_df_dct = {}
    if file_linked_resources:
        for sheet in file_linked_resources:
            sum_df = pd.DataFrame(data=file_linked_resources.get(sheet)).T
            # fixed: the original removed items from the list it was iterating,
            # which silently skipped the element following each removal
            link_cols = []
            other_cols = []
            for col in sum_df.columns.tolist():
                if col.startswith("Link_"):
                    # URL columns are only kept when export_url is set
                    if export_url:
                        link_cols.append(col)
                elif col not in default_col:
                    other_cols.append(col)
            ordered_cols = default_col + natsorted(other_cols) + natsorted(link_cols)
            linked_df = pd.DataFrame(sum_df, columns=ordered_cols)
            file_linked_df_dct[sheet] = linked_df
    if output_name:
        try:
            if isinstance(output_name, Path):
                output_name = output_name.as_posix()
            if not isinstance(output_name, str):
                # fixed: previously the error message was built but
                # output_name.lower() still ran and raised AttributeError
                return f"[Type error] Cannot create file: {output_name} as output."
            if output_name.lower().endswith("csv"):
                # CSV cannot hold multiple sheets: export the first sheet only
                for s in file_linked_df_dct:
                    file_linked_df_dct.get(s, pd.DataFrame()).to_csv(output_name)
                    break
            else:
                output_writer = pd.ExcelWriter(output_name, engine="openpyxl")
                for s in file_linked_df_dct:
                    s_df = file_linked_df_dct.get(s, pd.DataFrame())
                    if not s_df.empty:
                        s_df.to_excel(output_writer, sheet_name=s)
                # close() also saves; ExcelWriter.save() was removed in pandas 2.0
                output_writer.close()
            file_info = get_abs_path(output_name)
        except IOError:
            file_info = f"[IO error] Cannot create file: {output_name} as output."
    else:
        file_info = BytesIO()
        if file_type.lower().endswith("csv"):
            # CSV cannot hold multiple sheets: export the first sheet only
            for s in file_linked_df_dct:
                s_df = file_linked_df_dct.get(s, pd.DataFrame())
                file_info.write(s_df.to_csv().encode("utf-8"))
                break
        else:
            # write to BytesIO instead of a file path
            output_writer = pd.ExcelWriter(file_info, engine="openpyxl")
            for s in file_linked_df_dct:
                s_df = file_linked_df_dct.get(s, pd.DataFrame())
                if not s_df.empty:
                    s_df.to_excel(output_writer, sheet_name=s)
            output_writer.close()
        file_info.seek(0)
    return file_info
from lynx.utils.basics import get_abs_path from lynx.utils.cfg_reader import app_cfg_info from lynx.utils.params_loader import ( build_mod_parser, build_input_rules, build_output_rules, ) from lynx.utils.ports import check_port # Define default values across LipidLynx # load default values from files defined in config.ini # following parameters generated will be used as global values default_input_rules = build_input_rules(app_cfg_info["input_rules"]) default_output_rules = build_output_rules(app_cfg_info["output_rules"]) default_cv_file = get_abs_path(app_cfg_info["controlled_vocabularies"]) default_alias_file = get_abs_path(app_cfg_info["defined_alias"]) default_kegg_file = get_abs_path(app_cfg_info["resource_kegg"]) default_lion_file = get_abs_path(app_cfg_info["resource_lion"]) default_temp_folder = app_cfg_info.get("temp_folder", r"lynx/temp") default_temp_max_days = int(app_cfg_info.get("temp_max_days", "3")) default_temp_max_files = int(app_cfg_info.get("temp_max_files", "99")) default_zmq_worker_runner = int(app_cfg_info.get("zmq_worker_runner", 5)) if os.path.isdir(default_temp_folder): pass else: os.mkdir(default_temp_folder) default_temp_folder = get_abs_path(default_temp_folder) with open(default_cv_file, "r") as cv_js: