def df_check(df=[], meta_id=""):
    source_data = get_meta_data(meta_id=meta_id)
    meta_name = source_data["name"]
    meta_type = source_data["d_type"]
    schema_data = get_schema_data(meta_name=meta_name)
    # check index column is unique
    if "index" in schema_data:
        index_property = schema_data["index"]
        dup_check(df, index_property)
    outDir = make_outDir(meta_id)
    df_types = df.dtypes.apply(lambda x: x.name).to_dict()
    header = compare_df_to_schema(df_types, schema_data, meta_type)
    return header
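# Note: dup_check, get_meta_data, get_schema_data, make_outDir and
# compare_df_to_schema are helpers defined elsewhere in the repo. As a rough
# sketch only (assumed behaviour, not the actual implementation), dup_check
# could look something like:
#
#   def dup_check(df, index_property):
#       # the index column must be unique to be usable as a neo4j node ID
#       dups = df[index_property].duplicated().sum()
#       if dups > 0:
#           logger.error("{} duplicate values in index column {}", dups, index_property)
#           exit()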
def single_source(meta_id=""):
    meta_data = get_meta_data(meta_id=meta_id)
    out_dir = make_outDir(meta_id=meta_id)
    csv_file = os.path.join(out_dir, meta_id + ".csv.gz")
    csv_header = os.path.join(out_dir, meta_id + ".header")
    # create symlinks for import statements
    source_file = os.path.join(out_dir, meta_id + "-import-nodes.txt")
    target_file = os.path.join(merge_dir, meta_data["name"] + "-import-nodes.txt")
    create_sym_link(source=source_file, target=target_file)
    # create symlinks for constraint statements
    source_file = os.path.join(out_dir, meta_id + "-constraint.txt")
    target_file = os.path.join(merge_dir, meta_data["name"] + "-constraint.txt")
    if os.path.exists(source_file):
        create_sym_link(source=source_file, target=target_file)
def get_source_data(dname="all"):
    logger.info('Running merge with "{}" data types', dname)
    source_data = get_meta_data(meta_id="all")
    node_d = find_multiple(source_data, "nodes")
    if dname != "all":
        node_d = {dname: node_d[dname]}
    logger.debug(node_d)
    for i in node_d:
        # check if already done
        f = os.path.join(merge_dir, i + ".csv.gz")
        logger.debug("Checking if already done {}", f)
        if os.path.exists(f):
            logger.info("Already processed {}", i)
        else:
            logger.info("Processing node: {} ...", i)
            if len(node_d[i]) > 1:
                df_merged = merge_source(node_d[i])
                write_new_merged_files(df_merged, i)
            else:
                single_source(node_d[i][0])
            logger.info("Processed node: {}", i)
import os

from workflow.scripts.utils import settings
from workflow.scripts.utils.general import get_meta_data

env_configs = settings.env_configs

graph_bolt_port = env_configs["graph_bolt"]
graph_user = env_configs["graph_user"]
graph_password = env_configs["graph_pass"]
neo4j_import_dir = env_configs["neo4j_import_dir"]

constraints = []
import_nodes = []
import_rels = []

source_data = get_meta_data(meta_id="all")

# loop through nodes merged directory
d = os.path.join(neo4j_import_dir, "nodes", "merged")
for filename in os.listdir(d):
    if filename.endswith("constraint.txt"):
        with open(os.path.join(d, filename)) as f:
            for line in f:
                if not line.startswith("#"):
                    constraints.append("echo '" + line.rstrip() + "'")
                    constraints.append(
                        "cypher-shell -a bolt://localhost:"
                        + graph_bolt_port
                        + " -u "
                        + graph_user
                        + " -p "
def merge_source(meta_ids=[]):
    logger.debug("multi source {}", meta_ids)
    data_frames = []
    index_col = ""
    for i in meta_ids:
        logger.info("Processing meta_id: {}", i)
        meta_data = get_meta_data(i)
        schema_data = get_schema_data(meta_data["name"])
        logger.debug(schema_data)
        out_dir = make_outDir(meta_id=i)
        if args.nrows is not None:
            args.nrows = int(args.nrows)
        df = create_df(data_dir=out_dir, name=i, nrows=args.nrows)
        # make index column a string to avoid merge issues, e.g. float and object
        index_col = f"{schema_data['index']}:ID({meta_data['name']}-ID)"
        # logger.debug('index_col {}',index_col)
        # don't need to fix int/float issues anymore as reading everything in as strings
        # df = column_zero_fix(df)
        logger.debug("\n{}", df.head())
        logger.debug("\n{}", df.dtypes)
        data_frames.append(df)
        # get the constraints (not sure how to deal with multiple constraint files, assume they are the same...?)
        source_file = os.path.join(out_dir, i + "-constraint.txt")
        target_file = os.path.join(merge_dir, meta_data["name"] + "-constraint.txt")
        if os.path.exists(source_file):
            create_sym_link(source=source_file, target=target_file)
    logger.debug("index column: {}", index_col)
    # merge the dataframes on index
    logger.info("Merging {}", meta_ids)
    df_merged = reduce(
        lambda left, right: pd.merge(left, right, on=[index_col], how="outer"),
        data_frames,
    ).fillna("")
    logger.debug("\n{}", df_merged.head())
    # find duplicate source columns and aggregate
    source_cols = df_merged.filter(regex="^_source.*", axis=1)
    logger.info("Aggregating source columns {}", source_cols.columns)
    # aggregate into neo4j array style (separated by ;)
    source_agg = source_cols.agg(lambda x: ";".join(y for y in x if y != ""), axis=1)
    logger.debug("\n{}", source_agg.value_counts())
    # drop the merged source columns
    drop_cols = list(df_merged.filter(regex="^_source.*"))
    logger.debug("dropping cols {}", drop_cols)
    df_merged.drop(drop_cols, inplace=True, axis=1)
    # df_merged = df_merged[df_merged.columns.drop(drop_cols)]
    df_merged["_source:string[]"] = source_agg
    # check for column conflicts, e.g. b_x and b_y
    logger.info("Running conflict check with {} threads", THREADS)
    df_merged = column_conflict_check(df_merged)
    logger.debug("\n{}", df_merged.head())
    # issue with merging adding .0 to integers
    df_merged = column_zero_fix(df_merged)
    # convert entire df to strings as don't need integers for neo4j import
    df_merged = df_merged.applymap(str)
    # need to convert nan to empty string
    df_merged = df_merged.replace("nan", "")
    df_merged = df_merged.replace("None", "")
    # logger.debug("\n{}",df_merged)
    return df_merged
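# Illustrative sketch only (not part of the pipeline) of the outer-merge and
# _source aggregation pattern used in merge_source; the frames and column
# names here are hypothetical:
#
#   left = pd.DataFrame({"id:ID(Gene-ID)": ["g1", "g2"], "_source": ["a", "a"]})
#   right = pd.DataFrame({"id:ID(Gene-ID)": ["g2", "g3"], "_source": ["b", "b"]})
#   merged = pd.merge(left, right, on=["id:ID(Gene-ID)"], how="outer").fillna("")
#   # pandas suffixes the clashing columns as _source_x / _source_y, which the
#   # ^_source.* filter above picks up and joins into one ';'-separated neo4j array
#   merged["_source:string[]"] = merged.filter(regex="^_source.*", axis=1).agg(
#       lambda x: ";".join(y for y in x if y != ""), axis=1
#   )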
def create_import(df=[], meta_id="", import_type="import"):
    # qc the df
    schema_cols = df_check(df, meta_id)
    logger.info("Matched these columns {}", schema_cols)

    # add source column to node headers and df if node
    # meta_data = get_meta_data(meta_id)
    # if meta_data["d_type"] == "nodes":
    #     schema_cols.append("source:string[]")
    #     df["source:string[]"] = meta_data["source"]

    # add source info to nodes and rels
    meta_data = get_meta_data(meta_id)
    schema_cols.append("_source:string[]")
    df["_source:string[]"] = meta_data["source"]

    # add meta cols _name and _id to nodes
    if meta_data["d_type"] == "nodes":
        source_data = get_meta_data(meta_id=meta_id)
        meta_name = source_data["name"]
        schema_data = get_schema_data(meta_name=meta_name)
        logger.debug(schema_data)
        node_meta = node_meta_check(schema_data)
        # get type for _name and _id col
        name_col_type = schema_data["properties"][node_meta["_name"]]["type"]
        name_col_text = f"_name:{name_col_type}"
        id_col_type = schema_data["properties"][node_meta["_id"]]["type"]
        id_col_text = f"_id:{id_col_type}"
        # add to schema cols
        schema_cols.extend([name_col_text, id_col_text])
        # add to dataframe
        df[name_col_text] = df[node_meta["_name"]]
        df[id_col_text] = df[node_meta["_id"]]
        logger.debug("\n{}", df.head())
        # add indexes for meta properties
        constraintCommands = [
            f"CREATE index on :{meta_name}(_name);",
            f"CREATE index on :{meta_name}(_id);",
        ]
        create_constraints(constraintCommands, meta_id)

    # create copy of header for import creation
    logger.info("Creating import statement")
    import_header = schema_cols.copy()
    create_import_commands(
        header=import_header, meta_id=meta_id, import_type=import_type
    )

    outDir = make_outDir(meta_id)
    # logger.debug(outDir)
    file_name = os.path.join(outDir, meta_id + ".csv.gz")
    df.to_csv(
        file_name, index=False, header=False, compression="gzip", columns=schema_cols
    )

    # run pandas profiling
    com = f"sh workflow/scripts/utils/pandas-profiling.sh {outDir} {meta_id} {THREADS}"
    logger.debug(com)
    try:
        out = subprocess.check_output(com, shell=True)
        logger.info(out)
    except subprocess.CalledProcessError:
        logger.error(
            "Pandas profiling didn't work, perhaps you haven't installed shuf, see README.md?"
        )
        exit()

    # backup
    backup_processed_data(outDir, meta_id, meta_data["d_type"])
def create_import_commands(header, meta_id, import_type):
    outDir = make_outDir(meta_id)
    metaData = get_meta_data(meta_id)
    source_data = get_meta_data(meta_id=meta_id)
    meta_name = source_data["name"]
    meta_type = source_data["d_type"]
    schema_data = get_schema_data(meta_name=meta_name)
    # logger.debug(schema_data)
    if meta_type == "nodes":
        # convert node ID property to neo4j style
        if "index" in schema_data:
            index_property = schema_data["index"]
            li = header.index(index_property)
            logger.info("Index = {} {}", index_property, li)
            header[li] = index_property + ":ID(" + meta_name + "-ID)"
            logger.info(header)
        else:
            logger.error("Schema has no index, exiting")
            exit()
        # add meta _name and _id
        node_meta = node_meta_check(schema_data)
        # header.extend(['_name','_id'])
    elif meta_type == "rels":
        # convert relationship source/target properties to neo4j START/END style
        source_index = header.index("source")
        source_id = schema_data["properties"]["source"]["type"]
        target_index = header.index("target")
        target_id = schema_data["properties"]["target"]["type"]
        header[source_index] = ":START_ID(" + source_id + "-ID)"
        header[target_index] = ":END_ID(" + target_id + "-ID)"
    # add property types
    for i, item in enumerate(header):
        if item in schema_data["properties"]:
            property_type = schema_data["properties"][item]["type"]
            # deal with arrays
            if property_type == "array":
                items_type = schema_data["properties"][item]["items"]["type"]
                property_type = f"{items_type}[]"
            elif property_type == "integer":
                property_type = "int"
            header[i] = item + ":" + property_type
    write_header(
        dir=outDir,
        headerData={
            "fileName": meta_id + ".header",
            "data": ",".join(header),
        },
    )
    # don't create import statements for load csv data
    if import_type != "load":
        write_import(
            id=meta_id,
            dir=outDir,
            importCommands=[
                {
                    "type": metaData["d_type"],
                    "name": metaData["name"],
                    "file": os.path.join(
                        "import", metaData["d_type"], meta_id, meta_id + ".csv.gz"
                    ),
                    "header": os.path.join(
                        "import", metaData["d_type"], meta_id, meta_id + ".header"
                    ),
                }
            ],
        )
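# Example of the header transformation above for a hypothetical node schema
# (meta_name "Gene", index "id", properties id/name typed as string and
# synonyms as an array of strings):
#
#   header in:  ["id", "name", "synonyms"]
#   header out: ["id:ID(Gene-ID)", "name:string", "synonyms:string[]"]
#
# The index column is rewritten first, so it no longer matches a schema
# property name and keeps its :ID(...) form in the property-type loop.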