def analyze_csv(file_path, model_ml=None, num_rows=500, date_process=TODAY,
                do_both_analysis="ml", return_probabilities=True, output_mode="ALL"):
    logger.info(f"csv_detective on {file_path}")
    try:
        if do_both_analysis:
            logger.info(f"Starting vanilla CSV Detective on file {file_path}")
            if do_both_analysis != "ml":
                dict_result = routine(file_path, num_rows=num_rows, output_mode="ALL")
            else:
                dict_result = routine(file_path, num_rows=num_rows, user_input_tests=None)
        else:
            # Get ML tagging only
            logger.info(f"Starting ML CSV Detective on file {file_path}")
            dict_result = routine(file_path, num_rows=num_rows, user_input_tests=None)
        if do_both_analysis != "rule":
            dict_result = routine_ml(csv_detective_results=dict_result,
                                     file_path=file_path,
                                     model_ml=model_ml,
                                     num_rows=num_rows,  # was hard-coded to 500, ignoring the parameter
                                     return_probabilities=return_probabilities)
            # Combine the rule-based and ML reports into a single dict
            if output_mode == "ALL" and return_probabilities:
                # Guard on the key actually used below (the original checked
                # "columns_ml", which could raise a KeyError)
                if "columns" in dict_result and "columns_ml_probas" in dict_result:
                    dict_result["columns"] = join_reports(
                        dict_rb=dict_result["columns"],
                        dict_ml=dict_result["columns_ml_probas"])
                    dict_result.pop("columns_ml_probas")
        else:
            logger.error("Only ML or RULE analysis is ongoing...")
    except Exception as e:
        logger.info(f"Analyzing file {file_path} failed with {e}")
        return {"error": str(e)}
    dict_result["analysis_date"] = date_process
    return dict_result
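# Minimal usage sketch for analyze_csv above (an assumption, not from the
# source: the ML model is a scikit-learn pipeline persisted with joblib, and
# "data.csv" / "model.joblib" are placeholder paths).
import joblib

model = joblib.load("model.joblib")  # hypothetical model artefact
report = analyze_csv("data.csv", model_ml=model, do_both_analysis="both")
print(report.get("columns", {}))  # merged rule-based + ML column-type report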
def analyze_csv(file_path, analysis_type="both", pipeline=None, num_rows=500,
                include_datasetID=None):
    logger.info(f"csv_detective on {file_path}")
    # Always define final_id; the original only set it when include_datasetID
    # was passed, which made the returns below raise a NameError otherwise
    final_id = extract_id(file_path)
    if include_datasetID:
        if final_id in include_datasetID:
            final_id = f"{include_datasetID[final_id]}/{final_id}"
        else:
            logger.info(
                f"Resource ID {final_id} not found in RESOURCEID2DATASETID dict"
            )
            final_id = f"NODATASETID/{final_id}"
    try:
        if analysis_type in ("both", "rule"):
            logger.info(f"Starting vanilla CSV Detective on file {file_path}")
            dict_result = routine(file_path, num_rows=num_rows)
            if "columns" in dict_result:
                # Strip stray quotes from header names and store the
                # rule-based report under "columns_rb"
                dict_result["columns"] = {
                    k.strip('"'): v for k, v in dict_result["columns"].items()
                }
                dict_result["columns_rb"] = dict_result.pop("columns")
        else:
            # Get ML tagging only
            logger.info(f"Starting ML CSV Detective on file {file_path}")
            dict_result = routine(file_path, num_rows=num_rows, user_input_tests=None)
        if analysis_type != "rule":
            assert pipeline is not None, "ML analysis requires a fitted pipeline"
            y_pred, csv_info = get_columns_ML_prediction(file_path, pipeline,
                                                         dict_result, num_rows=num_rows)
            dict_result["columns_ml"] = get_columns_types(y_pred, csv_info)
    except Exception as e:
        logger.info(f"Analyzing file {file_path} failed with {e}")
        return final_id, {"error": str(e)}
    dict_result["analysis_date"] = TODAY
    return final_id, dict_result
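# Usage sketch for the dataset-ID variant above (hypothetical values, not from
# the source; assumes extract_id pulls "abc-123" out of the file name).
resource2dataset = {"abc-123": "dataset-42"}  # placeholder resource->dataset map
final_id, report = analyze_csv("downloads/abc-123.csv", analysis_type="rule",
                               include_datasetID=resource2dataset)
# final_id == "dataset-42/abc-123"; report["columns_rb"] holds the rule-based types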
def test_old_detection():
    file_path = "./annuaire-de-leducation.csv"
    with open("baseline_result.json") as f:
        expected_results = sort_keys(json.load(f))
    # Open your file and run csv_detective
    inspection_results = sort_keys(routine(file_path))
    # Keep a copy of the current run on disk for manual inspection
    # (json.dump returns None, so the original assignment of its result was dropped)
    with open("current_result.json", "w") as f:
        json.dump(inspection_results, f)
    pprint(inspection_results)
    assert str(inspection_results) == str(expected_results)
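# Sketch for (re)generating the baseline the test compares against, assuming
# the same sort_keys helper is in scope (this step is not part of the source
# test file).
with open("baseline_result.json", "w") as f:
    json.dump(sort_keys(routine("./annuaire-de-leducation.csv")), f)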
def run_csv_detective(file_path):
    logger.info(f"Treating file {file_path}")
    try:
        inspection_results = routine(file_path)
    except Exception as e:
        logger.info(e)
        return
    # .get avoids a KeyError when "columns" is missing from the report
    if len(inspection_results) > 2 and inspection_results.get("columns"):
        inspection_results["file"] = file_path
        # The original passed two positional args to logger.info, which the
        # logging module interprets as %-style formatting arguments
        logger.info(f"{file_path}: {inspection_results}")
        return inspection_results
    else:
        # was "files_path", an undefined name
        logger.info(f"Analysis output of file {file_path} was empty")
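# Hedged usage sketch (the folder name is a placeholder): run the helper over
# all CSVs in a directory and keep the non-empty reports.
from pathlib import Path

reports = [run_csv_detective(p.as_posix()) for p in Path("data").glob("*.csv")]
reports = [r for r in reports if r]  # drop files whose analysis failed or was empty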
def get_csv_detective_metadata(csv_detective_cache: dict, csv_file_path: Path, num_rows=5000):
    """
    Try and get the already computed metadata of the passed csv file, whether
    from a cached dict or by calling the csv_detective routine

    :param csv_detective_cache: A csv_id:csv_detective_info dict, or an empty dict
    :param csv_file_path: The path of the currently analysed csv file
    :param num_rows: The number of rows to read during the analysis
    :return: The metadata of the csv file
    """
    csv_file_path = Path(csv_file_path)
    csv_id = csv_file_path.stem
    if csv_detective_cache and csv_id in csv_detective_cache:
        return csv_detective_cache[csv_id]
    try:
        dict_result = routine(csv_file_path.as_posix(), num_rows=num_rows)
    except Exception:  # the original bare except also swallowed SystemExit/KeyboardInterrupt
        return {}
    csv_detective_cache[csv_id] = dict_result
    with open("./data/csv_detective_analysis.json", "w") as f:
        json.dump(csv_detective_cache, f, indent=4)
    return dict_result
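# Hedged usage sketch (paths are placeholders): load a previous cache if one
# exists, then reuse or compute the metadata for a single file.
import json
from pathlib import Path

cache_path = Path("./data/csv_detective_analysis.json")
cache = json.loads(cache_path.read_text()) if cache_path.exists() else {}
metadata = get_csv_detective_metadata(cache, "data/my_file.csv")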
# Import the csv_detective package
import os
import json
from pathlib import Path

from csv_detective.explore_csv import routine

# Replace by your file path
input_folder = Path() / "tests" / "data"
output_folder = Path() / "tests" / "output_data"

for folder in os.listdir(input_folder):
    for file in os.listdir(input_folder / folder):
        # Open your file and run csv_detective
        file_path = input_folder / folder / file
        inspection_results = routine(file_path)
        # Write your file as json
        output_folder_file = output_folder / folder
        if not output_folder_file.exists():
            os.makedirs(output_folder_file)
        output_file_path = output_folder_file / file
        with open(output_file_path.with_suffix(".json"), "w", encoding="utf8") as fp:
            json.dump(inspection_results, fp, indent=4, separators=(",", ": "))
if from_cache_only:
    generator = [os.path.join(".cache_csv", x) for x in os.listdir(".cache_csv")]
    # generator = [x for x in generator if ".zip" in x]
else:
    generator = download_all()

for idx, file_path in enumerate(generator):
    print(idx, end=" ")
    if ".csv" in file_path:
        # Open your file and run csv_detective
        with open(file_path, "r") as file:
            inspection_results = routine(file, user_input_tests=list_tests)
        # Write your file as json (text mode; json.dump no longer takes an
        # encoding argument in Python 3)
        json_path = os.path.join("cache_json",
                                 os.path.basename(file_path).replace(".csv", ".json"))
        with open(json_path, "w", encoding="utf-8") as fp:
            json.dump(inspection_results, fp, indent=4, separators=(",", ": "))
        if erase_csv_cache:
            os.remove(file_path)