def test_comp_df():
    """Round-trip check: parallel and sliding-window compression both
    decompress back to the original column values."""
    frame = pd.DataFrame({
        "a": [1, 1, 2, 2, 1, 3, 4, 4],
        "b": ["A", "A", "B", "B", "A", "C,D", "D C", "D C"],
    })
    parallel_comp = lzhw.CompressedDF(frame, parallel=True)
    windowed_comp = lzhw.CompressedDF(frame, sliding_window=10)
    assert all(parallel_comp.compressed[1].decompress() == frame.b)
    assert all(windowed_comp.compressed[0].decompress() == frame.a)
def main():
    """CLI entry point: compress or decompress csv/excel/flat files with lzhw.

    The xlsx-to-csv conversion via a temporary VBScript was adapted from:
    https://stackoverflow.com/questions/28766133/faster-way-to-read-excel-files-to-pandas-dataframe
    https://stackoverflow.com/questions/1858195/convert-xls-to-csv-on-command-line
    """
    # VBScript run with cscript.exe (Windows) to export one worksheet as csv;
    # reading csv is much faster than reading xlsx directly with pandas.
    vbscript = """if WScript.Arguments.Count < 3 Then
    WScript.Echo "Please specify the source and the destination files. Usage: ExcelToCsv <xls/xlsx source file> <csv destination file> <worksheet number (starts at 1)>"
    Wscript.Quit
End If

csv_format = 6

Set objFSO = CreateObject("Scripting.FileSystemObject")
src_file = objFSO.GetAbsolutePathName(Wscript.Arguments.Item(0))
dest_file = objFSO.GetAbsolutePathName(WScript.Arguments.Item(1))
worksheet_number = CInt(WScript.Arguments.Item(2))

Dim oExcel
Set oExcel = CreateObject("Excel.Application")
Dim oBook
Set oBook = oExcel.Workbooks.Open(src_file)
oBook.Worksheets(worksheet_number).Activate

oBook.SaveAs dest_file, csv_format

oBook.Close False
oExcel.Quit
"""

    def is_number(s):
        """Return True when *s* parses as a float (detects 1-based index selectors)."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def csv_reader(file, cols, col_arg, nh_arg):
        """Read *file* as csv, honouring the --columns and --no-header flags.

        cols is a comma-separated list of names or 1-based indices; col_arg /
        nh_arg are the raw --columns / --no-header CLI values.
        """
        h = None if nh_arg else 0  # header row index, or None for headerless data
        if col_arg:
            cols_used = cols.split(",")
            if is_number(cols_used[0]):
                # CLI indices are 1-based; pandas usecols expects 0-based.
                cols_used = [int(i) - 1 for i in cols_used]
        else:
            cols_used = None
        data = pd.read_csv(file, header=h, usecols=cols_used)
        # Normalize column labels to strings so later lookups are uniform.
        data.columns = list(map(str, data.columns))
        return data

    parser = argparse.ArgumentParser(
        description=(
            "LZHW is a tabular data compression tool. "
            "It is used to compress excel, csv and any flat file. \n"
            "Version: 0.0.10"
        ))
    parser.add_argument("-d", "--decompress",
                        help="decompress input into output",
                        action="store_true",
                        default=False)
    parser.add_argument("-f", "--input",
                        help="input file to be (de)compressed",
                        type=str,
                        required=True)
    parser.add_argument("-o", "--output",
                        help="output where to save result",
                        type=str,
                        required=True)
    parser.add_argument(
        "-c", "--columns",
        nargs="+",
        help="select specific columns by names or indices (1-based) to compress or decompress",
        type=str,
        required=False)
    parser.add_argument("-r", "--rows",
                        help="select specific rows to decompress (1-based)",
                        type=str,
                        required=False)
    parser.add_argument(
        "-nh", "--no-header",
        help="skip header / data to be compressed has no header",
        action="store_true",
        default=False)
    parser.add_argument("-p", "--parallel",
                        help="compress or decompress in parallel",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "-j", "--jobs",
        help="Number of CPUs to use if parallel (default all but 2)",
        type=str,
        required=False,
        default="-3")
    args = vars(parser.parse_args())

    file = args["input"]
    output = args["output"]
    para = args["parallel"]
    n_jobs = args["jobs"]
    cols = args["columns"][0] if args["columns"] else "all"
    n_rows = int(args["rows"]) if args["rows"] else 0

    if args["decompress"]:
        start = time()
        if cols != "all":
            cols = cols.split(",")
            if is_number(cols[0]):
                cols = [int(i) - 1 for i in cols]  # 1-based CLI -> 0-based
        if para:
            decompressed = lzhw.decompress_df_from_file(file, cols, n_rows,
                                                        parallel=para,
                                                        n_jobs=int(n_jobs))
        else:
            decompressed = lzhw.decompress_df_from_file(file, cols, n_rows)
        # Missing values come back as NaN / "nan"; blank them out for output.
        decompressed.fillna("", inplace=True)
        decompressed = decompressed.replace("nan", "", regex=True)
        if "xls" in output:
            # Keep strings verbatim: don't let xlsxwriter turn them into formulas/urls.
            options = {"strings_to_formulas": False, "strings_to_urls": False}
            writer = pd.ExcelWriter(output, engine="xlsxwriter", options=options)
            decompressed.to_excel(writer, output.split(".xls")[0], index=False)
            writer.save()
        elif "csv" in output:
            # BUG FIX: this was a separate `if`, so an Excel output also fell
            # through to the text `else` below and overwrote the workbook.
            decompressed.to_csv(output, index=False)
        else:
            with open(output, "w") as o:
                decompressed.to_string(o, index=False)
        print("Finalizing Decompression ...")
        print(f"Creating {output} file ...")
        print("time taken: ", (time() - start) / 60, " minutes")
        print("Decompressed Successfully")
    else:
        start = time()
        if "xls" in file:
            print(
                "Reading files, Can take 1 minute or something ...",
                "\nRunning CScript.exe to convert xls file to csv for better performance",
                "\n")
            # Write the helper script, convert sheet 1 to csv, then clean up.
            with open("excel_to_csv.vbs", "w") as f:
                f.write(vbscript)
            csv_file = file.split(".xls")[0] + "1" + ".csv"
            call(["cscript.exe", "excel_to_csv.vbs", file, csv_file, "1"])
            os.remove("excel_to_csv.vbs")
            data = csv_reader(csv_file, cols, args["columns"], args["no_header"])
            os.remove(csv_file)
        elif "csv" in file:
            print("Reading files ...")
            data = csv_reader(file, cols, args["columns"], args["no_header"])
        else:
            # Anything else is treated as one flat text blob.
            with open(file, "r") as i:
                data = i.read()
        if para:
            comp_df = lzhw.CompressedDF(data, parallel=para, n_jobs=int(n_jobs))
        else:
            comp_df = lzhw.CompressedDF(data)
        print("Finalizing Compression ...")
        comp_df.save_to_file(output)
        print(f"Creating {output} file ...")
        print("time taken: ", (time() - start) / 60, " minutes")
        print("Compressed Successfully")
# Tail of the simple CLI: parse args, then dispatch on compress/decompress.
# (`parser` is created earlier in this script.)
parser.add_argument("-o", "--output",
                    help="output where to save result",
                    type=str,
                    required=True)
args = vars(parser.parse_args())
file = args["input"]
output = args["output"]

if args["decompress"]:
    decompressed = lzhw.decompress_df_from_file(file)
    if "xls" in output:
        decompressed.to_excel(output, index=False)
    elif "csv" in output:
        # BUG FIX: was a separate `if`; an Excel output previously fell into
        # the text `else` below and overwrote the freshly written workbook.
        decompressed.to_csv(output, index=False)
    else:
        with open(output, "w") as o:
            decompressed.to_string(o, index=False)
    print("decompressed successfully")
else:
    if "xls" in file:
        data = pd.read_excel(file)
    elif "csv" in file:
        # BUG FIX: was a separate `if`; an xls input previously fell into the
        # text `else` below, clobbering `data` with the raw file contents.
        data = pd.read_csv(file)
    else:
        with open(file, "r") as i:
            data = i.read()
    comp_df = lzhw.CompressedDF(data)
    comp_df.save_to_file(output)
    print("compressed successfully")
def test_comp_df():
    """CompressedDF should decompress the string column back to its values."""
    sample = pd.DataFrame({
        "a": [1, 1, 2, 2, 1, 3, 4, 4],
        "b": ["A", "A", "B", "B", "A", "C,D", "D C", "D C"],
    })
    compressed = lzhw.CompressedDF(sample)
    expected = [str(value) for value in sample.b]
    assert compressed.compressed[1].decompress() == expected
import flask import lzhw from time import time import pandas as pd from flask import jsonify, request app = flask.Flask(__name__) app.config["DEBUG"] = True df = pd.read_csv("1500000 Sales Records.csv") comp = lzhw.CompressedDF(df) for i in range(len(df.columns)): comp.compressed[i].compressed = [ bin(i)[2:] for i in comp.compressed[i].compressed ] @app.route('/full', methods=['GET']) def get_full(): col = int(request.args["col"]) start = time() try: return df.iloc[:, col].to_json() finally: print(time() - start) @app.route('/compressed', methods=['GET']) def get_compressed(): col = int(request.args["col"]) try: