import pyarrow as pa
from json2parquet import convert_json


def Json2Parq(args=None):
    # pa.string() must be called: pa.field expects a DataType instance, not the factory function
    schema = pa.schema([
        pa.field('QueryID', pa.string()),
        pa.field('QueryText', pa.string()),
    ])
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"  # timestamp format used in the query logs
    input_filename = "/data/query_logs.json"
    output_filename = "/Users/ka/2020fa-final-project-kumar-anish/data/query_logs.parquet"
    # pass the schema explicitly so both columns are typed as strings
    convert_json(input_filename, output_filename, schema)
    print("done...")
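To sanity-check the conversion, the output can be read straight back with pandas. A minimal sketch, assuming pandas with a Parquet engine (pyarrow) is installed; the path matches output_filename above:

import pandas as pd

# read the converted file back and confirm both string columns survived the round trip
df = pd.read_parquet("/Users/ka/2020fa-final-project-kumar-anish/data/query_logs.parquet")
print(df.dtypes)   # QueryID and QueryText should both come back as string/object columns
print(df.head())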
import json

from json2parquet import convert_json


def invertedIndex(self, keyColumn, valueColumn, fileName, parquet=True):
    if self.df is None:
        print("No Data Available")
    else:
        # map each key-column value to the list of value-column entries it appears with
        inverted_index = dict()
        for index, row in self.df.iterrows():
            if row[keyColumn] in inverted_index:
                inverted_index[row[keyColumn]].append(row[valueColumn])
            else:
                inverted_index[row[keyColumn]] = [row[valueColumn]]
        JSONfileName = fileName + ".json"
        with open(JSONfileName, 'w') as outfile:
            json.dump(inverted_index, outfile)
        if parquet:
            convert_json(JSONfileName, fileName + ".parquet")
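The method only touches self.df, so any object holding a pandas DataFrame in a df attribute can drive it. A usage sketch with hypothetical sample data (the SimpleNamespace stand-in and the word/doc_id columns are illustrative, not from the original project):

from types import SimpleNamespace

import pandas as pd

df = pd.DataFrame({
    "word": ["data", "query", "data"],
    "doc_id": ["d1", "d2", "d3"],
})
# a namespace with a .df attribute is enough to act as `self` here
holder = SimpleNamespace(df=df)
invertedIndex(holder, "word", "doc_id", "word_index", parquet=False)
# word_index.json now maps "data" -> ["d1", "d3"] and "query" -> ["d2"]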
from json2parquet import convert_json

convert_json("../files/worldcities.json", "../files/worldcities.parquet")
from json2parquet import convert_json


def JSONtoParquet(jsonFile, fileName):
    # fileName is the output path without an extension; ".parquet" is appended here
    convert_json(jsonFile, fileName + ".parquet")
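A minimal usage sketch, reusing the worldcities file from the earlier snippet as an example input:

# writes ../files/worldcities.parquet
JSONtoParquet("../files/worldcities.json", "../files/worldcities")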
import gzip
import os
import shutil

from json2parquet import convert_json

# this script converts zipped ndjson files from path ndjson_dir_name
# to parquet files stored in '../../parquet'
script_dir = os.path.dirname(__file__)
ndjson_dir_name = "../../../zq-sample-data/zeek-ndjson"
unzipped_dir = '../../unzipped_ndj'
parquet_dir = '../../parquet'

for root, dirs, files in os.walk(ndjson_dir_name, topdown=False):
    for name in files:
        src_filename = os.path.join(script_dir, unzipped_dir, name.split('.')[0] + '.ndjson')
        dest_filename = os.path.join(script_dir, parquet_dir, name.split('.')[0] + '.parquet')
        os.makedirs(os.path.join(script_dir, unzipped_dir), exist_ok=True)
        os.makedirs(os.path.join(script_dir, parquet_dir), exist_ok=True)
        zipped_ndjson_file = os.path.join(root, name)
        print("processing " + zipped_ndjson_file)
        # decompress the gzipped ndjson file, streaming rather than reading it all into memory
        with gzip.open(zipped_ndjson_file, 'rb') as f_in:
            with open(src_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        try:
            convert_json(src_filename, dest_filename)
        except Exception as e:
            print("Failed to process the file: " + name)
            print(e)
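After a run, any one of the converted files can be spot-checked with pyarrow. A sketch; 'conn.parquet' is a hypothetical example name, not necessarily a file the sample data produces:

import pyarrow.parquet as pq

# inspect a single converted file: row count plus the inferred schema
table = pq.read_table('../../parquet/conn.parquet')
print(table.num_rows, "rows")
print(table.schema)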
# use Go IEX's pcap2json examples below

# pcap2csv parses out just open, high, low, close, volume by symbol with ns timestamps
pcap2csv < data%2Ffeeds%2F20180913%2F20180913_IEXTP1_DEEP1.0.pcap > 20180913_IEXTP1_DEEP1.0.csv

# pcap2json parses out the TCP headers and leaves all of the other message data
pcap2json < data%2Ffeeds%2F20180913%2F20180913_IEXTP1_DEEP1.0.pcap > 20180913_IEXTP1_DEEP1.0.json

# the json2parquet Python library converts the JSON to Parquet, which pandas and pyarrow work better with
from json2parquet import convert_json

# Infer schema (requires reading the dataset for column names)
convert_json('20180913_IEXTP1_DEEP1.0.json', '20180913_IEXTP1_DEEP1.0.parquet')

# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # load environment variables from a .env file if one exists
    load_dotenv(find_dotenv())

    main()
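The click stub above logs a message but does no work yet. One plausible way to wire the conversion into it, as a sketch; this body is an assumption, not the project's actual pipeline:

import logging

import click
from json2parquet import convert_json


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """Hypothetical body for the stub above: convert a raw JSON file to Parquet."""
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    convert_json(input_filepath, output_filepath)  # schema inferred from the data
    logger.info('wrote %s', output_filepath)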
from json2parquet import convert_json

columns = [
    "method", "path", "format", "controller", "action", "status",
    "duration", "view", "db", "ip", "route", "request_id",
    "req_params", "user_id", "realname", "nickname", "email",
    "source", "tags", "@timestamp", "@version"
]

convert_json('logstasher.log', 'logstasher_current.log', columns)
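Since the output name keeps a .log extension, a quick read-back confirms the file is really Parquet. A sketch using pandas:

import pandas as pd

# the output is a Parquet file despite its .log extension
df = pd.read_parquet('logstasher_current.log')
print(sorted(df.columns))  # expected to match the columns list above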