from collections import Counter
from itertools import islice, zip_longest

import numpy as np

import raw_taq


def count_chunk_elements1(fname, chunksize=1000000, max_chunk=None,
                          process_chunk=False):
    """Count occurrences of each symbol root across up to max_chunk chunks."""
    symbol_roots = Counter()
    for i, chunk in enumerate(
            islice(raw_taq.TAQ2Chunks(fname, chunksize=chunksize,
                                      process_chunk=process_chunk),
                   max_chunk)):
        # np.unique returns (unique_values, counts) when return_counts=True;
        # Counter.update with a mapping adds the per-chunk counts.
        counts = np.unique(chunk[:]['Symbol_root'], return_counts=True)
        symbol_roots.update(dict(zip_longest(counts[0], counts[1])))
        # print("\r {0}".format(i), end="")
    return symbol_roots
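# A minimal sketch (not part of the original script) of the counting step
# above, run on a toy structured array instead of real TAQ data. The field
# name 'Symbol_root' matches the dtype used by count_chunk_elements1; the
# symbol values are made up.
def _demo_symbol_root_counting():
    toy = np.array([(b'AAPL',), (b'IBM',), (b'AAPL',)],
                   dtype=[('Symbol_root', 'S10')])
    vals, counts = np.unique(toy['Symbol_root'], return_counts=True)
    c = Counter(dict(zip(vals, counts)))
    return c.most_common()   # [(b'AAPL', 2), (b'IBM', 1)]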
def test_row_values(fname):
    """Check that the first row of the first chunk matches the expected
    values from the [file1-row-values] section of the test config."""
    sample = taq.TAQ2Chunks(sample_data_dir + fname)
    chunk = next(sample.iter_)
    assert len(chunk) == sample.chunksize

    # Expected first-row values, keyed by lower-cased field name
    # (configparser lower-cases option names by default)
    first_row_vals = dict(config.items('file1-row-values'))
    print(first_row_vals)

    field_mapping = {}
    for i, field in enumerate(chunk.dtype.names):
        field_lower = field.lower()
        field_mapping[field_lower] = str(chunk[0][i])
        assert field_mapping[field_lower] == first_row_vals[field_lower]
    print(field_mapping)
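# test_row_values assumes a module-level configparser instance `config`
# whose [file1-row-values] section maps lower-cased field names to the
# expected first-row strings. A minimal sketch of building such a config
# (the section name comes from the test; the field values below are
# hypothetical):
def _demo_expected_config():
    import configparser
    cfg = configparser.ConfigParser()
    cfg.read_string("[file1-row-values]\n"
                    "symbol_root = AAPL\n"
                    "bid_price = 111.01\n")
    return dict(cfg.items('file1-row-values'))
    # {'symbol_root': 'AAPL', 'bid_price': '111.01'}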
#!/usr/bin/env python3
import pandas as pd

import raw_taq


def process_chunks(taq):
    """Accumulate every chunk from a TAQ reader into one DataFrame."""
    chunk_gen = taq.convert_taq(20)  # generator yielding one chunk at a time
    first_chunk = next(chunk_gen)
    accum = pd.DataFrame(first_chunk)
    for chunk in chunk_gen:
        # DataFrame.append returns a new frame; the original discarded
        # the result, so only the first chunk was ever kept
        accum = accum.append(pd.DataFrame(chunk))
    print(accum)


if __name__ == '__main__':
    # fname = '../local_data/EQY_US_ALL_BBO_20150102.zip'
    # fname = '../local_data/EQY_US_ALL_BBO_20140206.zip'
    from sys import argv
    fname = '../local_data/EQY_US_ALL_BBO_201501' + argv[1] + '.zip'
    print("processing", fname)
    local_taq = raw_taq.TAQ2Chunks(fname)
    process_chunks(local_taq)
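# Note: DataFrame.append is deprecated in recent pandas. A sketch of the
# same accumulation using pd.concat, which builds the result in one pass
# instead of copying the growing frame on every iteration (the helper name
# is ours, not the original script's):
def _process_chunks_concat(taq):
    frames = [pd.DataFrame(chunk) for chunk in taq.convert_taq(20)]
    return pd.concat(frames, ignore_index=True)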
if __name__ == '__main__':
    options = read_command(sys.argv)
    # Prompt user to overwrite previous output files
    clear_log_dir()

    for test_file in DATA_FILES:
        # Generate a name for the output file. Assumes a filename of the
        # form "EQY_US_ALL_BBO_YYYYMMDD.zip", so characters 15:23 are the date.
        out_name = test_file[15:23]

        # type(sample) is raw_taq.TAQ2Chunks
        sample = taq.TAQ2Chunks(test_file)
        print("+++ Creating log file for [" + test_file + "] as ./test-logs/"
              + out_name + "_log.txt")
        with open("test-logs/" + out_name + "_log.txt", 'w') as log:
            for chunk in sample.iter_:
                # chunk is a numpy structured array; sort its fields by name
                # and render each field's dtype as a string
                sorted_dtype = [(name, str(field[0]))
                                for name, field in sorted(
                                    chunk.dtype.fields.items(),
                                    key=lambda k: k[0])]
                for attr, dtype_str in sorted_dtype:
                    log.write(attr + " ")
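# A small self-contained sketch of the sorted_dtype trick above: sorting a
# structured array's fields alphabetically and rendering each field's dtype
# as a string. The toy dtype below is made up, not the real BBO layout.
def _demo_sorted_dtype():
    import numpy as np
    toy = np.zeros(1, dtype=[('Time', 'S9'), ('Bid_Price', 'f8'),
                             ('Symbol_root', 'S6')])
    return [(name, str(field[0]))
            for name, field in sorted(toy.dtype.fields.items())]
    # [('Bid_Price', 'float64'), ('Symbol_root', '|S6'), ('Time', '|S9')]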
if __name__ == '__main__':
    import time
    from sys import argv

    t0 = time.time()
    fname = ("/global/scratch/aculich/mirror/EQY_US_ALL_BBO/"
             "EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/"
             "EQY_US_ALL_BBO_20150102.zip")
    # Note: this reader is constructed but never used;
    # count_chunk_elements1 opens its own reader from fname.
    chunks = raw_taq.TAQ2Chunks(fname, chunksize=1, process_chunk=False)

    try:
        max_chunk = int(argv[1])
    except (IndexError, ValueError):
        max_chunk = None

    c = count_chunk_elements1(fname, max_chunk=max_chunk)
    t1 = time.time()

    print("total number of records", sum(c.values()))
    print("timing:", t0, t1, t1 - t0)
    # Report the 100 most common symbol roots. The original loop body was
    # truncated; printing rank, root, and count is our best reading of it.
    for i, (k, v) in enumerate(islice(c.most_common(), 100)):
        print(i, k, v)
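# A tiny sketch of the reporting loop above on made-up counts, showing how
# islice caps Counter.most_common at a fixed number of rows:
def _demo_report():
    from collections import Counter
    from itertools import islice
    c = Counter({b'SPY': 5, b'AAPL': 3, b'IBM': 1})
    for i, (k, v) in enumerate(islice(c.most_common(), 100)):
        print(i, k, v)   # rank, symbol root, count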