def write_summary(self):
    """Record the final parse statistics for the current output file,
    close the h5 file, and write its on-disk summary."""
    # Fill in the remaining summary fields before sealing the file.
    manager = self.__parse_manager
    manager.data_start(self.__data_start_timestamp)
    manager.data_stop(self.__current_timestamp)
    manager.irrelevants(0)
    manager.processed(self.__line_number + 1)
    manager.mark_stop(True)
    # Close the h5 file first, then summarize it from disk.
    self.__h5_file.close()
    ParseManager.summarize_file(self.__output_path)
def advance_date(self, new_date): if self.__h5_file: self.write_summary() self.__output_path = CME_OUT_PATH / get_date_string(new_date) if not self.__output_path.parent.exists(): os.makedirs(self.__output_path.parent) print "OUT", self.__output_path self.__h5_file = openFile(self.__output_path, mode="w", title="CME Fix Data") self.__parse_manager = ParseManager(self.__current_input_path, self.__h5_file) self.__parse_manager.mark_start() self.__prior_day_books = {} self.__data_start_timestamp = 0 self.__current_timestamp = 0 for symbol, builder in self.__book_builders.items(): self.__prior_day_books[symbol] = (builder.bid_book, builder.ask_book) self.__book_builders = {}
def parse(self, build_book = True, force = False, stop_early_at_hit=0):
    """
    Parse the input file.  There are two modes: build_book=True and
    build_book=False.

    If build_book=False, the h5 file is simply the same record data from
    the gz file, but stored as hdf5.

    If build_book=True, the hdf5 file created has book data for all
    matching inputs.  Each symbol gets its own dataset.

    The ParseManager is used to store summary information for the
    parse of this data.
    """
    self.__output_path = self.__output_base + (build_book and ".h5" or "_AMD_.h5")
    logging.info("Parsing file %s\n\tto create %s"%
                 (self.__input_path, self.__output_path))
    if self.__output_path.exists() and not force:
        return
    if not self.__output_path.parent.exists():
        os.makedirs(self.__output_path.parent)

    self.__h5_file = openFile(self.__output_path, mode = "w", title = "ARCA Equity Data")
    if not build_book:
        ## If not building book, then just writing out AMD data as hdf5
        filters = Filters(complevel=1, complib='zlib')
        group = self.__h5_file.createGroup("/", '_AMD_Data_', 'Add-Modify-Delete data')
        table = self.__h5_file.createTable(group, 'records', ArcaRecord,
                                           "Data for " + str(self.__date),
                                           filters=filters)
        h5Record = table.row

    self.__parse_manager = ParseManager(self.__input_path, self.__h5_file)
    self.__parse_manager.mark_start()
    hit_count = 0
    data_start_timestamp = None
    last_timestamp = None
    # BUG FIX: pre-set the line counter so an empty input file still
    # reports processed == 0 below (it was previously unbound -> NameError).
    self.__line_number = -1
    for self.__line_number, line in enumerate(gzip.open(self.__input_path, 'rb')):
        if stop_early_at_hit and hit_count == stop_early_at_hit:
            break
        ###################################################
        # Show progress periodically
        ###################################################
        if 0 == (self.__line_number % 1000000):
            logging.info("At %d hit count is %d on %s" %
                         (self.__line_number, hit_count,
                          (self.__symbols and self.__symbols or "*")))

        fields = re.split(r'\s*,\s*', line)
        code = fields[0]
        record = None
        if code == 'A':
            record = AddRecord(fields, self.__start_of_date)
        elif code == 'D':
            record = DeleteRecord(fields, self.__start_of_date)
        elif code == 'M':
            record = ModifyRecord(fields, self.__start_of_date)
        elif code == 'I' or code == 'V':
            continue
        else:
            # Unknown record types are skipped silently (deliberate
            # best-effort; a hard error here used to be considered).
            continue

        # BUG FIX: remember the timestamp here (before the symbol filter)
        # so data_stop below does not rely on the loop variable 'record',
        # which is unbound when the file yields no A/D/M records.
        last_timestamp = record.timestamp

        if self.__symbols and (not record.symbol in self.__symbols):
            continue
        else:
            hit_count += 1

        # record the timestamp of the first record as data_start
        # ('is None' so a legitimate 0 timestamp is not overwritten)
        if data_start_timestamp is None:
            data_start_timestamp = record.timestamp

        if build_book:
            self.build_books(record)
        else:
            h5Record['ts'] = record.timestamp
            h5Record['asc_ts'] = chicago_time_str(record.timestamp)
            h5Record['symbol'] = record.symbol
            h5Record['seq_num'] = record.seq_num
            h5Record['order_id'] = record.order_id
            h5Record['record_type'] = code
            h5Record['buy_sell'] = (record.is_buy and 'B' or 'S')
            if code != 'D':
                # Delete records carry no price/quantity fields
                h5Record['price'] = record.price
                h5Record['quantity'] = record.quantity
            h5Record.append()
            if 0 == hit_count % __FLUSH_FREQ__:
                table.flush()

    books_good = True
    total_unchanged = 0
    for symbol, builder in self.__book_builders.iteritems():
        books_good = books_good and builder.summary()
        total_unchanged += builder.unchanged

    ############################################################
    # Finish filling in the parse summary info and close up
    ############################################################
    self.__parse_manager.data_start(data_start_timestamp)
    # last_timestamp is None when no A/D/M records were seen
    self.__parse_manager.data_stop(last_timestamp)
    self.__parse_manager.irrelevants(total_unchanged)
    self.__parse_manager.processed(self.__line_number + 1)
    self.__parse_manager.mark_stop(books_good)
    self.__h5_file.close()
    ParseManager.summarize_file(self.__output_path)
class CmeRlcParser(object):
    r"""Parser for CME RLC data files.

    Feeds records into per-symbol CmeRlcBookBuilder instances and writes
    one hdf5 output file per trading date (rolled via advance_date).
    Books from one date are retained so the next date's builders can be
    seeded with prior-day state.

    NOTE(review): several attributes read below (__current_input_path,
    __current_file, __line_number, __parse_manager) are never assigned in
    the methods visible here -- presumably maintained by the parse driver
    for this class; confirm before relying on them.
    """

    readable(input_paths=None)

    match_all = re.compile(".*")

    def __init__(self, input_path_list):
        """Store a copy of the input path list; no files are opened here."""
        self.__input_path_list = copy(input_path_list)
        self.__book_builders = {}        # symbol -> CmeRlcBookBuilder (current date)
        self.__h5_file = None            # open h5 output file, or None until advance_date
        self.__ts = None
        self.__chi_ts = None
        self.__data_start_timestamp = 0  # first data timestamp of current file
        self.__current_timestamp = None  # most recent data timestamp
        self.__output_path = None        # path of the current h5 output file
        self.__prior_day_books = {}      # symbol -> (bid_book, ask_book) of prior date

    def write_summary(self):
        """Record final parse statistics, close the h5 file, and write
        its on-disk summary."""
        ############################################################
        # Finish filling in the parse summary info and close up
        ############################################################
        self.__parse_manager.data_start(self.__data_start_timestamp)
        self.__parse_manager.data_stop(self.__current_timestamp)
        self.__parse_manager.irrelevants(0)
        self.__parse_manager.processed(self.__line_number+1)
        self.__parse_manager.mark_stop(True)
        self.__h5_file.close()
        ParseManager.summarize_file(self.__output_path)

    def advance_date(self, new_date):
        """Roll output to new_date: summarize/close the current file (if
        any), open a fresh one, and carry the current books over as
        prior-day books."""
        if self.__h5_file:
            self.write_summary()
        self.__output_path = CME_OUT_PATH / get_date_string(new_date)
        if not self.__output_path.parent.exists():
            os.makedirs(self.__output_path.parent)
        print "OUT", self.__output_path
        self.__h5_file = openFile(self.__output_path, mode="w", title="CME Fix Data")
        self.__parse_manager = ParseManager(self.__current_input_path, self.__h5_file)
        self.__parse_manager.mark_start()
        self.__prior_day_books = {}
        self.__data_start_timestamp = 0
        self.__current_timestamp = 0
        # Seed the next date's builders with the closing books of this one.
        for symbol, builder in self.__book_builders.items():
            self.__prior_day_books[symbol] = (builder.bid_book, builder.ask_book)
        self.__book_builders = {}

    def build_books(self, record):
        """Route one record to its symbol's book builder, creating the
        builder on first sight (seeded from prior-day books when
        available).  Exceptions are downgraded to 'G' (general) warnings
        so a bad record does not abort the whole parse."""
        try:
            symbol = record.symbol
            builder = self.__book_builders.get(symbol)
            if not builder:
                builder = CmeRlcBookBuilder(symbol, self.__h5_file,
                                            self.__prior_day_books.get(symbol, None),
                                            include_trades = True)
                self.__book_builders[symbol] = builder
            if record.is_book_message():
                builder.process_record(record)
            elif record.is_trade_message():
                builder.write_trade(record)
        except Exception,e:
            print traceback.format_exc()
            self.__parse_manager.warning(self.__current_file + ':' + e.message,
                                         'G', self.__current_timestamp,
                                         self.__line_number+1)
class CmeFixParser(object): r""" """ readable(input_paths=None) match_all = re.compile(".*") def __init__(self, input_paths): """ """ self.__input_paths = input_paths self.__book_builders = {} self.__prior_day_books = {} self.__h5_file = None self.__ts = 0 self.__chi_ts = None self.__data_start_timestamp = 0 self.__output_path = None def write_summary(self): ############################################################ # Finish filling in the parse summary info and close up ############################################################ self.__parse_manager.data_start(self.__data_start_timestamp) self.__parse_manager.data_stop(self.__ts) self.__parse_manager.irrelevants(0) self.__parse_manager.processed(self.__line_number+1) self.__parse_manager.mark_stop(True) self.__h5_file.close() ParseManager.summarize_file(self.__output_path) def advance_date(self, new_date): if self.__h5_file: self.write_summary() self.__output_path = CME_OUT_PATH / get_date_string(new_date) if not self.__output_path.parent.exists(): os.makedirs(self.__output_path.parent) print "OUT", self.__output_path self.__h5_file = openFile(self.__output_path, mode="w", title="CME Fix Data") self.__parse_manager = ParseManager(self.__current_input_path, self.__h5_file) self.__parse_manager.mark_start() self.__prior_day_books = {} self.__data_start_timestamp = 0 self.__ts = 0 for symbol, builder in self.__book_builders.items(): self.__prior_day_books[symbol] = (builder.bid_book, builder.ask_book) self.__book_builders = {} def build_books(self, msg): try: ts = timestamp_from_cme_timestamp(msg.sending_time) if 0 == self.__line_number % 100000: print "st:", msg.sending_time, "vs", ts, "vs", chicago_time_str(ts) if self.__ts: if ts < self.__ts: print "At", self.__line_number+1, "of", self.__current_file, \ "previous ts:", self.__chi_ts, "new:", chi_ts, \ "Current Message:", pprint.pformat(msg) assert False, "Timestamps going backward" self.__ts = ts self.__chi_ts = chicago_time_str(self.__ts) if 0 == 
self.__data_start_timestamp: self.__data_start_timestamp = self.__ts affected_builders = sets.Set() for update in msg.entries: symbol = update[SecurityDesc] builder = self.__book_builders.get(symbol, None) if not builder: builder = CmeBookBuilder(symbol, self.__h5_file, self.__prior_day_books.get(symbol, None), include_trades = True) self.__book_builders[symbol] = builder if not update[MDEntryType] in __BOOK_ENTRY_TYPES__: continue builder.process_record(update, self.__ts, self.__chi_ts, msg.msg_seq_num) affected_builders.add(builder) for builder in affected_builders: top_bid = builder.top_bid() top_ask = builder.top_ask() if top_bid and top_ask: if top_bid == top_ask: warning_msg = builder.symbol + ': Locked (%s, %s)'%(top_bid, top_ask) print warning_msg self.__parse_manager.warning(warning_msg, 'L', self.__ts, self.__line_number+1) elif top_bid > top_ask: warning_msg = builder.symbol + ': Crossed (%s, %s)'%(top_bid, top_ask) print warning_msg self.__parse_manager.warning(warning_msg, 'C', self.__ts, self.__line_number+1) if not builder.write_record(self.__ts, self.__chi_ts, msg.msg_seq_num): #print "Msg no book change", msg.line pass except Exception,e: print traceback.format_exc() self.__parse_manager.warning(self.__current_file + ':' + e.message, 'G', self.__ts, self.__line_number+1)
# Build and print the per-date summary report table.
# NOTE(review): this is a fragment -- 'table', 'outfiles', 'outfile_map',
# and 'fileset' are defined earlier, outside the visible code; confirm.
table.sortby = "Date"
table.align["Date"] = "l"
for f in outfiles:
    d = get_date_of_file(f)
    if not d:
        print "Warning: found cme output file with no date", f
        continue
    # Warn on duplicate dates, but let the later file win.
    dup = outfile_map.get(d)
    if dup:
        print "Warning: dup for date", d
    outfile_map[d] = f
    record = fileset.date_map.get(d)
    if not record:
        print "Could not find record for date", d
    else:
        try:
            summary = ParseManager.get_summary_record(f)
            datestr = str(d) + d.strftime(" (%A)")
            if summary:
                table.add_row([datestr, record['type'],
                               "Valid" if summary['is_valid'] else "Invalid",
                               chicago_time(summary['data_start']),
                               chicago_time(summary['data_stop']),
                               '?'])
            else:
                table.add_row([datestr, record['type'], "No Summary", "", "", ""])
        except Exception, e:
            # A broken summary should not abort the report; show a stub row.
            print "Caught exception:", e
            table.add_row([d, record['type'], "No Summary", "", "", ""])
print table
print "Num rows:", table.rowcount
def parse(self, build_book = True, force = False, stop_early_at_hit=0): """ Parse the input file. There are two modes: build_book=True and build_book=False. If build_book=False, the h5 file is simply the same record data from the gz file, but stored as hdf5. If build_book=True, the hdf5 file created has book data for all matching inputs. Each symbol gets it's own dataset. The ParseManager is used to store summary information for the parse of this data. """ self.__output_path = self.__output_base + (build_book and ".h5" or "_AMD_.h5") logging.info("Parsing file %s\n\tto create %s"% (self.__input_path, self.__output_path)) if self.__output_path.exists() and not force: print 'Error: output file already exists. Use --force to overwrite.' return if not self.__output_path.parent.exists(): os.makedirs(self.__output_path.parent) self.__h5_file = openFile(bytes(self.__output_path), mode = "w", title = "ARCA Equity Data") if not build_book: ## If not building book, then just writing out AMD data as hdf5 filters = Filters(complevel=1, complib='zlib') group = self.__h5_file.createGroup("/", '_AMD_Data_', 'Add-Modify-Delete data') table = self.__h5_file.createTable(group, 'records', ArcaRecord, "Data for "+str(self.__date), filters=filters) h5Record = table.row self.__parse_manager = ParseManager(self.__input_path, self.__h5_file) self.__parse_manager.mark_start() hit_count = 0 data_start_timestamp = None symbols_cleaned = map(lambda s: s if len(s) > 1 else ','+s+',', self.__symbols) symbol_regex = r'/\(' + string.join(symbols_cleaned, r'\|') + r'\)/p' unzip = subprocess.Popen(['gzip','-d','-c', self.__input_path], stdout=subprocess.PIPE, bufsize=-1) sed = subprocess.Popen(['sed','-n',symbol_regex], stdin=unzip.stdout, stdout=subprocess.PIPE, bufsize=-1) # print "symbol regex: {}".format(symbol_regex) # print "from sed: {}".format(sed.stdout.readline()) # print "from unzip: {}".format(unzip.stdout.readline()) print "input path: {}".format(self.__input_path) infile = 
csv.reader(iter(sed.stdout.readline, '')) for self.__line_number, fields in enumerate(infile): # print "reading fields: {}".format(fields) if stop_early_at_hit and hit_count == stop_early_at_hit: break ################################################### # Show progress periodically ################################################### if 0 == (self.__line_number % 1000000): logging.info("At %d hit count is %d on %s" % (self.__line_number, hit_count, (self.__symbols and self.__symbols or "*"))) code = fields[0] record = None if code == 'A': record = AddRecord(fields, self.__start_of_date) elif code == 'D': record = DeleteRecord(fields, self.__start_of_date) elif code == 'M': record = ModifyRecord(fields, self.__start_of_date) elif code == 'I' or code == 'V': continue else: continue #raise RuntimeError("Unexpected record type '" + # code + "' at line " + str(self.__line_number) + # " of file " + self.__input_path) # print "built record: {}".format(record) if self.__symbols and (not record.symbol in self.__symbols): # print "passing" continue else: hit_count += 1 # print "setting start timestamp" # record the timestamp of the first record as data_start if not data_start_timestamp: data_start_timestamp = record.timestamp if build_book: # print "calling book builder" self.build_books(record) else: h5Record['ts'] = record.timestamp h5Record['asc_ts'] = chicago_time_str(record.timestamp) h5Record['symbol'] = record.symbol h5Record['seq_num'] = record.seq_num h5Record['order_id'] = record.order_id h5Record['record_type'] = code h5Record['buy_sell'] = (record.is_buy and 'B' or 'S') if code != 'D': h5Record['price'] = record.price h5Record['quantity'] = record.quantity h5Record.append() if 0 == hit_count % __FLUSH_FREQ__: table.flush() sed.wait() books_good = True total_unchanged = 0 for symbol, builder in self.__book_builders.iteritems(): books_good = books_good and builder.summary() total_unchanged += builder.unchanged 
############################################################ # Finish filling in the parse summary info and close up ############################################################ self.__parse_manager.data_start(data_start_timestamp) self.__parse_manager.data_stop(record.timestamp) self.__parse_manager.irrelevants(total_unchanged) self.__parse_manager.processed(self.__line_number+1) self.__parse_manager.mark_stop(books_good) self.__h5_file.close() ParseManager.summarize_file(bytes(self.__output_path))