def stat_value_types(filepath, outf, txt_outf=None):
    """Gather per-record-type value statistics for one file and write them out.

    Every line of ``filepath`` is parsed with ``extractor.parse_line``; field 2
    is used as the record type and field 3 is split on ``"|&|"`` into values,
    which are fed to a ``FileValueStat`` accumulator.

    Args:
        filepath: Path of the input file to scan.
        outf: Writable file-like object that receives the strings yielded by
            each type stat's ``to_string()``, one per line, framed by
            ``***** <name> start/end *****`` marker lines.
        txt_outf: Optional writable file-like object. When given, one
            tab-separated line (``rtype#order``, entry count, JSON list) is
            written for every value stat whose ``ntype_txt`` is non-empty.
    """
    value_sep = "|&|"
    # Base name with its last dot-separated component removed.
    filename = ".".join(os.path.basename(filepath).split(".")[:-1])
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(filename)

    file_stat = FileValueStat(filename)
    # The context manager guarantees the input handle is closed, even when
    # parsing a line raises.
    with open(filepath) as inf:
        for line in inf:
            parts = extractor.parse_line(line)
            rtype = parts[2]
            values = parts[3].split(value_sep)
            file_stat.add(rtype, values)

    outf.write("***** %s start *****\n" % filename)
    for rtype in sorted(file_stat.type_stats.keys()):
        for out_str in file_stat.type_stats[rtype].to_string():
            outf.write(out_str + "\n")
    outf.write("***** %s end *****\n" % filename)

    if txt_outf is None:
        return

    for rtype in sorted(file_stat.type_stats.keys()):
        type_stat = file_stat.type_stats[rtype]
        for value_stat in type_stat.value_stats:
            if len(value_stat.ntype_txt) > 0:
                name = type_stat.rtype + "#" + str(value_stat.order)
                # Materialise as a plain list so json can serialise it.
                text = json.dumps(list(value_stat.ntype_txt))
                txt_outf.write(
                    name + '\t' + str(len(value_stat.ntype_txt)) + '\t' + text + "\n"
                )
def add_data(self, line):
    """Account for one input line in this object's counters.

    Bumps the total event count, parses ``line`` with
    ``extractor.parse_line``, registers the pid derived from field 0 via
    ``get_id``, records the record type found in field 2, and increments
    that pid's event count.

    Args:
        line: One raw line of input.
    """
    # The total is bumped before parsing, so it counts every line seen.
    self.nb_event = self.nb_event + 1

    fields = extractor.parse_line(line)
    person_id = get_id(fields[0])
    record_type = fields[2]

    self.add_pid(person_id)
    self.rtype_set.add(record_type)
    self.pid_event_cnt[person_id] += 1
def count_table_event(filepath):
    """Count how many lines of a file belong to each pid.

    Each line is parsed with ``extractor.parse_line``; the pid is the integer
    found before the first ``"_"`` in field 0.

    Args:
        filepath: Path of the file to scan.

    Returns:
        A dict mapping each pid (int) to the number of lines seen for it.
    """
    logging.info("count event from [%s]", os.path.basename(filepath))
    event_cnt = {}
    # The context manager closes the input handle deterministically instead
    # of leaving it to garbage collection.
    with open(filepath) as inf:
        for line in inf:
            parts = extractor.parse_line(line)
            pid = int(parts[0].split("_")[0])
            event_cnt[pid] = event_cnt.get(pid, 0) + 1
    return event_cnt
def build(self, row):
    """Turn one raw row into an ``Event``, or ``None`` if it is not valid.

    The row is parsed with ``parse_line``: field 0 yields the pid (the
    integer before the first ``"_"``), field 1 is converted with
    ``parse_time``, and field 3 is split on ``EventBuilder.sep`` into the
    values handed to ``build_event`` and ``build_features``.

    Args:
        row: One raw input line.

    Returns:
        The constructed ``Event`` when ``event.is_valid()`` is true,
        otherwise ``None``.
    """
    fields = parse_line(row)
    person_id = int(fields[0].split("_")[0])
    timestamp = parse_time(fields[1])
    value_list = fields[3].split(EventBuilder.sep)

    index = self.build_event(timestamp, value_list)
    feats = self.build_features(timestamp, value_list)

    candidate = Event(index, feats, person_id, timestamp)
    return candidate if candidate.is_valid() else None
def build_event(filepath, builders):
    """Build events from one input file and write them to a ``.tsv`` file.

    The output file is ``<event_dir>/<name>.tsv``, where ``name`` is the input
    base name with its last dot-separated component removed. Each stripped
    line is parsed with ``parse_line``; when its record type (field 2) has an
    entry in ``builders``, that builder's ``build`` is called on the line and
    any non-``None`` event is written as ``str(event)`` on its own line.

    Args:
        filepath: Path of the input file.
        builders: Mapping from record type to an object exposing
            ``build(line)``.
    """
    name = ".".join(os.path.basename(filepath).split(".")[:-1])
    logging.info("build event = %s", name)

    # Nested context managers close both handles even if parsing or building
    # raises midway. The output is opened first, as before.
    with open(os.path.join(event_dir, name + ".tsv"), 'w') as outf:
        with open(filepath) as inf:
            for line in inf:
                line = line.strip()
                parts = parse_line(line)
                rtype = parts[2]
                if rtype not in builders:
                    continue
                event = builders[rtype].build(line)
                if event is not None:
                    outf.write(str(event) + "\n")
def process(filename, outfilename, value_stats):
    """Accumulate per-record-type statistics for one file.

    Each line is parsed with ``extractor.parse_line``. Lines whose id
    (``get_id`` of field 0) is ``None`` are skipped. For the rest, a ``Stat``
    for the record type (field 2) receives an entry with the time parsed from
    field 1 and the stripped field 3. If the first ``"|&|"``-separated piece
    of field 3 also parses as a time, a second entry is added with that time.

    Args:
        filename: Path of the input file.
        outfilename: Path of a file to which every ``Stat`` writes itself via
            ``write_to_local``, in sorted record-type order. Skipped when
            ``None``.
        value_stats: Dict updated in place with
            ``"<rtype>#<i>" -> (rate, count)``, where ``count`` comes from
            ``Stat.get_value_stat()`` and ``rate`` is
            ``count / Stat.nrow()`` rounded to 3 places. Skipped when
            ``None``.
    """
    stats = {}
    logging.info('process [%s]', os.path.basename(filename))

    # The context manager closes the input handle even if a line fails to
    # parse.
    with open(filename, 'r') as inf:
        # Numbering starts at 1 and counts every line, including skipped ones.
        for cnt, line in enumerate(inf, 1):
            if cnt % 1000000 == 0:
                logging.info("\t %d lines", cnt)
            parts = extractor.parse_line(line)
            entry_id = get_id(parts[0])
            if entry_id is None:
                continue
            rtype = parts[2]
            if rtype not in stats:
                stats[rtype] = Stat(rtype)
            stat = stats[rtype]
            stat.add_entry(entry_id, parse_time(parts[1]), parts[3].strip())

            # split() with an explicit separator always returns at least one
            # element, so the first piece can be read without a length check.
            first_value = parts[3].split("|&|")[0]
            value_time = parse_time(first_value)
            if value_time is not None:
                stat.add_entry(entry_id, value_time)

    if outfilename is not None:
        # Closed on exit even when a write_to_local call raises.
        with open(outfilename, 'w') as outf:
            for rtype in sorted(stats.keys()):
                stats[rtype].write_to_local(outf)

    if value_stats is not None:
        for rtype in stats.keys():
            stat = stats[rtype]
            total = stat.nrow()
            value_cnts = stat.get_value_stat()
            for i, value_cnt in enumerate(value_cnts):
                key = rtype + "#" + str(i)
                # "+ 0.0" forces float division on Python 2 as well.
                rate = round(value_cnt / (total + 0.0), 3)
                value_stats[key] = (rate, value_cnt)