예제 #1
0
def stat_value_types(filepath, outf, txt_outf=None):
    """Accumulate per-type value statistics for one file and write them out.

    Every line of ``filepath`` is parsed with ``extractor.parse_line``.
    Field 2 of the parsed line is used as the record type and field 3 is
    split on ``"|&|"``; both are handed to a ``FileValueStat`` accumulator.

    Args:
        filepath: Path of the input file to scan.
        outf: Writable file-like object. Receives the lines produced by each
            type stat's ``to_string()`` (types in sorted order), framed by
            ``***** <name> start *****`` / ``***** <name> end   *****``.
        txt_outf: Optional writable file-like object. When given, one line of
            the form ``<rtype>#<order><TAB><count><TAB><json list>`` is
            written for every value stat whose ``ntype_txt`` is non-empty.

    Returns:
        None. All results are written to ``outf`` / ``txt_outf``.
    """
    value_sep = "|&|"
    # splitext drops only the last extension, like the previous
    # split/join did, but keeps the name of extension-less files instead of
    # collapsing it to an empty string.
    filename = os.path.splitext(os.path.basename(filepath))[0]
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(filename)

    file_stat = FileValueStat(filename)
    # Context manager so the input handle is closed even if parsing fails.
    with open(filepath) as inf:
        for line in inf:
            parts = extractor.parse_line(line)
            rtype = parts[2]
            values = parts[3].split(value_sep)
            file_stat.add(rtype, values)

    outf.write("***** %s start *****\n" % filename)
    for rtype in sorted(file_stat.type_stats.keys()):
        for out_str in file_stat.type_stats[rtype].to_string():
            outf.write(out_str + "\n")
    outf.write("***** %s end   *****\n" % filename)

    if txt_outf is None:
        return

    for rtype in sorted(file_stat.type_stats.keys()):
        type_stat = file_stat.type_stats[rtype]
        for value_stat in type_stat.value_stats:
            texts = value_stat.ntype_txt
            if len(texts) == 0:
                continue
            name = type_stat.rtype + "#" + str(value_stat.order)
            # list() materialises the collection so json can serialise it.
            text = json.dumps(list(texts))
            txt_outf.write(name + '\t' + str(len(texts)) + '\t' + text + "\n")
 def add_data(self, line):
     """Fold one raw input line into this object's counters.

     The line is parsed with ``extractor.parse_line``; field 0 is mapped to
     an id through ``get_id`` and field 2 is used as the record type.

     Side effects: increments ``self.nb_event``, calls ``self.add_pid`` with
     the id, adds the type to ``self.rtype_set`` and increments the id's
     entry in ``self.pid_event_cnt``.
     """
     # Counted before parsing, so a line that fails to parse is still counted.
     self.nb_event += 1

     fields = extractor.parse_line(line)
     ident = get_id(fields[0])
     type_tag = fields[2]

     self.add_pid(ident)
     self.rtype_set.add(type_tag)
     # NOTE(review): relies on the key already existing (presumably created
     # by add_pid) or on pid_event_cnt supplying a default -- confirm.
     self.pid_event_cnt[ident] += 1
def count_table_event(filepath):
    """Count the lines of ``filepath`` per pid.

    Each line is parsed with ``extractor.parse_line``. The pid is the integer
    before the first ``"_"`` in field 0 of the parsed line.

    Args:
        filepath: Path of the file to scan.

    Returns:
        A plain ``dict`` mapping each int pid to the number of lines seen
        for it.

    Raises:
        ValueError: If the pid prefix of a line is not a valid integer.
    """
    logging.info("count event from [%s]", os.path.basename(filepath))
    event_cnt = {}
    # Context manager so the input handle is closed even if a line is bad.
    with open(filepath) as inf:
        for line in inf:
            parts = extractor.parse_line(line)
            pid = int(parts[0].split("_")[0])
            event_cnt[pid] = event_cnt.get(pid, 0) + 1
    return event_cnt
예제 #4
0
 def build(self, row):
     """Turn one raw row into an ``Event``; return ``None`` if it is invalid.

     The row is parsed with ``parse_line``. Field 0 supplies the pid (the
     integer before the first ``"_"``), field 1 is converted with
     ``parse_time`` and field 3 is split on ``EventBuilder.sep``. The event
     index and features come from ``self.build_event`` and
     ``self.build_features``, both called with the parsed time and values.

     Returns:
         The constructed ``Event`` when its ``is_valid()`` is truthy,
         otherwise ``None``.
     """
     fields = parse_line(row)
     owner = int(fields[0].split("_")[0])
     stamp = parse_time(fields[1])
     payload = fields[3].split(EventBuilder.sep)

     idx = self.build_event(stamp, payload)
     feats = self.build_features(stamp, payload)

     candidate = Event(idx, feats, owner, stamp)
     return candidate if candidate.is_valid() else None
예제 #5
0
def build_event(filepath, builders):
    """Build events from ``filepath`` and write them to ``<event_dir>/<name>.tsv``.

    Each stripped line is parsed with ``parse_line``. When field 2 (the
    record type) has an entry in ``builders``, that builder's ``build`` is
    called with the stripped line, and every non-``None`` result is written
    as ``str(event)`` on its own line.

    Args:
        filepath: Path of the input file. Its base name without the last
            extension becomes the output file's name.
        builders: Mapping from record type to an object exposing
            ``build(line)``.

    Returns:
        None. The output is written into the module-level ``event_dir``.
    """
    # splitext drops only the last extension, like the previous
    # split/join did, but keeps the name of extension-less files instead of
    # producing a bare ".tsv".
    name = os.path.splitext(os.path.basename(filepath))[0]
    logging.info("build event = %s", name)
    out_path = os.path.join(event_dir, name + ".tsv")

    # Output is opened first, as before; both handles are now closed even
    # when parsing or a builder raises.
    with open(out_path, 'w') as outf, open(filepath) as inf:
        for line in inf:
            line = line.strip()
            parts = parse_line(line)
            rtype = parts[2]
            if rtype not in builders:
                continue
            event = builders[rtype].build(line)
            if event is not None:
                outf.write(str(event) + "\n")
def process(filename, outfilename, value_stats):
    """Aggregate per-type statistics for one file and export them.

    Each line is parsed with ``extractor.parse_line``. Lines whose field 0
    maps to ``None`` through ``get_id`` are skipped. For the rest, a ``Stat``
    per record type (field 2) receives:

    * one entry with the id, ``parse_time(field 1)`` and the stripped field 3;
    * one more entry with the id and the time parsed from the first
      ``"|&|"``-separated value of field 3, if ``parse_time`` accepts it.

    Args:
        filename: Path of the input file.
        outfilename: Path of the summary file, or ``None`` to skip writing
            it. Stats are written with ``write_to_local`` in sorted type
            order.
        value_stats: Dict updated in place, or ``None`` to skip. For every
            type and value index ``i`` the key ``"<rtype>#<i>"`` is set to
            ``(rate, count)``, where ``rate`` is ``count / nrow()`` rounded
            to three decimals.

    Returns:
        None.
    """
    stats = {}
    logging.info('process [%s]', os.path.basename(filename))

    # Context manager so the input handle is closed even if a line is bad.
    with open(filename, 'r') as inf:
        for cnt, line in enumerate(inf, 1):
            if cnt % 1000000 == 0:
                logging.info("\t %d lines", cnt)
            parts = extractor.parse_line(line)

            entry_id = get_id(parts[0])
            if entry_id is None:
                continue

            rtype = parts[2]
            if rtype not in stats:
                stats[rtype] = Stat(rtype)
            stat = stats[rtype]

            time = parse_time(parts[1])
            stat.add_entry(entry_id, time, parts[3].strip())

            # str.split with a non-empty separator always returns at least
            # one element, so values[0] needs no length check.
            values = parts[3].split("|&|")
            time = parse_time(values[0])
            if time is not None:
                stat.add_entry(entry_id, time)

    if outfilename is not None:
        # Closed on every path, including a failing write_to_local.
        with open(outfilename, 'w') as outf:
            for rtype in sorted(stats.keys()):
                stats[rtype].write_to_local(outf)

    if value_stats is not None:
        for rtype, stat in stats.items():
            total = stat.nrow()
            value_cnts = stat.get_value_stat()
            # Indexed access is kept: only len() and [] are known to be
            # supported by the object get_value_stat() returns.
            for i in range(len(value_cnts)):
                value_cnt = value_cnts[i]
                # "+ 0.0" forces float division on Python 2 as well.
                rate = round(value_cnt / (total + 0.0), 3)
                value_stats[rtype + "#" + str(i)] = (rate, value_cnt)