Example #1
def load_and_preprocess_data(config):
    logger.info("Loading training data...")
    train_emails = read_records(config.train_filepath)
    logger.info("Done. Read %d emails", len(train_emails))
    logger.info("Loading dev data...")
    dev_emails = read_records(config.dev_filepath)
    logger.info("Done. Read %d emails", len(dev_emails))
    logger.info("Loading test data...")
    test_emails = read_records(config.test_filepath)
    logger.info("Done. Read %d emails", len(test_emails))
    return train_emails, dev_emails, test_emails
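Most of the examples on this page call a read_records helper that parses one data or log file into a list of records; that helper is not shown here. A minimal sketch, assuming newline-delimited JSON input and the single-argument signature used in Examples #1 and #3-#7 (both assumptions, not taken from the original code), plus the module-level logger Example #1 relies on:

import json
import logging

logger = logging.getLogger(__name__)

def read_records(filepath):
    # Hypothetical stand-in for the loader these examples assume:
    # one JSON object per line, returned as a list of dicts.
    records = []
    with open(filepath) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records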
Example #2
def snap_records(combined_seg,
                 segments_index,
                 infile,
                 record_type,
                 startyear=None,
                 endyear=None):

    records = util.read_records(infile, record_type, startyear, endyear)
    if record_type == 'concern' and not records:
        print("no concerns found")
        return

    # Snap records to the nearest segment, with a tolerance of 30
    print("snapping " + record_type + " records to segments")
    util.find_nearest(records,
                      combined_seg,
                      segments_index,
                      30,
                      type_record=True)

    # Write out snapped records
    schema = records[0].schema
    shpfile = os.path.join(MAP_FP, record_type + '_joined.shp')
    util.records_to_shapefile(schema, shpfile, records)

    jsonfile = os.path.join(PROCESSED_DATA_FP, record_type + '_joined.json')

    print "output " + record_type + " data to " + jsonfile
    with open(jsonfile, 'w') as f:
        json.dump([r.properties for r in records], f)
Example #3
def plot_CDF(filename, start_selector, end_selector, start_index=None, end_index=None, **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages, start_selector, end_selector, start_index=start_index, end_index=end_index, **kwargs)

    sortedtime = np.sort(list(intervals.values()))
    p = np.arange(len(intervals)) / (len(intervals) - 1)
    plt.plot(sortedtime, p, **kwargs)
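The CDF in Example #3 is built by sorting the interval lengths and pairing the i-th smallest value with the probability i/(n-1), giving evenly spaced y-values from 0 to 1. A standalone sketch of the same construction on hypothetical interval values:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical interval lengths (seconds); in Example #3 these come
# from get_intervals(...).values().
interval_values = [0.4, 1.2, 0.7, 2.5, 1.9]

sortedtime = np.sort(interval_values)  # x-axis: interval length
p = np.arange(len(interval_values)) / (len(interval_values) - 1)  # y-axis: 0 .. 1
plt.plot(sortedtime, p)
plt.xlabel('interval length (s)')
plt.ylabel('fraction of intervals')
plt.show()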
Example #4
def running_time(filename):
    records = read_records(filename)
    total_time = 0.0
    running = 0
    last_ts = records[0]['ts']
    for r in records:
        if r['op'] == 'recv' and r['msg'] == 'lambda_start_ts':
            total_time += (r['ts'] - last_ts) * running
            running += 1
            last_ts = r['ts']
        elif r['op'] == 'send' and r['msg'] == 'quit:':
            total_time += (r['ts'] - last_ts) * running
            running -= 1
            last_ts = r['ts']
    assert (running == 0)
    return total_time
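In Example #4, running counts how many lambdas are active at any moment, and each elapsed interval is weighted by that count, so the loop sums per-lambda running time. A small check of the same accumulation on hypothetical in-memory records:

def running_time_from_records(records):
    # Same loop as running_time() above, but over an in-memory list
    # instead of a log file, so the arithmetic is easy to verify.
    total_time = 0.0
    running = 0
    last_ts = records[0]['ts']
    for r in records:
        if r['op'] == 'recv' and r['msg'] == 'lambda_start_ts':
            total_time += (r['ts'] - last_ts) * running
            running += 1
            last_ts = r['ts']
        elif r['op'] == 'send' and r['msg'] == 'quit:':
            total_time += (r['ts'] - last_ts) * running
            running -= 1
            last_ts = r['ts']
    assert running == 0
    return total_time

# Two workers: one active from t=0 to t=3, another from t=1 to t=2,
# so the aggregate running time is 3 + 1 = 4 seconds.
demo = [
    {'op': 'recv', 'msg': 'lambda_start_ts', 'ts': 0.0},
    {'op': 'recv', 'msg': 'lambda_start_ts', 'ts': 1.0},
    {'op': 'send', 'msg': 'quit:', 'ts': 2.0},
    {'op': 'send', 'msg': 'quit:', 'ts': 3.0},
]
print(running_time_from_records(demo))  # 4.0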
Example #5
def plot_lineages(filename,
                  start_selector,
                  end_selector,
                  start_index=None,
                  end_index=None,
                  **kwargs):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    intervals = get_intervals(lineages,
                              start_selector,
                              end_selector,
                              start_index=start_index,
                              end_index=end_index,
                              **kwargs)
    items = sorted(intervals.items(), key=lambda i: int(i[0]))
    return plt.plot([i[0] for i in items], [i[1] for i in items], **kwargs)
Example #6
def get_completion_time(filename):
    records = read_records(filename)
    lineages = preprocess(records, cmd_of_interest='', send_only=False)
    lineage_time = [l[-1]['ts'] for l in lineages.values()]
    return np.percentile(lineage_time, (95, 99, 100))
Example #7
                 color='k',
                 linestyle='-',
                 linewidth=2)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="plot stack graphs")
    parser.add_argument("logfile", help="pipeline log file", type=str)
    parser.add_argument("-v",
                        "--verbose",
                        help="show ts of all commands",
                        action="store_true")
    parser.add_argument("-s",
                        "--sort",
                        help="sort by completion time",
                        action="store_true")
    parser.add_argument("--chunklen", help="chunk length", type=float)
    parser.add_argument("--playtime",
                        help="virtual playback start time",
                        type=float)
    args = parser.parse_args()

    lines = read_records(args.logfile)

    plot_stack(lines,
               chunk_length=args.chunklen,
               ystart=args.playtime,
               verbose=args.verbose,
               sort_by_completion_time=args.sort)
    plt.show()