def main():
    parser = OptionParser()
    parser.add_option("-s",
                      "--sample_accessions_file",
                      help="Path to JSON containing sample accessions")
    parser.add_option(
        "-m",
        "--metadata_input",
        help="JSON file mapping each sample to its raw key-value pairs")
    parser.add_option("-o",
                      "--pipeline_results_file",
                      help="File to which to write the matches.")
    (options, args) = parser.parse_args()

    with open(options.sample_accessions_file, 'r') as f:
        sample_accs = json.load(f)["sample_accessions"]

    with open(options.metadata_input, 'r') as f:
        sample_to_metadata = json.load(f)

    # Build the pipeline and run it over all samples
    pipeline_func = pipeline.build_pipeline()
    sample_acc_to_matches = run_pipeline(pipeline_func, sample_accs,
                                         sample_to_metadata)
    with open(options.pipeline_results_file, 'w') as f:
        f.write(
            json.dumps(sample_acc_to_matches,
                       sort_keys=True,
                       indent=4,
                       separators=(',', ': ')))
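run_pipeline is not defined in this excerpt. A minimal sketch of what it might look like, assuming pipeline_func maps one sample's raw key-value metadata to its matches (the body below is an assumption, not the original implementation):

def run_pipeline(pipeline_func, sample_accs, sample_to_metadata):
    # Hypothetical sketch: apply the pipeline to each sample's raw metadata
    # and collect the matches keyed by sample accession.
    sample_acc_to_matches = {}
    for sample_acc in sample_accs:
        metadata = sample_to_metadata.get(sample_acc, {})
        sample_acc_to_matches[sample_acc] = pipeline_func(metadata)
    return sample_acc_to_matches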
Example #2
def run(args):
    logger = logging.getLogger("dp")
    doc = Discourse()
    pipeline = build_pipeline(schema=args.schema,
                              segmenter_name=args.segmenter_name,
                              use_gpu=args.use_gpu)
    with open(args.source, "r", encoding=args.encoding) as source_fd:
        for line in tqdm.tqdm(source_fd,
                              desc="parsing %s" % args.source,
                              unit=" para"):
            line = line.strip()
            if line:
                para = pipeline(line)
                if args.draw:
                    para.draw()
                doc.append(para)
    logger.info("save parsing to %s" % args.save)
    doc.to_xml(args.save, encoding=args.encoding)
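run() reads schema, segmenter_name, use_gpu, source, encoding, draw and save from args. A minimal argparse sketch that would supply those attributes (flag names and defaults are assumptions):

import argparse

def parse_args():
    # Hypothetical parser covering the attributes run() reads above.
    parser = argparse.ArgumentParser(description="parse a text file with the discourse pipeline")
    parser.add_argument("--schema", required=True)
    parser.add_argument("--segmenter_name", required=True)
    parser.add_argument("--use_gpu", action="store_true")
    parser.add_argument("--source", required=True, help="input file, one paragraph per line")
    parser.add_argument("--save", required=True, help="path of the output XML")
    parser.add_argument("--encoding", default="utf-8")
    parser.add_argument("--draw", action="store_true", help="draw each parsed paragraph")
    return parser.parse_args()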
Example #3
def make_network2(input_shape=INPUT_SHAPE):
    return (NeuralNetwork()
            .input(input_shape)
            .conv([5, 5, 12])  # <== doubled
            .max_pool()
            .relu()
            .conv([5, 5, 32])  # <== doubled
            .max_pool()
            .relu()
            .flatten()
            .dense(240) # <== doubled
            .relu()
            .dense(N_CLASSES))

with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network2(), make_adam(1.0e-3))
    learning_curve = train_evaluate(pipeline)
    session.save('checkpoint/network2.ckpt')

show_learning_curve(learning_curve)
with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network2())
    session.load('checkpoint/network2.ckpt')
    pred = pipeline.predict(X_valid)
cm = confusion_matrix(y_valid, pred)
plot_confusion_matrix(cm)
print_confusion_matrix(cm)
X_new = np.array(glob.glob('images/*.ppm'))

new_images = [plt.imread(path) for path in X_new]

# (truncated: loop collecting per-run accuracies and printing their mean and std)


def make_network5(input_shape=INPUT_SHAPE):
    return (NeuralNetwork()
            .input(input_shape)
            .conv([5, 5, 24]).max_pool().elu()  # <== ELU
            .conv([5, 5, 64]).max_pool().elu()  # <== ELU
            .flatten().dense(480).elu()  # <== ELU
            .dense(N_CLASSES))


with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network5(),
                              make_adam(0.5e-3))
    learning_curve = train_evaluate(pipeline, epochs=20)
    session.save('checkpoint/network5.ckpt')

show_learning_curve(learning_curve)

with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network5())
    session.load('checkpoint/network5.ckpt')
    pred = pipeline.predict(X_valid)
cm = confusion_matrix(y_valid, pred)
plot_confusion_matrix(cm)
print_confusion_matrix(cm)
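plot_confusion_matrix and print_confusion_matrix are helpers defined elsewhere; a rough matplotlib sketch of what they might do (the exact formatting is an assumption):

import matplotlib.pyplot as plt

def plot_confusion_matrix(cm):
    # Hypothetical sketch: show the confusion matrix as a heat map.
    plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

def print_confusion_matrix(cm):
    # Hypothetical sketch: dump the raw counts row by row.
    for row in cm:
        print(' '.join('{:4d}'.format(count) for count in row))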
def evaluate(args):
    pipeline = build_pipeline(schema=args.schema,
                              segmenter_name=args.segmenter_name,
                              use_gpu=args.use_gpu)
    cdtb = CDTB(args.data,
                "TRAIN",
                "VALIDATE",
                "TEST",
                ctb_dir=args.ctb_dir,
                preprocess=True,
                cache_dir=args.cache_dir)
    golds = list(filter(lambda d: d.root_relation(), chain(*cdtb.test)))
    parses = []

    if args.use_gold_edu:
        logger.info("evaluation with gold edu segmentation")
    else:
        logger.info("evaluation with auto edu segmentation")

    for para in tqdm(golds, desc="parsing", unit=" para"):
        if args.use_gold_edu:
            edus = []
            for edu in para.edus():
                edu_copy = EDU([TEXT(edu.text)])
                setattr(edu_copy, "words", edu.words)
                setattr(edu_copy, "tags", edu.tags)
                edus.append(edu_copy)
        else:
            sentences = []
            for sentence in para.sentences():
                if list(sentence.iterfind(node_type_filter(EDU))):
                    copy_sentence = Sentence([TEXT([sentence.text])])
                    if hasattr(sentence, "words"):
                        setattr(copy_sentence, "words", sentence.words)
                    if hasattr(sentence, "tags"):
                        setattr(copy_sentence, "tags", sentence.tags)
                    setattr(copy_sentence, "parse", cdtb.ctb[sentence.sid])
                    sentences.append(copy_sentence)
            para = pipeline.cut_edu(Paragraph(sentences))
            edus = []
            for edu in para.edus():
                edu_copy = EDU([TEXT(edu.text)])
                setattr(edu_copy, "words", edu.words)
                setattr(edu_copy, "tags", edu.tags)
                edus.append(edu_copy)
        parse = pipeline.parse(Paragraph(edus))
        parses.append(parse)

    # edu score
    scores = edu_eval(golds, parses)
    logger.info("EDU segmentation scores:")
    logger.info(gen_edu_report(scores))

    # parser score
    cdtb_macro_scores = eval.parse_eval(parses, golds, average="macro")
    logger.info("CDTB macro (strict) scores:")
    logger.info(eval.gen_parse_report(*cdtb_macro_scores))

    # nuclear scores
    nuclear_scores = eval.nuclear_eval(parses, golds)
    logger.info("nuclear scores:")
    logger.info(eval.gen_category_report(nuclear_scores))

    # relation scores
    ctype_scores, ftype_scores = eval.relation_eval(parses, golds)
    logger.info("coarse relation scores:")
    logger.info(eval.gen_category_report(ctype_scores))
    logger.info("fine relation scores:")
    logger.info(eval.gen_category_report(ftype_scores))

    # height eval
    height_scores = eval.height_eval(parses, golds)
    logger.info("structure precision by node height:")
    logger.info(eval.gen_height_report(height_scores))
Example #6
print('-' * 80)
print('New Images for Random Testing')
print('-' * 80)

plt.figure(figsize=(15, 5))
for i, image in enumerate(new_images):
    plt.subplot(2, len(X_new) // 2, i + 1)
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
plt.show()

print('getting top 5 results')

with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network3())
    session.load('checkpoint/network3_e-100_lr-1.0e-4.ckpt')
    prob = pipeline.predict_proba(X_new)
    estimator = pipeline.steps[-1][1]
    top_5_prob, top_5_pred = estimator.top_k_

print('done')
print('-' * 80)
print('Top 5 Predictions')
print('-' * 80)

for i, (preds, probs,
        image) in enumerate(zip(top_5_pred, top_5_prob, new_images)):
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
Example #7
ACCESS_LOG_INPUT = 'sample_data/unit-test2.log'
EVIL_IP_INPUT = 'sample_data/ip-list.txt'

EVIL_REPORT_OUTPUT_FOLDER = 'report_output'

## Create and Configure Spark Context and Session
(sc, spark) = utils.gen_spark_context('LogProcessor', local=False)

## Configure Input
access_log_rdd = sc.textFile(ACCESS_LOG_INPUT)
evil_ip_rdd = sc.textFile(EVIL_IP_INPUT)

## Call the pipeline
pipeline = pipeline.LogProcessorPipeline(sc, spark)
(stat_df, evil_ip_report_df) = pipeline.build_pipeline(access_log_rdd,
                                                       evil_ip_rdd)

## Configure Output
# spark.conf.set('spark.sql.shuffle.partitions', 100)

stat_df.write \
    .format('jdbc') \
    .option('url', 'jdbc:mysql://localhost/spark_test') \
    .option('dbtable', 'log_report') \
    .option('user', 'spark') \
    .option('driver', 'com.mysql.jdbc.Driver') \
    .option('password', 'spark123') \
    .option('numPartitions', '1') \
    .save()

# spark.conf.set('spark.sql.shuffle.partitions', 300)
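utils.gen_spark_context is not shown; a plausible sketch, assuming it wraps the standard SparkConf/SparkContext/SparkSession setup (the implementation below is an assumption):

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

def gen_spark_context(app_name='LogProcessor', local=True):
    # Hypothetical helper: build a SparkContext and a SparkSession,
    # optionally bound to the local master.
    conf = SparkConf().setAppName(app_name)
    if local:
        conf = conf.setMaster('local[*]')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    return sc, spark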
Example #8
def main():
    df = load_data()
    X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
    y = df[['Species']]
    pipe = fit_model(build_pipeline(X), X, y)
    persist_model(pipe)
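build_pipeline, fit_model and persist_model are defined elsewhere; a minimal scikit-learn sketch of what they might look like (the estimator choice and output path are assumptions):

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def build_pipeline(X):
    # Hypothetical: scale the four iris measurements, then classify.
    return Pipeline([('scale', StandardScaler()),
                     ('clf', LogisticRegression(max_iter=1000))])

def fit_model(pipe, X, y):
    return pipe.fit(X, y.values.ravel())

def persist_model(pipe, path='model.joblib'):
    joblib.dump(pipe, path)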
Example #9
CATEGORICAL_FEATURES = [
    'place', 'manv', 'circ', 'lum', 'catv', 'obsm', 'infra', 'agg', 'atm',
    'catr', 'situ', 'obs', 'vosp', 'catu', 'int', 'trajet', 'sexe', 'plan',
    'choc', 'col'
]

CYCLICAL_FEATURES = ['hour_of_day', 'mois']

##################### WORKFLOW #####################

# Load data
data = build_dataset(DATAPATH)
data_cleaned = clean_dataset(data, USELESS_FEATURES)

# Hold-out
X, y = data_cleaned.drop(columns=['grav']), binarize_target(
    data_cleaned['grav'])
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.3)

# Preprocessing and fitting pipeline
pipe = build_pipeline(numerical_features=NUMERICAL_FEATURES,
                      categorical_features=CATEGORICAL_FEATURES,
                      cyclical_features=CYCLICAL_FEATURES)

pipe.fit(X_train, y_train)

# Evaluation
print(classification_report(y_test, pipe.predict(X_test)))
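The build_pipeline used here is defined elsewhere; a rough scikit-learn sketch of how such a pipeline could combine the three feature groups, with the classifier and the sin/cos encoding of cyclical features as assumptions:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

def encode_cyclical(values):
    # Map cyclical values (hour of day, month) onto sin/cos coordinates so
    # that 23h and 0h end up close together. Using the observed maximum as
    # the period is a simplification.
    radians = 2 * np.pi * values / values.max()
    return np.concatenate([np.sin(radians), np.cos(radians)], axis=1)

def build_pipeline(numerical_features, categorical_features, cyclical_features):
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('cyc', FunctionTransformer(encode_cyclical), cyclical_features),
    ])
    return Pipeline([('preprocess', preprocessor),
                     ('clf', RandomForestClassifier())])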
Example #10
# OUTPUT_FOLDER = 's3://...'
OUTPUT_FOLDER_LOG = 'output'
OUTPUT_FOLDER_STATS = 'output'

# Constructing the Spark Context ...
sc = utils.gen_spark_context(local=False)
spark = SparkSession(sc)

# Handling input
# input_rdd = sc.parallelize([ ... ])
input_rdd = sc.textFile(INPUT_FOLDER)
evil_ip_list_rdd = sc.textFile(EVIL_IP_INPUT)

# Building the pipeline
pipeline = pipeline.LogProcessorPipeline(sc, spark)
(log_df, stat_df, alarm_df, malicious_ip_df) = pipeline.build_pipeline(input_rdd, evil_ip_list_rdd)

# Writing down the data
log_df.write \
    .format('parquet') \
    .mode('overwrite') \
    .partitionBy('date') \
    .save(OUTPUT_FOLDER_LOG)
   # .saveAsTable('table_name')
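For an ad-hoc check, the partitioned output can be read back; a short sketch (the filter value is just an example):

# Hypothetical read-back of the partitioned log output.
logs_back = spark.read.parquet(OUTPUT_FOLDER_LOG)
logs_back.filter(logs_back.date == '2020-01-01').show(10)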

#stat_df.write \
#    .format('jdbc') \
#    .option('url', 'jdbc:mysql://localhost/spark_test') \
#    .option('dbtable', 'log_report') \
#    .option('user', 'spark') \
#    .option('driver', 'com.mysql.jdbc.Driver') \