def main():
    parser = OptionParser()
    parser.add_option("-s", "--sample_accessions_file",
                      help="Path to JSON containing sample accessions")
    parser.add_option("-m", "--metadata_input",
                      help="JSON file mapping each sample to its raw key-value pairs")
    parser.add_option("-o", "--pipeline_results_file",
                      help="File to which to write the matches.")
    (options, args) = parser.parse_args()

    # Load the sample accessions and their raw metadata
    with open(options.sample_accessions_file, 'r') as f:
        sample_accs = json.load(f)["sample_accessions"]
    with open(options.metadata_input, 'r') as f:
        sample_to_metadata = json.load(f)

    # Build the pipeline and run it over every sample
    pipeline_func = pipeline.build_pipeline()
    sample_acc_to_matches = run_pipeline(pipeline_func, sample_accs, sample_to_metadata)

    # Write the matches as pretty-printed JSON
    with open(options.pipeline_results_file, 'w') as f:
        f.write(json.dumps(sample_acc_to_matches, sort_keys=True, indent=4,
                           separators=(',', ': ')))
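# A minimal sketch of the run_pipeline helper called above, assuming
# pipeline_func maps one sample's raw key-value metadata to its matches
# (this signature is an assumption, not the original implementation).
def run_pipeline(pipeline_func, sample_accs, sample_to_metadata):
    sample_acc_to_matches = {}
    for sample_acc in sample_accs:
        metadata = sample_to_metadata.get(sample_acc, {})
        sample_acc_to_matches[sample_acc] = pipeline_func(metadata)
    return sample_acc_to_matches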
def run(args): logger = logging.getLogger("dp") doc = Discourse() pipeline = build_pipeline(schema=args.schema, segmenter_name=args.segmenter_name, use_gpu=args.use_gpu) with open(args.source, "r", encoding=args.encoding) as source_fd: for line in tqdm.tqdm(source_fd, desc="parsing %s" % args.source, unit=" para"): line = line.strip() if line: para = pipeline(line) if args.draw: para.draw() doc.append(para) logger.info("save parsing to %s" % args.save) doc.to_xml(args.save, encoding=args.encoding)
def make_network2(input_shape=INPUT_SHAPE):
    return (NeuralNetwork()
            .input(input_shape)
            .conv([5, 5, 12])   # <== doubled
            .max_pool()
            .relu()
            .conv([5, 5, 32])   # <== doubled
            .max_pool()
            .relu()
            .flatten()
            .dense(240)         # <== doubled
            .relu()
            .dense(N_CLASSES))


with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network2(), make_adam(1.0e-3))
    learning_curve = train_evaluate(pipeline)
    session.save('checkpoint/network2.ckpt')

show_learning_curve(learning_curve)

with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network2())
    session.load('checkpoint/network2.ckpt')
    pred = pipeline.predict(X_valid)

cm = confusion_matrix(y_valid, pred)
plot_confusion_matrix(cm)
print_confusion_matrix(cm)

# Load the new test images (used for the top-5 predictions below)
X_new = np.array(glob.glob('images/*.ppm'))
new_images = [plt.imread(path) for path in X_new]
        sum(y_train == result[0])))
    accuracies.append(result[2])

print('-' * 50)
print('Accuracy: Mean: {:.3f} Std: {:.3f}'.format(np.mean(accuracies), np.std(accuracies)))


def make_network5(input_shape=INPUT_SHAPE):
    return (NeuralNetwork()
            .input(input_shape)
            .conv([5, 5, 24])
            .max_pool()
            .elu()              # <== ELU
            .conv([5, 5, 64])
            .max_pool()
            .elu()              # <== ELU
            .flatten()
            .dense(480)
            .elu()              # <== ELU
            .dense(N_CLASSES))


with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network5(), make_adam(0.5e-3))
    learning_curve = train_evaluate(pipeline, epochs=20)
    session.save('checkpoint/network5.ckpt')

show_learning_curve(learning_curve)

with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network5())
    session.load('checkpoint/network5.ckpt')
    pred = pipeline.predict(X_valid)

cm = confusion_matrix(y_valid, pred)
plot_confusion_matrix(cm)
print_confusion_matrix(cm)
def evaluate(args):
    pipeline = build_pipeline(schema=args.schema,
                              segmenter_name=args.segmenter_name,
                              use_gpu=args.use_gpu)
    cdtb = CDTB(args.data, "TRAIN", "VALIDATE", "TEST",
                ctb_dir=args.ctb_dir, preprocess=True, cache_dir=args.cache_dir)
    golds = list(filter(lambda d: d.root_relation(), chain(*cdtb.test)))
    parses = []

    if args.use_gold_edu:
        logger.info("evaluation with gold edu segmentation")
    else:
        logger.info("evaluation with auto edu segmentation")

    for para in tqdm(golds, desc="parsing", unit=" para"):
        if args.use_gold_edu:
            edus = []
            for edu in para.edus():
                edu_copy = EDU([TEXT(edu.text)])
                setattr(edu_copy, "words", edu.words)
                setattr(edu_copy, "tags", edu.tags)
                edus.append(edu_copy)
        else:
            sentences = []
            for sentence in para.sentences():
                if list(sentence.iterfind(node_type_filter(EDU))):
                    copy_sentence = Sentence([TEXT([sentence.text])])
                    if hasattr(sentence, "words"):
                        setattr(copy_sentence, "words", sentence.words)
                    if hasattr(sentence, "tags"):
                        setattr(copy_sentence, "tags", sentence.tags)
                    setattr(copy_sentence, "parse", cdtb.ctb[sentence.sid])
                    sentences.append(copy_sentence)
            para = pipeline.cut_edu(Paragraph(sentences))
            edus = []
            for edu in para.edus():
                edu_copy = EDU([TEXT(edu.text)])
                setattr(edu_copy, "words", edu.words)
                setattr(edu_copy, "tags", edu.tags)
                edus.append(edu_copy)
        parse = pipeline.parse(Paragraph(edus))
        parses.append(parse)

    # EDU segmentation scores
    scores = edu_eval(golds, parses)
    logger.info("EDU segmentation scores:")
    logger.info(gen_edu_report(scores))

    # parser scores
    cdtb_macro_scores = eval.parse_eval(parses, golds, average="macro")
    logger.info("CDTB macro (strict) scores:")
    logger.info(eval.gen_parse_report(*cdtb_macro_scores))

    # nuclear scores
    nuclear_scores = eval.nuclear_eval(parses, golds)
    logger.info("nuclear scores:")
    logger.info(eval.gen_category_report(nuclear_scores))

    # relation scores
    ctype_scores, ftype_scores = eval.relation_eval(parses, golds)
    logger.info("coarse relation scores:")
    logger.info(eval.gen_category_report(ctype_scores))
    logger.info("fine relation scores:")
    logger.info(eval.gen_category_report(ftype_scores))

    # structure precision by node height
    height_scores = eval.height_eval(parses, golds)
    logger.info("structure precision by node height:")
    logger.info(eval.gen_height_report(height_scores))
print('-' * 80)
print('New Images for Random Testing')
print('-' * 80)

plt.figure(figsize=(15, 5))
for i, image in enumerate(new_images):
    plt.subplot(2, len(X_new) // 2, i + 1)
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
plt.show()

print('getting top 5 results')
with Session() as session:
    pipeline = build_pipeline(preprocessors, session, make_network3())
    session.load('checkpoint/network3_e-100_lr-1.0e-4.ckpt')
    prob = pipeline.predict_proba(X_new)
    estimator = pipeline.steps[-1][1]
    top_5_prob, top_5_pred = estimator.top_k_
print('done')

print('-' * 80)
print('Top 5 Predictions')
print('-' * 80)
for i, (preds, probs, image) in enumerate(zip(top_5_pred, top_5_prob, new_images)):
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
ACCESS_LOG_INPUT = 'sample_data/unit-test2.log'
EVIL_IP_INPUT = 'sample_data/ip-list.txt'
EVIL_REPORT_OUTPUT_FOLDER = 'report_output'

## Create and Configure Spark Context and Session
(sc, spark) = utils.gen_spark_context('LogProcessor', local=False)

## Configure Input
access_log_rdd = sc.textFile(ACCESS_LOG_INPUT)
evil_ip_rdd = sc.textFile(EVIL_IP_INPUT)

## Call the pipeline
pipeline = pipeline.LogProcessorPipeline(sc, spark)
(stat_df, evil_ip_report_df) = pipeline.build_pipeline(access_log_rdd, evil_ip_rdd)

## Configure Output
# spark.conf.set('spark.sql.shuffle.partitions', 100)
stat_df.write \
    .format('jdbc') \
    .option('url', 'jdbc:mysql://localhost/spark_test') \
    .option('dbtable', 'log_report') \
    .option('user', 'spark') \
    .option('driver', 'com.mysql.jdbc.Driver') \
    .option('password', 'spark123') \
    .option('numPartitions', '1') \
    .save()
# spark.conf.set('spark.sql.shuffle.partitions', 300)
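## A minimal sketch of a read-back sanity check against the same MySQL table;
## reusing the URL and credentials above is an assumption, and this step is
## not part of the original job.
check_df = spark.read \
    .format('jdbc') \
    .option('url', 'jdbc:mysql://localhost/spark_test') \
    .option('dbtable', 'log_report') \
    .option('user', 'spark') \
    .option('password', 'spark123') \
    .option('driver', 'com.mysql.jdbc.Driver') \
    .load()
check_df.show(10, truncate=False)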
def main():
    df = load_data()
    X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
    y = df[['Species']]
    pipe = fit_model(build_pipeline(X), X, y)
    persist_model(pipe)
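# A minimal sketch of what build_pipeline, fit_model and persist_model could
# look like for this Iris example. Only the function names come from the
# snippet above; the scaler, classifier and joblib persistence are assumptions.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib


def build_pipeline(X):
    # Scale the four numeric columns, then fit a simple classifier.
    return Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000)),
    ])


def fit_model(pipe, X, y):
    # y is a single-column DataFrame; ravel it to a 1-D array for scikit-learn.
    pipe.fit(X, y.values.ravel())
    return pipe


def persist_model(pipe, path='model.joblib'):
    joblib.dump(pipe, path)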
    'place', 'manv', 'circ', 'lum', 'catv', 'obsm', 'infra', 'agg', 'atm',
    'catr', 'situ', 'obs', 'vosp', 'catu', 'int', 'trajet', 'sexe', 'plan',
    'choc', 'col'
]
CYCLICAL_FEATURES = ['hour_of_day', 'mois']

##################### WORKFLOW #####################

# Load data
data = build_dataset(DATAPATH)
data_cleaned = clean_dataset(data, USELESS_FEATURES)

# Hold-out
X, y = data_cleaned.drop(columns=['grav']), binarize_target(data_cleaned['grav'])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

# Preprocessing and fitting pipeline
pipe = build_pipeline(numerical_features=NUMERICAL_FEATURES,
                      categorical_features=CATEGORICAL_FEATURES,
                      cyclical_features=CYCLICAL_FEATURES)
pipe.fit(X_train, y_train)

# Evaluation (ground truth first, then predictions)
print(classification_report(y_test, pipe.predict(X_test)))
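# A minimal sketch of the sin/cos encoding build_pipeline could apply to the
# cyclical features (hour_of_day, mois). The periods and the FunctionTransformer
# approach are assumptions, not the project's actual implementation.
import numpy as np
from sklearn.preprocessing import FunctionTransformer


def make_cyclical_encoder(period):
    # Map a value v in [0, period) onto the unit circle so that, e.g.,
    # hour 23 and hour 0 end up close together after encoding.
    def encode(values):
        radians = 2 * np.pi * np.asarray(values, dtype=float).reshape(-1, 1) / period
        return np.concatenate([np.sin(radians), np.cos(radians)], axis=1)
    return FunctionTransformer(encode)


# Example: hours follow a 24-value cycle, months a 12-value cycle.
hour_encoder = make_cyclical_encoder(period=24)
month_encoder = make_cyclical_encoder(period=12)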
# OUTPUT_FOLDER = 's3://...'
OUTPUT_FOLDER_LOG = 'output'
OUTPUT_FOLDER_STATS = 'output'

# Constructing the Spark Context ...
sc = utils.gen_spark_context(local=False)
spark = SparkSession(sc)

# Handling input
# input_rdd = sc.parallelize([ ... ])
input_rdd = sc.textFile(INPUT_FOLDER)
evil_ip_list_rdd = sc.textFile(EVIL_IP_INPUT)

# Building the pipeline
pipeline = pipeline.LogProcessorPipeline(sc, spark)
(log_df, stat_df, alarm_df, malicious_ip_df) = pipeline.build_pipeline(input_rdd, evil_ip_list_rdd)

# Writing down the data
log_df.write \
    .format('parquet') \
    .mode('overwrite') \
    .partitionBy('date') \
    .save(OUTPUT_FOLDER_LOG)  # .saveAsTable('table_name')

# stat_df.write \
#     .format('jdbc') \
#     .option('url', 'jdbc:mysql://localhost/spark_test') \
#     .option('dbtable', 'log_report') \
#     .option('user', 'spark') \
#     .option('driver', 'com.mysql.jdbc.Driver') \