def test_prediction_on_classifiers(X: pd.DataFrame, y: pd.Series, cv_sets=None, test_prefix=None):
    if cv_sets is None:
        cv = 10
        print("Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"Using {len(cv_sets)}x preselected train/test sets...")

    results = pd.DataFrame()
    for clf_name, clf in ML_CLASSIFIERS.items():
        # time only the cross-validation runs
        timer = Timer()
        with timer:
            scores = cross_validate(clf, X, y, scoring=scoring, cv=cv, n_jobs=-1)
        print(
            "Accuracy: %0.2f (+/- %0.2f) <-- %s"
            % (
                scores["test_accuracy"].mean(),
                scores["test_accuracy"].std() * 2,
                clf_name,
            )
        )
        # one row per CV fold, plus the method name and the wall-clock time
        data = {
            score_type: scores[score_field]
            for score_type, score_field in zip(scoring, score_fields)
        }
        data["method"] = clf_name if test_prefix is None else test_prefix + clf_name
        data["time"] = timer.interval
        results = pd.concat([results, pd.DataFrame(data)], ignore_index=True)
    return results
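# For context, a minimal sketch of the module-level names the function above
# relies on. The values here are illustrative assumptions, not this repo's
# actual definitions: `scoring` lists the metric names passed to
# cross_validate(), and `score_fields` the matching keys in its results dict.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

ML_CLASSIFIERS = {
    "DT": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
}
scoring = ["accuracy", "f1_macro"]
score_fields = ["test_accuracy", "test_f1_macro"]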
def train_provenance_kernel_pipeline(
    graphs: pd.DataFrame,
    output_path: Path,
    kernel: str,
    level: int,
    y_column: str,
    including_edge_type_counts: bool = False,
) -> Pipeline:
    X, y = load_kernel_ml_data(
        graphs, output_path, kernel, level, y_column, including_edge_type_counts
    )
    clf = Pipeline(
        [
            ("scale", StandardScaler(with_mean=False)),
            ("svm", SVC(kernel="rbf", gamma="scale", class_weight="balanced")),
        ]
    )
    # search over the SVM's C parameter and refit the best pipeline on all the data
    gs = GridSearchCV(
        estimator=clf,
        param_grid={
            "svm__C": SVM_C_PARAMS,
        },
        refit=True,
        n_jobs=-1,
    )
    with Timer():
        gs.fit(X, y)
    clf = gs.best_estimator_
    print(" - Best params:", gs.best_params_)
    print(" - Best score:", gs.best_score_)
    print(" - Accuracy:", clf.score(X, y))  # accuracy of the refitted model on the full data
    return clf
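# SVM_C_PARAMS is the grid searched above; a typical log-scale grid is assumed
# here for illustration. The best C is chosen by GridSearchCV's default cross
# validation (5-fold in recent scikit-learn) before the refit.
SVM_C_PARAMS = [0.1, 1, 10, 100, 1000]

# Hypothetical usage (the kernel name and "label" column are illustrative):
# pipeline = train_provenance_kernel_pipeline(
#     graphs, Path("outputs"), kernel="summary", level=2, y_column="label"
# )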
def test_prediction_on_Grakel_kernels(
    graphs: pd.DataFrame, y_column: str, cv_sets=None, ignore_kernels=None
):
    if cv_sets is None:
        cv = 10
        print("Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"Using {len(cv_sets)}x preselected train/test sets...")
    if ignore_kernels is None:
        ignore_kernels = set()

    results = pd.DataFrame()
    for method_id, gk_class in GRAKEL_KERNELS.items():
        if method_id in ignore_kernels:
            logger.info("Skipping testing kernel: %s", method_id)
            continue
        logger.info("Testing graph kernel: %s", method_id)
        print("Testing GraKeL kernel:", method_id)
        gk = gk_class()
        has_timed_out = False
        try:
            timer = Timer(timeout=TIMEOUT)
            with timer:
                # TODO: break if timed out
                # only time the kerneling cost
                X = gk.fit_transform(graphs.grakel_graphs)
        except TimeoutException:
            has_timed_out = True
            print("*** TIMED OUT - %s ***" % method_id)
        if not has_timed_out:
            # classify on the precomputed kernel matrix
            clf = SVC(kernel="precomputed", gamma="scale", class_weight="balanced")
            scores = cross_validate(
                clf, X, graphs[y_column], scoring=scoring, cv=cv, n_jobs=-1
            )
            print(
                "Accuracy: %0.2f (+/- %0.2f) <-- %s"
                % (
                    scores["test_accuracy"].mean(),
                    scores["test_accuracy"].std() * 2,
                    method_id,
                )
            )
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            results = pd.concat([results, pd.DataFrame(data)], ignore_index=True)
    return results
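# A sketch of the GRAKEL_KERNELS mapping iterated above, assuming it maps a
# method id to a GraKeL kernel class (illustrative subset only), plus an
# assumed timeout constant. graphs.grakel_graphs is expected to hold
# grakel.Graph objects ready for fit_transform().
from grakel.kernels import ShortestPath, WeisfeilerLehman

GRAKEL_KERNELS = {
    "SP": ShortestPath,
    "WL": WeisfeilerLehman,
}
TIMEOUT = 2 * 3600  # seconds; assumed per-kernel time budget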
def test_prediction_on_classifiers(
    X: pd.DataFrame, output_path: Path, y: pd.Series, cv_sets=None, test_prefix=None
):
    if cv_sets is None:
        cv = 10
        print("> Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"> Using {len(cv_sets)}x preselected train/test sets...")

    results = pd.DataFrame()
    for clf_name, clf in ML_CLASSIFIERS.items():
        method_id = clf_name if test_prefix is None else test_prefix + clf_name
        # load existing scorings, if any, to avoid rerunning the experiment
        scorings = load_experiment_scorings(output_path, method_id)
        if scorings is None:
            print("> Testing ML method:", method_id)
            timer = Timer()
            with timer:
                scores = cross_validate(clf, X, y, scoring=scoring, cv=cv, n_jobs=-1)
            print(
                " - Accuracy: %0.2f (+/- %0.2f) <-- %s"
                % (
                    scores["test_accuracy"].mean(),
                    scores["test_accuracy"].std() * 2,
                    clf_name,
                )
            )
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            scorings = pd.DataFrame(data)
            save_experiment_scorings(output_path, method_id, scorings)
        results = pd.concat([results, scorings], ignore_index=True)
    return results
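# A minimal sketch of the scoring cache the function above relies on, assuming
# one CSV of fold scores per method id under output_path. The real helpers are
# defined elsewhere in this repo and may differ.
def load_experiment_scorings(output_path: Path, method_id: str):
    cache_file = output_path / f"scorings_{method_id}.csv"
    return pd.read_csv(cache_file) if cache_file.exists() else None

def save_experiment_scorings(output_path: Path, method_id: str, scorings: pd.DataFrame):
    output_path.mkdir(parents=True, exist_ok=True)
    scorings.to_csv(output_path / f"scorings_{method_id}.csv", index=False)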
def calculate_provenance_features_for_file(filepath: Path) -> list:
    # Calculate Provenance Network Metrics (22) and number of edge types
    try:
        # load the file
        prov_doc = ProvDocument.deserialize(filepath)
    except Exception as e:
        logger.error("Cannot deserialize %s", filepath)
        raise e
    try:
        timer = Timer(verbose=False)
        with timer:
            # counting the record types
            rec_type_counts = count_record_types(prov_doc)
            prov_rel_cols = [
                rec_type_counts.get(rec_type, 0) for rec_type in PROV_RELATION_NAMES
            ]
            # calculate the network metrics
            mv5 = version5(prov_doc, flat=True)
        return mv5[:-4] + prov_rel_cols + [timer.interval]
    except Exception as e:
        logger.error("Cannot calculate metrics for %s", filepath)
        raise e
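# A sketch of the record-type counting assumed above: a map from each PROV
# record's type name to its frequency in the document. Illustrative only; the
# repo's count_record_types() and its exact type names may differ.
from collections import Counter

def count_record_types_sketch(prov_doc: ProvDocument) -> Counter:
    return Counter(record.get_type().localpart for record in prov_doc.get_records())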
def test_prediction_on_Grakel_kernels(
    graphs: pd.DataFrame,
    output_path: Path,
    y_column: str,
    cv_sets=None,
    ignore_kernels=None,
):
    if cv_sets is None:
        cv = 10
        print("> Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"> Using {len(cv_sets)}x preselected train/test sets...")
    if ignore_kernels is None:
        ignore_kernels = set()

    results = pd.DataFrame()
    for method_id, gk_class in GRAKEL_KERNELS.items():
        if method_id in ignore_kernels:
            logger.info("Skipping testing kernel: %s", method_id)
            continue
        # load existing scorings, if any, to avoid rerunning the experiment
        scorings = load_experiment_scorings(output_path, method_id)
        if scorings is None:
            # run the experiment
            logger.info("Testing graph kernel: %s", method_id)
            print("> Testing GraKeL kernel:", method_id)
            gk = gk_class()
            failed = False
            try:
                timer = Timer(timeout=TIMEOUT)
                with timer:
                    # TODO: break if timed out
                    # only time the kerneling cost
                    X = gk.fit_transform(graphs.grakel_graphs)
            except TimeoutException:
                failed = True
                print("*** TIMED OUT - %s ***" % method_id)
            except Exception as e:
                failed = True
                print(f"*** EXCEPTION - {method_id} ***\n{e}")
            if failed:
                # skip this, go to the next experiment
                continue
            # classify on the precomputed kernel matrix, tuning C in each fold
            clf = SVC(kernel="precomputed", gamma="scale", class_weight="balanced")
            gs = GridSearchCV(
                estimator=clf,
                param_grid={
                    "C": SVM_C_PARAMS,
                },
            )
            scores = cross_validate(
                gs, X, graphs[y_column], scoring=scoring, cv=cv, n_jobs=-1
            )
            print(
                " - Accuracy: %0.2f (+/- %0.2f) <-- %s"
                % (
                    scores["test_accuracy"].mean(),
                    scores["test_accuracy"].std() * 2,
                    method_id,
                )
            )
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            scorings = pd.DataFrame(data)
            save_experiment_scorings(output_path, method_id, scorings)
        results = pd.concat([results, scorings], ignore_index=True)
    return results
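# cv_sets, when supplied to the test functions above, is an explicit list of
# (train_indices, test_indices) pairs, which cross_validate() accepts directly
# as its cv argument. A sketch of building such splits; the splitter and its
# parameters are assumptions for illustration.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def make_cv_sets(y, n_splits=10, test_size=0.2, random_state=0):
    splitter = StratifiedShuffleSplit(
        n_splits=n_splits, test_size=test_size, random_state=random_state
    )
    # X is only used for its length here, so a placeholder array suffices
    return list(splitter.split(np.zeros(len(y)), y))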
import re
from urllib.parse import urlsplit

from ural.patterns import DOMAIN_TEMPLATE

from scripts.utils import Timer

N = 1_000_000
URL = 'http://www.lemonde.fr:8000/article/1234/index.html?query=mobile#2'

# benchmark: full URL parsing vs. a domain-matching regex
with Timer('urlsplit'):
    for _ in range(N):
        parsed = urlsplit(URL)
        parsed.hostname

pattern = re.compile(DOMAIN_TEMPLATE % r'lemonde\.fr')

with Timer('regex'):
    for _ in range(N):
        parsed = pattern.match(URL)
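# A minimal sketch of the Timer context manager assumed by these benchmarks.
# The real one lives in scripts/utils.py and also supports a timeout (see the
# kernel experiments above); this illustrative version only measures and
# prints wall-clock time, exposing it as .interval.
import time

class TimerSketch:
    def __init__(self, name=None, verbose=True):
        self.name = name
        self.verbose = verbose

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc_info):
        self.interval = time.perf_counter() - self.start
        if self.verbose:
            print(f"{self.name or 'Timer'}: {self.interval:.3f}s")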
import csv

from scripts.utils import Timer

class SwiftCSVReader(object):
    # wraps csv.reader rows with CSVLine (assumed defined elsewhere; see the
    # sketch below) for header-based lookup without building a dict per row
    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        return self

    def __next__(self):
        return CSVLine(next(self.reader))

with Timer('reader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in csv.reader(f):
            line[1]

with Timer('DictReader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in csv.DictReader(f):
            line['youtube_url']

with Timer('SwiftCSVReader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in SwiftCSVReader(csv.reader(f)):
            line['youtube_url']
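# A sketch of the CSVLine wrapper SwiftCSVReader assumes: header-name lookup
# over a raw csv.reader row without the per-row dict that DictReader builds.
# The column positions here are illustrative assumptions.
class CSVLine(object):
    HEADERS = {'youtube_url': 1}  # assumed: the url is the second column

    def __init__(self, row):
        self.row = row

    def __getitem__(self, key):
        return self.row[self.HEADERS[key]]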