Example #1
def get_features_extr(features_str_list, verbose=1):
    '''
    Returns a FeatureUnion object containing all the feature extractors
    referenced in features_str_list.
    '''
    features_str_list = features_str_list.split("+")
    feat_extr_list = []
    # final feature extractor name
    feat_extr_union_name = ""

    if verbose:
        print("Loading feature extractors ... ")
    
    # load each feature vectorizer and build the union;
    # each sub-extractor is named after its final estimator
    for feat_extr_str in features_str_list:
        feat_extr = load_features_extr(feat_extr_str, verbose)
        feat_extr_pipe_name = feat_extr[-1][0]
        feat_extr_pipe = get_pipeline(
            features_extr=feat_extr,
            classifier=None,
            verbose=verbose > 2
            )
        feat_extr_list.append((feat_extr_pipe_name, feat_extr_pipe))
        feat_extr_union_name += "+" + feat_extr_pipe_name
        
    feat_extr_union_name = feat_extr_union_name[1:]
    feat_extr_union = FeatureUnion(feat_extr_list)
    res = (feat_extr_union_name, feat_extr_union)
    
    if verbose:
        print("feature extractors loaded: " + feat_extr_union_name + "\n")
    return res
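For context, a hypothetical call (the extractor names "tfidf" and "ngram" are placeholders; the real names depend on what load_features_extr recognizes):

# "tfidf" and "ngram" are placeholder extractor names.
union_name, union = get_features_extr("tfidf+ngram", verbose=0)
print(union_name)  # the "+"-joined names of the sub-pipelines
# `union` is a sklearn.pipeline.FeatureUnion with one sub-pipeline per name.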
Example #2
def get_prepared_data(payload):
    # num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
    #     'total_bedrooms', 'population', 'households', 'median_income']

    full_pipeline = get_pipeline()

    prepared_data = full_pipeline.transform(payload)
    return prepared_data
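get_pipeline itself is not shown here; since transform() is called directly on the result, it presumably returns a preprocessing pipeline already fitted on the training data. A minimal sketch under that assumption, using scikit-learn (the artifact path and the fitting snippet are hypothetical):

import joblib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# One-time, at training: fit the numeric preprocessing (num_attribs as in
# the commented-out list above) and persist it, e.g.:
#   full_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")),
#                             ("scaler", StandardScaler())])
#   full_pipeline.fit(train_df[num_attribs])
#   joblib.dump(full_pipeline, "full_pipeline.joblib")

def get_pipeline():
    # At serving time, reload the fitted pipeline; an unfitted Pipeline
    # would raise NotFittedError on transform().
    return joblib.load("full_pipeline.joblib")  # hypothetical artifact path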
Example #3
    def set_pipeline(self):
        Pipeline = get_pipeline(self.dataset, self.config.suffix)
        pipe = Pipeline(self.output_size,
                        self.c_dim,
                        self.real_batch_size,
                        os.path.join(self.data_dir, self.dataset),
                        with_labels=False,
                        format=self.format,
                        timer=self.timer,
                        sample_dir=self.sample_dir)

        self.image_batch = pipe.connect()
        print(self.format)
        # convert channel-first batches to channel-last for the rest of the graph
        if self.format == 'NCHW':
            self.images_NHWC = tf.transpose(self.image_batch, [0, 2, 3, 1])
        else:
            self.images_NHWC = self.image_batch
        self.pipe = pipe
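The transpose converts channel-first (NCHW) batches to channel-last (NHWC); the permutation [0, 2, 3, 1] maps the axes as follows (plain TensorFlow, nothing project-specific):

import tensorflow as tf

x_nchw = tf.zeros([8, 3, 64, 64])            # [batch, channels, height, width]
x_nhwc = tf.transpose(x_nchw, [0, 2, 3, 1])  # [batch, height, width, channels]
print(x_nhwc.shape)                          # (8, 64, 64, 3)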
Example #5
import boto3
import sagemaker

region = boto3.Session().region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker.session.Session().default_bucket()

# Change these to reflect your project/business name, or to separate the
# ModelPackageGroup/Pipeline from the rest of your team's resources
model_package_group_name = "sagemaker-group-insurance"
pipeline_name = "sagemaker-pipeline-insurance"
print(role)

from pipeline import get_pipeline

pipeline = get_pipeline(
    region=region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
)

pipeline.upsert(role_arn=role)

execution = pipeline.start()
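Once started, the run can be monitored through the execution handle; a common follow-up with the standard SageMaker Pipelines API:

# Block until the run finishes, then inspect the outcome of each step.
execution.wait()
for step in execution.list_steps():
    print(step["StepName"], step["StepStatus"])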
Example #6
def create_pipeline():
    """Create pipeline."""
    pl.get_pipeline(PIPELINE_FILE)
Example #7
def get_pipeline():
    """Get the pipeline object."""
    pip = pl.get_pipeline(PIPELINE_FILE)
    pip.loglev = 'debug'
    return pip
Example #8
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor on a specified
    list of parameters.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Label type not specified", "expected 'v' or 'g'")

    if not (options["hyper-parameters"]):
        abort_clean("hyper parameters not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimize parameters

    try:
        params = load_config(options["hyper-parameters"])
    except Exception:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=params["classifier-call"],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)

        optimize_corpus = build_corpus(authors=Authors,
                                       label_type=options["label-type"],
                                       verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(estimator=pipeline,
                                     param_grid=tuning_parameters,
                                     scoring='%s_macro' % score,
                                     fit_params=None,
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     iid=True,
                                     cv=None,
                                     refit=True,
                                     verbose=options["verbosity"],
                                     error_score='raise',
                                     return_train_score=True)

        # Start optimisation
        clf_optimizer.fit(optimize_corpus["tweets"], optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        # saving results
        save_optimisation_results(grid=clf_optimizer,
                                  output_dir=options["output-dir"],
                                  score=score,
                                  verbose=options["verbosity"])
Example #9
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]) and not (options["gensim"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = None
    if not (options["gensim"]):
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross validate results
    if (options["cross-validation"]):
        if (options["verbosity"]):
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)
        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" + "-siz_" +
                               str(model[0].vector_size) + "-win_" +
                               str(model[0].window) + "-cnt_" +
                               str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(
                    get_features_extr_name(features_extr) + "+" +
                    get_classifier_name(classifier))
                save_scores(scores=scores,
                            output_dir=options["output-dir"],
                            filename=filename,
                            verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")
        if not (options["output-dir"]):
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Save the resulting model
    if options["gensim"]:
        filename = "doc2vec+" + get_classifier_name(classifier)
    else:
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

        save_model(pipeline=pipeline,
                   output_dir=options["output-dir"],
                   filename=filename,
                   verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
Example #10
def compare(options):
    '''
    Compares a set of specified classifiers on a specified dataset using
    specified features.
    Will proceed as follows:
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of features
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("label type not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers

    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [
        get_classifier(classifier_str=clf, config=None, verbose=False)
        for clf in classifier_str_list
    ]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print("    - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors

    extractors_str_list = options["features"]

    extractors = [
        get_features_extr(features_str_list=extr, verbose=False)
        for extr in extractors_str_list
    ]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print("    - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare the result storage structures

    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison

    t0 = time()
    total_iteration = len(classifiers) * len(extractors)
    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop for each pair features-extractor/classifier
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)

        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = (idx_extr) * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name + " | Classifier: " +
                      clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(features_extr=extr,
                                    classifier=clf,
                                    verbose=False)

            # Start training + cross validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except Exception:
                print("an error occurred - the extracted features and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " + str(round(time() - t0_step)) +
                      " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(scores=step_scores,
                        output_dir=individual_scores_dir,
                        filename=extr_name + "+" + clf_name,
                        verbose=False)
            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save the final micro and macro measures and the execution times
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")