Example #1
def get_lsa(config=None):
    '''
    Returns a latent semantic analysis vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default LSA setup.
    '''
    extractr_name = ""
    extractr = None

    if not (config):
        extractr_name = "lsa-default"
        extractr = TruncatedSVD( #------------------------- Default Values
            n_components=1000, #---------------------- 2
            algorithm="randomized",
            n_iter=10,
            random_state=42,
            tol=0.
        )

    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TruncatedSVD(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                "Configuring " + config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
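
A minimal sketch of the config dict this factory appears to expect; the key
names ("extractr_name", "extractr_type", "configuration") are taken from the
code above, while the values are illustrative only:

# Hypothetical config for get_lsa (values are examples, not project defaults)
lsa_config = {
    "extractr_name": "lsa-300",
    "extractr_type": "lsa",
    "configuration": {
        "n_components": 300,
        "algorithm": "randomized",
        "n_iter": 10,
        "random_state": 42,
    },
}

name, svd = get_lsa(lsa_config)  # -> ("lsa-300", TruncatedSVD(...))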
Example #2
def get_nbb(config=None):
    '''
    Returns a Naive Bayes classifier (Bernoulli implementation).
    If specified, follows the config to set up the NB classifier,
    else follows the default NB classifier setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "nbb-default"
        clf = BernoulliNB(
            alpha=1.0,
            binarize=0.0,
            fit_prior=True,
            class_prior=None)

    else:
        clf_name = config["classifier_name"]
        try:
            clf = BernoulliNB(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))
        
    return clf_name, clf
Example #3
def get_tfidf(config=None):
    '''
    Returns a tfidf vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default tfidf setup.
    '''
    extractr_name = ""
    extractr = None

    if not (config):
        extractr_name = "tfidf-default"
        extractr = TfidfTransformer(
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=False)

    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TfidfTransformer(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                "Configuring " + config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
Example #4
def get_svm(config=None):
    '''
    Returns a svm classifier.
    If specified, follows the config to set up the SVM,
    else follows the default SVM setup.
    '''
    clf_name = ""
    clf = None

    if not(config):
        clf_name = "svm-default"
        clf = LinearSVC( #---------------------------- Default Value
                    C=1.0,
                    loss='squared_hinge',
                    penalty='l1', #------------------- l2
                    dual=False, #--------------------- True
                    tol=1e-4,
                    multi_class='crammer_singer', #--- ovr
                    fit_intercept=True,
                    intercept_scaling=1,
                    class_weight=None,
                    verbose=0,
                    random_state=None,
                    max_iter=500) #-------------------- 1000 

    else:
        clf_name = config["classifier_name"]
        try:
            clf = LinearSVC(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))
        
    return clf_name, clf
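
A quick usage sketch of the default SVM on toy data (shapes and values are
made up for illustration; assumes numpy and scikit-learn are importable):

import numpy as np

name, svm = get_svm()          # ("svm-default", LinearSVC(...))
X = np.random.rand(20, 5)      # 20 samples, 5 features
y = np.array([0, 1] * 10)      # two balanced classes
svm.fit(X, y)
print(name, svm.predict(X[:3]))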
Example #5
def get_classifier(classifier_str, config=None, verbose=1):
    '''
    Returns a classifier specified in parameter
    Available classifiers are :
        - nbb : NaiveBayes (Bernoulli)
        - mlp : Multi-layered Perceptron
        - rfo : Random Forest
        - svm : Support Vector Machine

    A classifier can be specified : (TODO)
        - by its name --> a default classifier will be instantiated
        - by a path to a config file --> a custom classifier will be instantiated
    '''

    if verbose and not(config):
        print("Starting loading classifier ... ")
    if config:
        classifier_str = config["classifier_type"]
    
    #--------------------------------------------------------------------------
    # Get required classifier

    clf_name = ""
    clf = None

    if classifier_str == "svm":
        clf_name, clf = get_svm(config)
    
    elif classifier_str == "mlp":
        clf_name, clf = get_mlp(config)

    elif classifier_str == "nbb":
        clf_name, clf = get_nbb(config)

    elif classifier_str == "rfo":
        clf_name, clf = get_rfo(config)

    else:
        try: 
            config = load_config(classifier_str)
        except Exception:
            abort_clean("Cannot load the classifier configuration",
                "Either the clf name is incorrect or the path is invalid : " +
                classifier_str)

        if verbose:
            print("Loading classifier config from file")
        # recursive call with config loaded
        return get_classifier("", config, verbose=verbose)


    
    #--------------------------------------------------------------------------
    # Return classifier
    if(verbose):
        print("classifier loaded: '" + clf_name + "'\n")

    res = (clf_name, clf)
    return res
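
Both call styles described in the docstring would then look like this (the
config path below is a placeholder, not a file from the project):

# By name: instantiates the default classifier of that family.
name, clf = get_classifier("svm")

# By config file: an unknown string falls through to load_config(),
# then get_classifier recurses with the loaded config.
name, clf = get_classifier("configs/my_svm.json")  # hypothetical path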
Example #6
def load_model(filename):
    '''
    Loads a classifier (pipeline) from a file.
    '''
    # Load model
    try:
        pipe = joblib.load(filename)
    except Exception:
        abort_clean("failed to load the classifier")
    return pipe
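
The saving counterpart is not among these examples; a minimal sketch using
joblib's standard dump/load pair (the function and file names here are
illustrative, not the project's own save_model):

import joblib

def save_model_sketch(pipe, filename):
    # Persist a fitted pipeline to disk; load_model() above reads it back.
    joblib.dump(pipe, filename)

# Round trip:
# save_model_sketch(pipe, "model.pkl")
# pipe = load_model("model.pkl")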
Example #7
def get_wc2(config=None):
    '''
    Returns a word count (bigram) vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default wc2 setup.
    '''
    extractr_name = ""
    extractr = None
    tokenizr = TweetTokenizer(
        preserve_case=True,
        strip_handles=True, 
        reduce_len=False)

    if not (config):
        extractr_name = "wc2-default"
        extractr = CountVectorizer( #----------------- Default Values
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            preprocessor=None,
            tokenizer=tokenizr.tokenize, #------------ None
            ngram_range=(1, 2), #--------------------- (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2, #------------------------------- 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)

    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility
            config["configuration"]["ngram_range"] = tuple(
                config["configuration"]["ngram_range"] )
            config["configuration"]["dtype"] = np.int64
            config["configuration"]["tokenizer"] = tokenizr.tokenize

            extractr = CountVectorizer(**(config["configuration"]))
        except Exception:
            abort_clean("Features Extractor configuration failed",
                "Configuring " + config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
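
The "JSON incompatibility" adjustments exist because JSON cannot encode
tuples, numpy dtypes, or functions, so a config file carries ngram_range as a
list and omits dtype/tokenizer. What load_config() would plausibly hand back
(key names from the code above, values illustrative):

wc2_config = {
    "extractr_name": "wc2-custom",
    "extractr_type": "wc2",
    "configuration": {
        "ngram_range": [1, 2],  # list from JSON; get_wc2 converts to (1, 2)
        "min_df": 2,
        "lowercase": True,
    },
}

name, vec = get_wc2(wc2_config)  # dtype and tokenizer are re-injected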
Example #8
def get_mlp(config=None):
    '''
    Returns a Multi-Layered Perceptron classifier.
    If specified, follows the config to set up the MLP classifier,
    else follows the default MLP classifier setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "mlp-default"
        clf = MLPClassifier(
            hidden_layer_sizes=(100,),
            activation="relu",
            solver='adam',
            alpha=0.0001,
            batch_size='auto',
            learning_rate="constant",
            learning_rate_init=0.001,
            power_t=0.5,
            max_iter=200,
            shuffle=True,
            random_state=None,
            tol=1e-4,
            verbose=False,
            warm_start=False,
            momentum=0.9,
            nesterovs_momentum=True,
            early_stopping=False,
            validation_fraction=0.1,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-8)

    else:
        clf_name = config["classifier_name"]
        try:
            config["configuration"]["hidden_layer_sizes"] = tuple(
                config["configuration"]["hidden_layer_sizes"] )
            clf = MLPClassifier(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))

    return clf_name, clf
Example #9
def load_features_extr(features_str, verbose=1):
    '''
    Returns a list of vectorizers to match the specified features_str
    Available features extractors are :
        - wc2   : Word count - bigram
        - tfidf : TF-IDF
        - lsa   : Latent Semantic Analysis

    A feature extractor can be specified :
        - by its name --> a default extractor will be instantiated
        - by a path to a config file --> a custom extractor will be instantiated
    '''
    feat_extractors = []

    #--------------------------------------------------------------------------
    # Get required features_extractor

    if features_str == "wc2":
        feat_extractors.append(get_wc2(None))

    elif features_str == "tfidf":
        feat_extractors.append(get_wc2(None))
        feat_extractors.append(get_tfidf(None))

    elif features_str == "lsa":
        feat_extractors.append(get_wc2(None))
        feat_extractors.append(get_tfidf(None))
        feat_extractors.append(get_lsa(None))
    
    else:
        try:
            config = load_config(features_str)
        except Exception:
            abort_clean("Cannot load the extractors configuration",
                "Either extr name is incorrect or path is invalid : " +
                features_str)
        # Load the config from a file
        if verbose:
            print("Loading features extractor config from file ")
        feat_extractors = load_features_extr_from_file(config, verbose=verbose)

    #--------------------------------------------------------------------------
    # Return features extractors
    return feat_extractors
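
The returned list of (name, vectorizer) tuples is shaped for chaining;
get_pipeline itself is not shown in these examples, but a sketch of how the
list could feed a scikit-learn Pipeline together with a classifier:

from sklearn.pipeline import Pipeline

def get_pipeline_sketch(feat_extractors, classifier):
    # feat_extractors: [(name, transformer), ...] as returned above
    # classifier: a (name, estimator) tuple from get_classifier
    return Pipeline(list(feat_extractors) + [classifier])

# e.g. wc2 -> tfidf -> lsa -> svm:
# pipe = get_pipeline_sketch(load_features_extr("lsa"), get_classifier("svm"))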
Example #10
def get_rfo(config=None):
    '''
    Returns a Random Forest classifier.
    If specified, follows the config to set up the RF classifier,
    else follows the default RF classifier setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "rfo-default"
        clf = RandomForestClassifier(
            n_estimators=10,
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            min_impurity_split=1e-7,
            bootstrap=True,
            oob_score=False,
            n_jobs=-1, #------------------------------ 1
            random_state=None,
            verbose=0,
            warm_start=False,
            class_weight=None)

    else:
        clf_name = config["classifier_name"]
        try:
            clf = RandomForestClassifier(**(config["configuration"]))
        except Exception:
            abort_clean("Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))
        
    return clf_name, clf
Example #11
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor over a specified
    list of parameters.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Label type not specified", "expected 'v' or 'g'")

    if not (options["hyper-parameters"]):
        abort_clean("hyper parameters not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimize parameters

    try:
        params = load_config(options["hyper-parameters"])
    except Exception:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=params["classifier-call"],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)

        optimize_corpus = build_corpus(authors=Authors,
                                       label_type=options["label-type"],
                                       verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(estimator=pipeline,
                                     param_grid=tuning_parameters,
                                     scoring='%s_macro' % score,
                                     fit_params=None,
                                     n_jobs=-1,
                                     pre_dispatch='2*n_jobs',
                                     iid=True,
                                     cv=None,
                                     refit=True,
                                     verbose=options["verbosity"],
                                     error_score='raise',
                                     return_train_score=True)

        # Start optimisation
        clf_optimizer.fit(optimize_corpus["tweets"], optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        # saving results
        save_optimisation_results(grid=clf_optimizer,
                                  output_dir=options["output-dir"],
                                  score=score,
                                  verbose=options["verbosity"])
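
get_opt_parameters is not shown here; since GridSearchCV tunes a Pipeline,
the grid keys must be prefixed with the step names ("<step>__<param>"). A
hypothetical grid, with step names and values assumed for illustration:

tuning_parameters = {
    "svm-default__C": [0.1, 1.0, 10.0],
    "wc2-default__ngram_range": [(1, 1), (1, 2)],
}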
Example #12
    #   - input-dir            : input directory for tweet loading
    #   - label-type           : which labels to train on
    #   - no-cross-validation  : assess if the classifier should be cross-valid
    #   - output-dir           : output directory for resulting files
    #   - processed-tweets-dir : (legacy) directory for the parsed tweets
    #   - verbosity            : noise level on the terminal

    trainer_opt = {
        "aggregation": args.aggregation,
        "classifier": args.classifier,
        "cross-validation": args.cross_validation,
        "features": args.features,
        "gensim": args.gensim,
        "hyper-parameters": args.hyper_parameters,
        "input-dir": args.input_dir,
        "label-type": args.label_type,
        "output-dir": args.output_dir,
        "processed-tweets-dir": args.processed_tweets_dir,
        "token-level": args.token_level,
        "verbosity": args.verbosity
    }

    from act_trainer import train
    train(trainer_opt)

#------------------------------------------------------------------------------
# [Contextual] Unknown Request
else:
    abort_clean("ERROR : Unknown user request.",
                "Request found : " + usr_request)
Example #13
def evaluate(options):
    '''
    Evaluates the results of a classification in the context of PAN17
    The input directory must be structured according to PAN17 specifications
    Will proceed as follows :
        - loads the author files
        - loads the truth files (one per language)
        - compares the predicted labels with the truth
        - outputs the results
    '''
    # PAN 17 specifics
    language_dirs = get_language_dir_names()

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["truth-dir"]):
        abort_clean("truth directory not specified")

    #--------------------------------------------------------------------------
    # Load the author files
    if options["verbosity"]:
        print("Loading authors files ...")
        t0 = time()

    Authors = []
    for l_dir in language_dirs:
        l_path = format_dir_name(options["input-dir"] + l_dir)
        file_name_list = [
            f for f in listdir(l_path) if isfile(join(l_path, f))
        ]
        for file_name in file_name_list:
            auth = load_author_file(file_path=l_path + file_name,
                                    verbose=options["verbosity"] > 1)
            Authors.append(auth)

    if options["verbosity"]:
        print("Files loaded : " + str(len(Authors)))
        print("Loading author files --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Load the truth files
    if options["verbosity"]:
        print("Loading truth files ...")
        t0 = time()

    truth = dict()
    for lang in language_dirs:
        lang_dir = format_dir_name(options["truth-dir"] + lang)
        try:
            truth_file = open(lang_dir + "truth.txt")
        except Exception:
            abort_clean("Can't open truth file",
                        "Couldn't open " + lang_dir + "truth.txt")

        truth_lines = [x.strip().split(':::') for x in truth_file.readlines()]
        attrs = dict()
        for l in truth_lines:
            attrs[l[0]] = {"gender": l[1], "variety": l[2]}

        truth[lang] = attrs

    if options["verbosity"]:
        print("Files loaded : " + str(len(truth)))
        print("Loading truth files --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Compute results
    if options["verbosity"]:
        print("Computing results ...")
        t0 = time()

    # preparing result data-structure
    results = dict()
    for lang in language_dirs:
        var_labels = get_variety_labels(lang)
        var_confusion_matrix = [[0 for x in var_labels] for y in var_labels]
        gdr_labels = get_gender_labels()
        gdr_confusion_matrix = [[0 for x in gdr_labels] for y in gdr_labels]
        results[lang] = {
            "n_files": 0,
            "gdr-labels": gdr_labels,
            "gdr-confusion-matrix": gdr_confusion_matrix,
            "gdr-positive-eval": 0,
            "var-labels": var_labels,
            "var-confusion-matrix": var_confusion_matrix,
            "var-positive-eval": 0
        }

    # Starting computation
    for auth in Authors:
        lang_res = results[auth["lang"]]
        auth_truth = truth[auth["lang"]][auth["id"]]

        results[auth["lang"]]["n_files"] += 1

        auth_gdr_eval = auth_truth["gender"] == auth["gender"]
        auth_var_eval = auth_truth["variety"] == auth["variety"]

        var_labels = lang_res["var-labels"]
        lang_res["var-confusion-matrix"][var_labels.index(
            auth_truth["variety"])][var_labels.index(auth["variety"])] += 1
        gdr_labels = lang_res["gdr-labels"]
        lang_res["gdr-confusion-matrix"][gdr_labels.index(
            auth_truth["gender"])][gdr_labels.index(auth["gender"])] += 1

        results[auth["lang"]]["gdr-positive-eval"] += 1 if auth_gdr_eval else 0
        results[auth["lang"]]["var-positive-eval"] += 1 if auth_var_eval else 0

    if options["verbosity"]:
        print("Computing results --- success in %.3f seconds\n" %
              (time() - t0))

    #--------------------------------------------------------------------------
    # Save results

    save_evaluation_results(results=results,
                            input_dir=options["input-dir"],
                            output_dir=options["output-dir"],
                            verbose=options["verbosity"])
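
From the results structure above, the headline accuracy per language and task
is just the positive-evaluation count over the file count; a small sketch
(field names taken from the code above):

for lang, res in results.items():
    if res["n_files"]:
        print("%s : gender acc=%.3f | variety acc=%.3f" % (
            lang,
            res["gdr-positive-eval"] / float(res["n_files"]),
            res["var-positive-eval"] / float(res["n_files"])))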
Example #14
def train_model_cross_validation(authors, label_type, pipeline, verbose=1):
    '''
    Takes a pipeline and trains it on the specified corpus.
    Processes a cross-validation algorithm (K-fold) in order to evaluate the
    quality of the model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''

    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)

    if not (labels):
        abort_clean("Could not extract labels")
    if verbose:
        print("Labels extraction succeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    scores_micro = []
    scores_macro = []

    # start Kfold cross validation.
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)
    for train_indices, test_indices in k_fold.split(authors):

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        # test model
        truthes = []
        predictions = []
        for author in test_authors:
            var_classes, var_predictions = predict_author_proba(author=author,
                                                                model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="micro")
        score_macro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="macro")

        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) +
                  " macrof1=" + str(score_macro))

        # store for avg
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_pipeline, scores
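
predict_author_proba is not among these examples; judging from its call
sites, it returns the class list and one aggregated probability per class
over all of an author's tweets. A hypothetical reconstruction:

import numpy as np

def predict_author_proba_sketch(author, model):
    # Assumed behaviour: average the per-tweet class probabilities of a
    # fitted model over one author (the real implementation may differ).
    probas = model.predict_proba(author["tweets"])  # (n_tweets, n_classes)
    return list(model.classes_), list(np.mean(probas, axis=0))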
Example #15
def train(inputPath, splitsPath, outputPath, verbosity_level=1):
    '''
    For each language, proceeds as follows:
        - takes as input the corresponding .pkl file
        - trains a text-based classifier on the 80% split
        - saves the resulting model in outputPath

    :param inputPath:  Path to PAN18 dataset
    :param splitsPath: Path to dir containing the .pkl files produced by 'splitting.py'
    :param outputPath: Path to dir in which the output models will be saved
        NB. Create the outputPath directory before using this function
    '''

    for lang in ['ar', 'en', 'es']:

        input_dir = join(inputPath, lang)
        output_dir = join(outputPath, lang)

        #print("input_dir ", input_dir)
        #print("output_dir ", output_dir)

        if exists(output_dir):
            rmtree(output_dir)
        makedirs(output_dir)

        # --------------------------------------------------------------------------
        # Load the .pkl file
        with open(splitsPath + "/" + lang + ".pkl", 'rb') as f:
            dic = load(f)
        # Load the tweets in one language
        Authors = parse_tweets_from_dir(input_dir=inputPath + "/" + lang + "/",
                                        label=True,
                                        aggregation=100,
                                        splitDic=dic,
                                        verbosity_level=verbosity_level)

        if not (Authors):
            abort_clean("Tweets loading failed")

        # --------------------------------------------------------------------------
        # Load the classifier

        t0 = time()
        classifier = get_classifier(classifier_str="svm",
                                    config=None,
                                    verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Load the features extractors

        features_extr = None
        features_extr = get_features_extr(features_str_list="tfidf",
                                          language=lang,
                                          verbose=verbosity_level)
        # --------------------------------------------------------------------------
        # Build the execution pipeline

        pipeline = get_pipeline(features_extr=features_extr,
                                classifier=classifier,
                                verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # Train the execution pipeline

        # train and cross validate results
        if (verbosity_level):
            print("Model Training with cross validation\n")
        pipeline, scores, best_train_indices, best_test_indices = train_model_cross_validation(
            authors=Authors,
            label_type="gender",
            pipeline=pipeline,
            verbose=verbosity_level)

        if verbosity_level:
            print_scores(scores)

        save_scores(scores=scores,
                    output_dir=output_dir + "/",
                    filename=lang,
                    verbose=verbosity_level)

        #--------------------------------------------------------------------------
        # Save the resulting model
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

        # build train corpus
        authors = array(Authors)
        train_authors = authors[best_train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type='gender',
                                    verbosity=verbosity_level)
        # build test corpus
        test_authors = authors[best_test_indices]

        # train model
        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=0)

        save_model(pipeline=pipeline,
                   output_dir=output_dir + "/",
                   filename=filename,
                   verbose=verbosity_level)

        # --------------------------------------------------------------------------
        # End Execution
        if verbosity_level:
            print("Training task complete in " + str(round(time() - t0)) +
                  " s")
Example #16
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified 
    feature extractors.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]) and not (options["gensim"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier

    t0 = time()
    classifier = get_classifier(classifier_str=options["classifier"][0],
                                config=None,
                                verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors

    features_extr = None
    if not (options["gensim"]):
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline

    pipeline = get_pipeline(features_extr=features_extr,
                            classifier=classifier,
                            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross validate results
    if (options["cross-validation"]):
        if (options["verbosity"]):
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)
        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" + "-siz_" +
                               str(model[0].vector_size) + "-win_" +
                               str(model[0].window) + "-cnt_" +
                               str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(
                    get_features_extr_name(features_extr) + "+" +
                    get_classifier_name(classifier))
            # save the scores in both the gensim and standard cases
            save_scores(scores=scores,
                        output_dir=options["output-dir"],
                        filename=filename,
                        verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")
        if not (options["output-dir"]):
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(authors=Authors,
                                    label_type=options["label-type"],
                                    verbosity=options["verbosity"])

        pipeline = train_model(corpus=train_corpus,
                               pipeline=pipeline,
                               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Save the resulting model
    if options["gensim"]:
        filename = "doc2vec+" + get_classifier_name(classifier)
    else:
        filename = str(
            get_features_extr_name(features_extr) + "+" +
            get_classifier_name(classifier))

    save_model(pipeline=pipeline,
               output_dir=options["output-dir"],
               filename=filename,
               verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
Example #17
def classify(options):
    '''
    Classifies a dataset respecting the PAN'17 specification.
    Predicts both language variety and gender.
    Will proceed as follows :
        - loads the dataset
        - [contextual] for each subdirectory of the dataset, loads the related
            classifiers
        - predicts the different labels for each author within the loaded 
            corpus
        - outputs the result files
        - [contextual] checks its results
    '''

    #----------------------------------------------------------------------
    # Checking basic requirements
    if not (options["classification-type"]
            and options["classification-type"] in ["loose", "successive"]):
        abort_clean("Classification type incorrectly specified")

    if options["verbosity"]:
        print('Classification type is ' + options["classification-type"])

    # PAN 17 specifics
    languages = get_language_dir_names()
    for lang in languages:

        if options["verbosity"]:
            print('---------------------------------------')
            print("Language up for classification: '" + lang + "'\n")

        processed_tweets_dir = (
            "" if not (options["processed-tweets-dir"]) else
            format_dir_name(options["processed-tweets-dir"] + lang))
        classifier_dir_path = format_dir_name(options["classifiers-dir"] +
                                              lang)
        output_subdir_path = format_dir_name(options["output-dir"] + lang)

        #----------------------------------------------------------------------
        # Load the tweets
        Authors = parse_tweets_from_dir(
            input_dir=format_dir_name(options["input-dir"] + lang),
            output_dir=processed_tweets_dir,
            label=False,
            aggregation=options["aggregation"],
            verbosity_level=options["verbosity"])

        if not (Authors):
            abort_clean("Tweets loading failed")

        #----------------------------------------------------------------------
        # Load the classifiers
        classifiers = load_classifiers(
            classifier_dir_path=classifier_dir_path,
            classification_type=options["classification-type"],
            verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Start classification
        if options["verbosity"]:
            print("Starting authors classification ...")
            t0 = time()

        classify_authors(Authors, classifiers, options["classification-type"],
                         options["verbosity"])

        if options["verbosity"] > 1:
            for auth in Authors:
                print(auth["id"] + ":::" + auth["gender"] + ":::" +
                      auth["variety"])

        if options["verbosity"]:
            print("Classification of '" + lang + "' complete in %.3f seconds" %
                  (time() - t0))
            print('---------------------------------------\n')

        create_dir(output_subdir_path)
        for auth in Authors:
            save_author_file(author=auth,
                             output_dir=output_subdir_path,
                             verbose=options["verbosity"] > 1)

        # for memory issues, free the classifiers objects
        gc.collect()
Example #18
def train_model_gensim_cross_validation(authors,
                                        label_type,
                                        pipeline,
                                        config="",
                                        token_level="word",
                                        verbose=1):
    '''
    Takes a doc2vec model and trains it on the specified corpus.
    Takes a classifier and trains it on the doc2vec model vectors.
    Processes a cross-validation algorithm (K-fold) in order to evaluate the
    quality of the overall model.
    Returns the best trained pipeline (in terms of macro f-score).
    '''
    labels = get_labels(lang=authors[0]["lang"], label_type=label_type)

    if not (labels):
        abort_clean("Could not extract labels")

    if verbose:
        print("Labels extraction succeded.")
        print("Available labels : " + " / ".join(labels) + "\n")

    if verbose:
        t0 = time()
        print("Starting model Cross Validation ... (this may take some time)")

    # load doc2vec conf
    conf = []
    if config:
        conf = load_config(config)["extractors"][0]  # legacy conf files
        if verbose:
            print("loading doc2vec config file from disk :")
            print("  - vector_size = " +
                  str(conf["configuration"]["vector_size"]))
            print("  - window      = " + str(conf["configuration"]["window"]))
            print("  - min_count   = " +
                  str(conf["configuration"]["min_count"]))

    # load the tokenizer
    tknzr = Tokenizer(token_level)
    if verbose:
        print("Selected token level : " + token_level + "\n")

    # Kfold parameters.
    confusion = array([[0 for x in range(len(labels))]
                       for y in range(len(labels))])
    scores = []
    best_f_score = 0
    best_pipeline = None
    best_model = None
    scores_micro = []
    scores_macro = []
    n_run = 1
    k_fold = KFold(n_splits=10, shuffle=True)
    authors = array(authors)

    # start Kfold cross validation.
    for train_indices, test_indices in k_fold.split(authors):

        # import gensim lib (heavy load)
        from gensim import models as gensim_models

        # get doc2vec model
        model_dm = get_doc2vec(conf, 1, verbose)
        model_pv = get_doc2vec(conf, 0, verbose)

        # build train corpus
        train_authors = authors[train_indices]
        train_corpus = build_corpus(authors=train_authors,
                                    label_type=label_type,
                                    verbosity=verbose)

        # build test corpus
        test_authors = authors[test_indices]

        # learn the vocabulary (tokenisation of each tweet)
        tweets = list(zip(train_corpus["labels"], train_corpus["tweets"]))
        processed_tweets = []
        idxs = [0 for l in labels]
        for t in tweets:
            prefix = t[0] + "_" + str(idxs[labels.index(t[0])])
            idxs[labels.index(t[0])] += 1
            processed_tweets.append(
                gensim_models.doc2vec.LabeledSentence(
                    words=tknzr.tokenize(t[1]), tags=[prefix]))
        tweets = processed_tweets
        model_dm.build_vocab(tweets)
        model_pv.build_vocab(tweets)

        # train doc2vec model
        shuffle(tweets)
        model_dm.train(sentences=tweets,
                       total_examples=model_dm.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_dm.delete_temporary_training_data()
        model_pv.train(sentences=tweets,
                       total_examples=model_pv.corpus_count,
                       epochs=100,
                       start_alpha=0.025,
                       end_alpha=0.0025)
        model_pv.delete_temporary_training_data()

        # train dataset conversion (doc->vectors)
        train_vectors = zeros((sum(idxs), model_dm.vector_size * 2))
        train_labels = []
        for i, tag in enumerate(model_dm.docvecs.doctags):
            train_vectors[i] = concatenate(
                (model_dm.docvecs[tag], model_pv.docvecs[tag]), axis=0)
            train_labels.append(tag.split('_')[0])
        train_labels = array(train_labels)

        # train classifier
        pipeline.fit(train_vectors, train_labels)

        # test models
        truthes = []
        predictions = []
        for author in test_authors:
            # test dataset conversion (doc->vectors)
            tweet_vectors = [
                concatenate((model_dm.infer_vector(tknzr.tokenize(tweet)),
                             model_pv.infer_vector(tknzr.tokenize(tweet))),
                            axis=0) for tweet in author["tweets"]
            ]

            author_tmp = {"tweets": tweet_vectors}
            var_classes, var_predictions = predict_author_proba(
                author=author_tmp, model=pipeline)
            var_max_idx = var_predictions.index(max(var_predictions))
            label_predicted = var_classes[var_max_idx]
            predictions.append(label_predicted)
            truthes.append(author[label_type])

        # compute metrics
        confusion += confusion_matrix(truthes, predictions, labels=labels)
        score_micro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="micro")
        score_macro = f1_score(truthes,
                               predictions,
                               labels=labels,
                               average="macro")

        if verbose:
            print("Fold " + str(n_run) + " : micro_f1=" + str(score_micro) +
                  " macrof1=" + str(score_macro))

        # store for avg
        scores_micro.append(score_micro)
        scores_macro.append(score_macro)
        n_run += 1

        # save the pipeline if better than the current one
        if score_macro > best_f_score:
            best_model = [model_dm, model_pv]
            best_pipeline = clone(pipeline, True)
            best_f_score = score_macro

    if verbose:
        print("Model Cross Validation complete in %.3f seconds.\n" %
              (time() - t0))

    scores = {
        "mean_score_micro": sum(scores_micro) / len(scores_micro),
        "mean_score_macro": sum(scores_macro) / len(scores_macro),
        "confusion_matrix": confusion,
        "best_macro_score": best_f_score,
        "labels": labels
    }

    return best_model, best_pipeline, scores
Example #19
def parse_tweets_from_dir_2(input_dir,
                            list_authors,
                            output_dir=None,
                            label=True,
                            aggregation=100,
                            verbosity_level=1):
    '''
    Parses all the xml files directly in the input_dir (no recursion).
    Retrieves the attributes of the author stored in the truth file.
    If specified, the parsed files will be written into the output_dir
    Verbosity level specifies the amount of content displayed:
        0- nothing
        1- Main steps
        2- Files parsed and stats about filtering / tweets available per class
        3- All the parsed content.
    Returns a list containing all the author objects contained within the
    input_dir
    '''
    # vars
    Authors = []
    t0 = time()
    n_files = 0
    n_files_parsed = 0
    n_files_filtered = 0
    n_files_infos_retrieved = 0
    ret = '\n'

    # ---------------------------- FILES LISTING

    if verbosity_level:
        t0 = time()
        print("Starting files Listing ...")
    try:
        xml_files = [
            f for f in listdir(input_dir + "/text")
            if (isfile(join(input_dir + "/text", f)) and f[-4:] == ".xml")
            and f[:-4] in list_authors  # strip(".xml") would drop chars, not the suffix
        ]
    except Exception:
        abort_clean("Files listing --- failure",
                    "Maybe the directory specified is incorrect?")

    if verbosity_level:
        print("Files found : " + str(len(xml_files)))
        print("Files listing --- success in %.3f seconds\n" % (time() - t0))

    # ---------------------------- FILES PROCESSING
    if verbosity_level:
        t0 = time()
        print("Starting files processing ...")

    n_files = len(xml_files)

    if output_dir:
        create_dir(output_dir)

    for f in xml_files:

        author = None
        tweets = []
        save_file = output_dir + f if output_dir else None
        try:
            author = parse_file(file_to_parse=input_dir + "/text/" + f,
                                aggregation=aggregation,
                                file_to_save=save_file,
                                verbose=verbosity_level > 2)
        except Exception:
            if verbosity_level > 1:
                print("   Parsing file : " + f + " --- failure")
            continue

        if verbosity_level > 1:
            print("   Parsing file : " + f + " --- success")
        n_files_parsed += 1
        author["id"] = f[:-4]
        Authors.append(author)

    if verbosity_level:
        print("Parsed files : " + str(n_files_parsed) + " out of " +
              str(n_files))
        print("Files Parsing --- success in %.3f seconds\n" % (time() - t0))

    # ---------------------------- AUTHOR ATTRIBUTES RETRIEVING

    if label:
        if verbosity_level:
            t0 = time()
            print("Starting Author Attributes Retrieval ...")
        try:
            truth_file = open(input_dir + "/" + author['lang'] + ".txt")
        except Exception:
            abort_clean("Author Attributes Retrieval --- failure",
                        "Couldn't open truth file")

        truth_lines = [x.strip().split(':::') for x in truth_file.readlines()]
        attrs = dict()
        for l in truth_lines:
            attrs[l[0]] = l[1:]

        for idx, author in enumerate(Authors):
            author["gender"] = attrs[author["id"]][0]
            # author["variety"] = attrs[author["id"]][1]

            if author["gender"]:
                n_files_infos_retrieved += 1

            if verbosity_level > 1:
                print("   author " + Authors[idx]["id"] +
                      " information retrieved : Gender=" +
                      Authors[idx]["gender"] + " Language=" +
                      Authors[idx]["lang"])

        if verbosity_level > 1:
            print("Retreived Information : " + str(n_files_infos_retrieved) +
                  " out of " + str(n_files))
            print("Author Attributes Retrieval --- success in %.3f seconds\n" %
                  (time() - t0))

    # ---------------------------- TWEET FILTERING
    if verbosity_level:
        t0 = time()
        print("Starting Tweets Filtering ...")

    for author in Authors:

        if verbosity_level > 1:
            print("   author " + author["id"] + " filtering")

        try:
            author["tweets"] = filter_tweets(author, verbosity_level > 1)
        except Exception:
            continue

        n_files_filtered += 1

    if verbosity_level:
        print("Filtered files : " + str(n_files_filtered) + " out of " +
              str(n_files))
        print("Tweets Filtering --- success in %.3f seconds\n" % (time() - t0))

    # ---------------------------- RETURNING PROCESSED DATA

    if verbosity_level:
        print("Tweets available : " +
              str(sum([len(a["tweets"]) for a in Authors])) + "\n")

    return Authors
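
The truth files parsed here (and in evaluate above) are ':::'-separated, one
author per line; an illustrative line and its parse (the id and labels are
made up):

line = "4fbe9d8dbd3eb0f0c6e8a45b92c0eb8a:::female:::canada"
author_id, gender, variety = line.strip().split(":::")
# In the PAN18 gender-only variant used above, a line has just two fields:
# "<author_id>:::<gender>", hence attrs[l[0]] = l[1:].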
Example #20
def compare(options):
    '''
    Compares a set of specified classifiers on a specified dataset using
    specified features.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of features
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("label type not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(input_dir=options["input-dir"],
                                    output_dir=options["processed-tweets-dir"],
                                    label=True,
                                    aggregation=options["aggregation"],
                                    verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers

    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [
        get_classifier(classifier_str=clf, config=None, verbose=False)
        for clf in classifier_str_list
    ]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print("    - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors

    extractors_str_list = options["features"]

    extractors = [
        get_features_extr(features_str_list=extr, verbose=False)
        for extr in extractors_str_list
    ]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print("    - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare result data structures

    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison

    t0 = time()
    total_iteration = len(classifiers) * len(extractors)
    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop for each pair features-extractor/classifier
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)

        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = (idx_extr) * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name + " | Classifier: " +
                      clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(features_extr=extr,
                                    classifier=clf,
                                    verbose=False)

            # Start training + cross validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except Exception:
                print("some error occurred - the extracted features and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " + str(round(time() - t0_step)) +
                      " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(scores=step_scores,
                        output_dir=individual_scores_dir,
                        filename=extr_name + "+" + clf_name,
                        verbose=False)
            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save final micro and macro measures and execution time
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")