Example #1
def get_lsa(config=None):
    '''
    Returns a latent semantic analysis vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default LSA setup.
    '''
    extractr_name = ""
    extractr = None

    if not (config):
        extractr_name = "lsa-default"
        extractr = TruncatedSVD(  #------------------------- Default Values
            n_components=1000,  #---------------------- 2
            algorithm="randomized",
            n_iter=10,
            random_state=42,
            tol=0.)

    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TruncatedSVD(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Features Extractor configuration failed", "Configuring " +
                config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
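A hypothetical usage sketch (the config dict below is illustrative, and TruncatedSVD is assumed to be imported from sklearn.decomposition in this module):

# Illustrative calls, not part of the original module:
name, svd = get_lsa()  # -> ("lsa-default", TruncatedSVD(n_components=1000, ...))
name, svd = get_lsa({
    "extractr_name": "lsa-small",  # hypothetical config values
    "extractr_type": "lsa",
    "configuration": {"n_components": 100, "random_state": 42}})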
Example #2
def get_nbb(config=None):
    '''
    Returns a Naive Bayes classifier (Bernoulli implementation).
    If specified, follows the config to set up the NB classifier,
    else follows the default NB classifier setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "nbb-default"
        clf = BernoulliNB(alpha=1.0,
                          binarize=.0,
                          fit_prior=True,
                          class_prior=None)

    else:
        clf_name = config["classifier_name"]
        try:
            clf = BernoulliNB(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))

    return clf_name, clf
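The config schema below is assumed from the keys the function reads; a sketch of a custom setup:

# Hypothetical config dict, mirroring the keys get_nbb reads:
nbb_config = {
    "classifier_name": "nbb-custom",
    "classifier_type": "nbb",
    "configuration": {"alpha": 0.5, "fit_prior": False}}
clf_name, clf = get_nbb(nbb_config)  # -> ("nbb-custom", BernoulliNB(alpha=0.5, ...))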
Example #3
def get_tfidf(config=None):
    '''
    Returns a TF-IDF vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default tfidf setup.
    '''
    extractr_name = ""
    extractr = None

    if not (config):
        extractr_name = "tfidf-default"
        extractr = TfidfTransformer(norm='l2',
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True)

    else:
        extractr_name = config["extractr_name"]
        try:
            # extract parameters from the config json file
            use_idf = config["configuration"]["use_idf"]
            smooth_idf = config["configuration"]["smooth_idf"]
            sublinear_tf = config["configuration"]["sublinear_tf"]
            extractr = TfidfTransformer(norm='l2',
                                        use_idf=bool(use_idf),
                                        smooth_idf=bool(smooth_idf),
                                        sublinear_tf=bool(sublinear_tf))
        except Exception:
            abort_clean(
                "Features Extractor configuration failed", "Configuring " +
                config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
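Note that TfidfTransformer expects a term-count matrix rather than raw text, so it is chained after a count vectorizer (as load_features_extr does below for "tfidf"). A minimal sketch, assuming a list of strings called corpus:

# Sketch: counts first, tf-idf weighting second.
_, counter = get_wc2()
_, tfidf = get_tfidf()
# X_counts = counter.fit_transform(corpus)
# X_tfidf = tfidf.fit_transform(X_counts)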
Example #4
def get_classifier(classifier_str, config=None, verbose=1):
    '''
    Returns the classifier specified in parameter.
    Available classifiers are :
        - nbb : Naive Bayes (Bernoulli)
        - mlp : Multi-layer Perceptron
        - rfo : Random Forest
        - svm : Support Vector Machine

    A classifier can be specified : (TODO)
        - by its name --> a default classifier will be instantiated
        - by a path to a config file --> a custom classifier will be instantiated
    '''

    if verbose and not (config):
        print("Starting loading classifier ... ")
    if config:
        classifier_str = config["classifier_type"]

    #--------------------------------------------------------------------------
    # Get required classifier

    clf_name = ""
    clf = None

    if classifier_str == "svm":
        clf_name, clf = get_svm(config)

    elif classifier_str == "mlp":
        clf_name, clf = get_mlp(config)

    elif classifier_str == "nbb":
        clf_name, clf = get_nbb(config)

    elif classifier_str == "rfo":
        clf_name, clf = get_rfo(config)

    else:
        try:
            config = load_config(classifier_str)
        except Exception:
            abort_clean(
                "Cannot load the classifier configuration",
                "Either the clf name is incorrect or the path is invalid : " +
                classifier_str)

        if verbose:
            print("Loading classifier config from file")
        # recursive call with config loaded
        return get_classifier("", config, verbose=verbose)

    #--------------------------------------------------------------------------
    # Return classifier
    if verbose:
        print("classifier loaded: '" + clf_name + "'\n")

    res = (clf_name, clf)
    return res
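Dispatch sketch: a known short name yields a default classifier, while any other string is treated as a path to a config file and handled by the recursive call (the path below is illustrative):

clf_name, clf = get_classifier("svm")  # default calibrated LinearSVC
# clf_name, clf = get_classifier("configs/mlp.json")  # loads config, then recurses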
Example #5
def load_model(filename):
    '''
    Loads a classifier (pipeline) from a file.
    '''
    # Load model
    try:
        pipe = joblib.load(filename)
    except Exception as e:
        abort_clean("failed to load the classifier: " + str(e))
    return pipe
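load_model is the counterpart of a joblib.dump call made at training time; a sketch with an illustrative path:

# Assuming training saved the pipeline with joblib.dump(pipe, path):
pipe = load_model("models/en/svm-default.pkl")  # illustrative path
# predictions = pipe.predict(documents)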
Example #6
def get_wc2(config=None):
    '''
    Returns a word count (bigram) vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default wc2 setup.
    '''
    extractr_name = ""
    extractr = None
    tokenizr = TweetTokenizer(preserve_case=True,
                              strip_handles=True,
                              reduce_len=False)

    if not (config):
        extractr_name = "wc2-default"
        extractr = CountVectorizer(  #----------------- Default Values
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            preprocessor=None,
            tokenizer=tokenizr.tokenize,  #------------ None
            ngram_range=(1, 2),  #--------------------- (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2,  #------------------------------- 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)

    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility
            config["configuration"]["ngram_range"] = tuple(
                config["configuration"]["ngram_range"])
            config["configuration"]["dtype"] = np.int64
            config["configuration"]["tokenizer"] = tokenizr.tokenize

            extractr = CountVectorizer(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Features Extractor configuration failed", "Configuring " +
                config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
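The JSON adjustments above exist because tuples, NumPy dtypes, and callables cannot be expressed in JSON, so they are patched in after loading. An assumed config shape:

# Hypothetical config: ngram_range is a list in the JSON file and becomes
# a tuple above; dtype and tokenizer are injected after loading.
wc2_config = {
    "extractr_name": "wc2-trigram",
    "extractr_type": "wc2",
    "configuration": {"ngram_range": [1, 3], "min_df": 2, "lowercase": True}}
name, vec = get_wc2(wc2_config)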
Example #7
def load_features_extr(features_str, language=None, verbose=1):
    '''
    Returns a list of vectorizers matching the specified features_str.
    Available features extractors are :
        - wc2   : word count - bigram
        - char_word_ngrams : char and/or word n-grams
        - tfidf : TF-IDF
        - lsa   : Latent Semantic Analysis

    A feature extractor can be specified :
        - by its name --> a default extractor will be instantiated
        - by a path to a config file --> a custom extractor will be instantiated
    '''
    feat_extractors = []

    #--------------------------------------------------------------------------
    # Get required features_extractor

    if features_str == "wc2":
        feat_extractors.append(get_wc2(language=language, config=None))
    if features_str == "char_word_ngrams":
        feat_extractors.append(get_char_words_ngrams(None))
    elif features_str == "tfidf":
        feat_extractors.append(get_wc2(language=language, config=None))
        feat_extractors.append(get_tfidf(None))
    elif features_str == "tfidfv2":
        feat_extractors.append(get_char_words_ngrams(None))
        feat_extractors.append(get_tfidf(None))
    elif features_str == "lsa":
        feat_extractors.append(get_wc2(None))
        feat_extractors.append(get_tfidf(None))
        feat_extractors.append(get_lsa(None))
    else:
        try:
            config = load_config(features_str)
        except Exception:
            abort_clean(
                "Cannot load the extractors configuration",
                "Either the extractor name is incorrect or the path is invalid : " +
                features_str)
        # Load the config from a file
        if verbose:
            print("Loading features extractor config from file ")
        feat_extractors = load_features_extr_from_file(config, verbose=verbose)

    #--------------------------------------------------------------------------
    # Return features extractors
    return feat_extractors
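The returned (name, vectorizer) pairs have exactly the shape sklearn's Pipeline expects for its steps, so a full model can be assembled directly; a sketch (the chaining itself is not shown in this module):

from sklearn.pipeline import Pipeline

extractors = load_features_extr("lsa")  # [(name, vectorizer), ...]
clf_name, clf = get_classifier("svm", verbose=0)
pipe = Pipeline(extractors + [(clf_name, clf)])
# pipe.fit(train_docs, train_labels)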
Example #8
def get_mlp(config=None):
    '''
    Returns a Multi-Layer Perceptron classifier.
    If specified, follows the config to set up the MLP classifier,
    else follows the default MLP classifier setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "mlp-default"
        clf = MLPClassifier(hidden_layer_sizes=(100, ),
                            activation="relu",
                            solver='adam',
                            alpha=0.0001,
                            batch_size='auto',
                            learning_rate="constant",
                            learning_rate_init=0.001,
                            power_t=0.5,
                            max_iter=200,
                            shuffle=True,
                            random_state=None,
                            tol=1e-4,
                            verbose=False,
                            warm_start=False,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            early_stopping=False,
                            validation_fraction=0.1,
                            beta_1=0.9,
                            beta_2=0.999,
                            epsilon=1e-8)

    else:
        clf_name = config["classifier_name"]
        try:
            # JSON has no tuples: hidden_layer_sizes arrives as a list
            config["configuration"]["hidden_layer_sizes"] = tuple(
                config["configuration"]["hidden_layer_sizes"])
            clf = MLPClassifier(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))

    return clf_name, clf
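An assumed config for a custom network, showing the list that gets converted to a tuple above:

# Hypothetical config for a two-layer network:
mlp_config = {
    "classifier_name": "mlp-2x50",
    "classifier_type": "mlp",
    "configuration": {"hidden_layer_sizes": [50, 50], "max_iter": 300}}
clf_name, clf = get_mlp(mlp_config)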
Example #9
def get_rfo(config=None):
    '''
    Returns a Random Forest classifier.
    If specified, follows the config to set up the classifier,
    else follows the default Random Forest setup.
    '''
    clf_name = ""
    clf = None

    if not (config):
        clf_name = "rfo-default"
        clf = RandomForestClassifier(
            n_estimators=10,
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            min_impurity_split=1e-7,
            bootstrap=True,
            oob_score=False,
            n_jobs=-1,  #------------------------------ 1
            random_state=None,
            verbose=0,
            warm_start=False,
            class_weight=None)

    else:
        clf_name = config["classifier_name"]
        try:
            clf = RandomForestClassifier(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))

    return clf_name, clf
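The default forest differs from sklearn's own defaults mainly by n_jobs=-1 (use all cores), as the inline comment notes; a sketch:

clf_name, clf = get_rfo()  # -> ("rfo-default", RandomForestClassifier(n_jobs=-1, ...))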
Example #10
def get_svm(config=None):
    '''
    Returns an SVM classifier.
    If specified, follows the config to set up the SVM,
    else follows the default SVM setup.
    '''
    clf_name = ""
    clf = None
    from sklearn.calibration import CalibratedClassifierCV
    if not (config):
        clf_name = "svm-default"
        clf = CalibratedClassifierCV(
            LinearSVC(  #---------------------------- Default Value
                C=1.0,
                loss='squared_hinge',
                penalty='l1',  #------------------- l2
                dual=False,  #--------------------- True
                tol=1e-4,
                multi_class='crammer_singer',  #--- ovr
                fit_intercept=True,
                intercept_scaling=1,
                class_weight=None,
                verbose=0,
                random_state=None,
                max_iter=500))  #-------------------- 1000

    else:
        clf_name = config["classifier_name"]
        try:
            clf = CalibratedClassifierCV(
                LinearSVC(**(config["configuration"])))
        except Exception:
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + config["classifier_type"] + " with : " +
                str(config["configuration"]))

    return clf_name, clf
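Wrapping LinearSVC in CalibratedClassifierCV matters because LinearSVC has no predict_proba; the calibrated wrapper adds probability estimates, which per-gender scores like those returned by predict below would presumably need. A sketch:

clf_name, clf = get_svm()
# clf.fit(X_train, y_train)
# clf.predict_proba(X_test)  # e.g. [[femaleScore, maleScore], ...]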
Example #11
    def __init__(self,
                 min_char_ngrams=3,
                 max_char_ngrams=5,
                 min_word_ngrams=1,
                 max_word_ngrams=2):
        '''
        Default values are taken from the PAN17 winning paper.
        '''
        self.onlyWords = (min_char_ngrams == 0) and (max_char_ngrams == 0)
        self.onlyChar = (min_word_ngrams == 0) and (max_word_ngrams == 0)
        if self.onlyWords and self.onlyChar:
            abort_clean("Min and/or Max wrong in ngram configuration file.")
        if (min_char_ngrams < 0 or max_char_ngrams < 0
                or min_word_ngrams < 0 or max_word_ngrams < 0):
            abort_clean("Min and/or Max wrong in ngram configuration file.")
        if (max_char_ngrams < min_char_ngrams
                or max_word_ngrams < min_word_ngrams):
            abort_clean("Min and/or Max wrong in ngram configuration file.")

        self.min_char_ngrams = min_char_ngrams
        self.max_char_ngrams = max_char_ngrams
        self.min_word_ngrams = min_word_ngrams
        self.max_word_ngrams = max_word_ngrams
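A few calls the validation above accepts or rejects (a sketch; Ngram_extractor is the class this __init__ belongs to):

Ngram_extractor(3, 5, 1, 2)    # ok: char 3-5 grams plus word 1-2 grams (PAN17 defaults)
Ngram_extractor(0, 0, 1, 2)    # ok: onlyWords, char n-grams disabled
# Ngram_extractor(0, 0, 0, 0)  # aborts: both char and word n-grams disabled
# Ngram_extractor(5, 3, 1, 2)  # aborts: max_char_ngrams < min_char_ngrams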
Example #12
def get_char_words_ngrams(config=None):
    '''
    Returns a vectorizer based on character and/or word n-grams.
    If specified, follows the config to set up the vectorizer
    (min_char_ngrams, max_char_ngrams, min_word_ngrams, max_word_ngrams),
    else follows the default char_words_ngrams setup (3-5 char, 1-2 word).
    '''
    extractr_name = ""
    extractr = None
    if not (config):
        extractr_name = "char_words_ngrams"
        min_char_ngrams = 3
        max_char_ngrams = 5
        min_word_ngrams = 1
        max_word_ngrams = 2
        ngram_extractor = Ngram_extractor(int(min_char_ngrams),
                                          int(max_char_ngrams),
                                          int(min_word_ngrams),
                                          int(max_word_ngrams))
        extractr = CountVectorizer(  #----------------- Default Values
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer=ngram_extractor.analyzer,
            preprocessor=None,
            tokenizer=None,
            ngram_range=(1, 1),  # ignored: analyzer is a callable
            stop_words=None,
            lowercase=True,
            max_df=1.0,
            min_df=2,  #------------------------------- 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)
    else:
        extractr_name = config["extractr_name"]
        try:
            # extract the n-gram bounds from the config json file
            min_char_ngrams = config["configuration"]["min_char_ngrams"]
            max_char_ngrams = config["configuration"]["max_char_ngrams"]
            min_word_ngrams = config["configuration"]["min_word_ngrams"]
            max_word_ngrams = config["configuration"]["max_word_ngrams"]
            ngram_extractor = Ngram_extractor(int(min_char_ngrams),
                                              int(max_char_ngrams),
                                              int(min_word_ngrams),
                                              int(max_word_ngrams))
            ngram_extractor.printArgs()
            extractr = CountVectorizer(  # ----------------- Default Values
                input='content',
                encoding='utf-8',
                decode_error='ignore',
                strip_accents=None,
                analyzer=ngram_extractor.analyzer,
                preprocessor=None,
                tokenizer=None,
                ngram_range=(1, 1),  # ignored: analyzer is a callable
                stop_words=None,
                lowercase=True,
                max_df=1.0,
                min_df=2,  # ------------------------------- 1
                max_features=None,
                vocabulary=None,
                binary=False,
                dtype=np.int64)
        except Exception:
            abort_clean(
                "Features Extractor configuration failed", "Configuring " +
                config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
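Ngram_extractor.analyzer itself is not shown in this listing; conceptually it must map one document to its combined list of char and word n-grams. A hypothetical stand-in, for reference only:

# Hypothetical stand-in for Ngram_extractor.analyzer (the real
# implementation is not part of this listing):
def toy_analyzer(doc, cmin=3, cmax=5, wmin=1, wmax=2):
    grams = []
    for n in range(cmin, cmax + 1):  # char n-grams
        grams += [doc[i:i + n] for i in range(len(doc) - n + 1)]
    words = doc.split()
    for n in range(wmin, wmax + 1):  # word n-grams
        grams += [" ".join(words[i:i + n])
                  for i in range(len(words) - n + 1)]
    return grams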
Example #13
def get_wc2(language=None, config=None):
    '''
    Returns a word count (bigram) vectorizer.
    If specified, follows the config to set up the vectorizer,
    else follows the default wc2 setup.
    '''
    extractr_name = ""
    extractr = None

    if language is None:
        tokenizr = TweetTokenizer(preserve_case=True,
                                  strip_handles=True,
                                  reduce_len=False)
    elif language == 'ar':
        from giovanniScripts.clean_ar_txt import clean_ar_txt
        tokenizr = clean_ar_txt()
    elif language == 'en':
        from giovanniScripts.clean_en_txt import clean_en_txt
        tokenizr = clean_en_txt()
    elif language == 'es':
        from giovanniScripts.clean_es_txt import clean_es_txt
        tokenizr = clean_es_txt()
    else:
        # fall back to the generic tokenizer for unsupported language codes
        tokenizr = TweetTokenizer(preserve_case=True,
                                  strip_handles=True,
                                  reduce_len=False)


    if not (config):
        extractr_name = "wc2-default"
        extractr = CountVectorizer(  #----------------- Default Values
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            preprocessor=None,
            tokenizer=tokenizr.tokenize,  #------------ None
            ngram_range=(1, 2),  #--------------------- (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2,  #------------------------------- 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)

    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility
            config["configuration"]["ngram_range"] = tuple(
                config["configuration"]["ngram_range"])
            config["configuration"]["dtype"] = np.int64
            config["configuration"]["tokenizer"] = tokenizr.tokenize

            extractr = CountVectorizer(**(config["configuration"]))
        except Exception:
            abort_clean(
                "Features Extractor configuration failed", "Configuring " +
                config["extractr_type"] + " with : " +
                str(config["configuration"]))

    res = (extractr_name, extractr)
    return res
def predict(inputPath,
            classifierPath,
            outputPath=None,
            verbosity_level=1,
            languages=['en', 'es'],
            is_dataset_2019=False):
    '''
    Given inputPath and classifierPath, returns outputDic, which contains
    the prediction results.

    :param inputPath: path to the PAN18 dataset
    :param classifierPath: path to the dir containing the classifiers
                           produced by 'text_training.py'
    :param outputPath: path to the dir that will contain the prediction results
    :param verbosity_level: verbosity of the console output
    :param languages: list of language codes to process
    :param is_dataset_2019: whether the input follows the 2019 dataset layout
    :return outputDic: { userId: [femaleScore, maleScore] }
    '''

    outputDic = {}
    # PAN 18 specifics
    for lang in languages:

        if verbosity_level:
            print('---------------------------------------')
            print("Language up for classification: '" + lang + "'\n")

        classifier_dir_path = classifierPath + "/" + lang
        if outputPath is not None:
            output_dir_path = format_dir_name(outputPath + lang)

        # ----------------------------------------------------------------------
        # Load the tweets features
        Authors = parse_tweets_from_dir(
            input_dir=format_dir_name(inputPath + "/" + lang + "/"),
            label=False,
            verbosity_level=verbosity_level)

        if not (Authors):
            abort_clean("Tweets loading failed")

        # ----------------------------------------------------------------------
        # Load the classifiers
        classifiers = load_classifiers(classifier_dir_path=classifier_dir_path,
                                       classification_type='loose',
                                       verbose=verbosity_level)
        # ----------------------------------------------------------------------
        # Start classification, 'txt', 'img' or 'comb'
        if verbosity_level:
            print("Starting authors classification ...")
            t0 = time()
        classify_authors(Authors, classifiers, outputDic, int(verbosity_level))

        if verbosity_level:
            print("Classification of '" + lang + "' complete in %.3f seconds" %
                  (time() - t0))
            print('---------------------------------------\n')

        if outputPath is not None:
            create_dir(output_dir_path)
            for auth in Authors:
                save_author_file(author=auth,
                                 output_dir=output_dir_path,
                                 verbose=verbosity_level > 1)
        # free the classifier objects to limit memory usage
        gc.collect()
    return outputDic
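A hypothetical invocation (all paths are illustrative):

# results = predict(inputPath="data/pan18",
#                   classifierPath="models",
#                   outputPath="out/",
#                   verbosity_level=1,
#                   languages=['en', 'es'])
# results -> { userId: [femaleScore, maleScore], ... }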