def get_lsa(config=None):
    '''
    Returns a latent semantic analysis extractor (TruncatedSVD).

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["extractr_name"] names the extractor and
        config["configuration"] is expanded as TruncatedSVD keyword
        arguments.

    Returns a (extractr_name, extractr) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    extractr = None

    if not config:
        extractr_name = "lsa-default"
        extractr = TruncatedSVD(
            n_components=1000,  # library default noted by the author: 2
            algorithm="randomized",
            n_iter=10,
            random_state=42,
            tol=0.)
    else:
        extractr_name = config["extractr_name"]
        try:
            extractr = TruncatedSVD(**(config["configuration"]))
        except Exception:
            # The configuration is a dict: it has to be stringified, otherwise
            # building this message raises TypeError and hides the real
            # failure before abort_clean is ever reached.
            abort_clean(
                "Features Extractor configuration failed",
                "Configuring " + str(config.get("extractr_type")) +
                " with : " + str(config.get("configuration")))

    return extractr_name, extractr
def get_nbb(config=None):
    '''
    Returns a Naive Bayes classifier (Bernoulli implementation).

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["classifier_name"] names the classifier and
        config["configuration"] is expanded as BernoulliNB keyword
        arguments.

    Returns a (clf_name, clf) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    clf = None

    if not config:
        clf_name = "nbb-default"
        clf = BernoulliNB(alpha=1.0,
                          binarize=.0,
                          fit_prior=True,
                          class_prior=None)
    else:
        clf_name = config["classifier_name"]
        try:
            clf = BernoulliNB(**(config["configuration"]))
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + str(config.get("classifier_type")) +
                " with : " + str(config.get("configuration")))

    return clf_name, clf
def get_tfidf(config=None):
    '''
    Returns a tf-idf transformer (TfidfTransformer, l2 norm).

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["extractr_name"] names the extractor and the keys
        "use_idf", "smooth_idf" and "sublinear_tf" are read from
        config["configuration"].

    Returns a (extractr_name, extractr) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    extractr = None

    if not config:
        extractr_name = "tfidf-default"
        extractr = TfidfTransformer(norm='l2',
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True)
    else:
        extractr_name = config["extractr_name"]
        try:
            # Extract the parameters from the (JSON-loaded) configuration
            settings = config["configuration"]
            use_idf = settings["use_idf"]
            smooth_idf = settings["smooth_idf"]
            sublinear_tf = settings["sublinear_tf"]
            print(use_idf, smooth_idf, sublinear_tf)
            # NOTE(review): bool() of a non-empty string such as "false" is
            # True - confirm the config stores real JSON booleans.
            extractr = TfidfTransformer(norm='l2',
                                        use_idf=bool(use_idf),
                                        smooth_idf=bool(smooth_idf),
                                        sublinear_tf=bool(sublinear_tf))
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Features Extractor configuration failed",
                "Configuring " + str(config.get("extractr_type")) +
                " with : " + str(config.get("configuration")))

    return extractr_name, extractr
def get_classifier(classifier_str, config=None, verbose=1):
    '''
    Returns the classifier specified in parameter, as a (name, clf) tuple.

    Available classifiers are :
        - nbb : NaiveBayes (Bernoulli)
        - mlp : Multi-layered Perceptron
        - rfo : Random Forest
        - svm : Support Vector Machine

    A classifier can be specified :
        - by its name --> a default classifier is instantiated
        - by a path to a config file --> the file is loaded with
          load_config and a custom classifier is instantiated from it

    classifier_str: one of the names above, or a config file path.
    config: optional already-loaded configuration dict; when truthy, its
        "classifier_type" entry replaces classifier_str.
    verbose: when truthy, progress messages are printed.
    '''
    if verbose and not config:
        print("Starting loading classifier ... ")

    if config:
        classifier_str = config["classifier_type"]

    #--------------------------------------------------------------------------
    # Get required classifier
    builders = {
        "svm": get_svm,
        "mlp": get_mlp,
        "nbb": get_nbb,
        "rfo": get_rfo,
    }
    builder = builders.get(classifier_str)

    if builder is None:
        # Not a known name: treat the string as a path to a config file
        try:
            config = load_config(classifier_str)
        except Exception:
            # 'except Exception' (not a bare except) so that Ctrl-C or a
            # SystemExit is not reported as a configuration problem.
            abort_clean(
                "Cannot load the classifier configuration",
                "Either the clf name is incorrect or the path is invalid : " +
                str(classifier_str))

        if verbose:
            print("Loading classifier config from file")
        # recursive call with config loaded
        return get_classifier("", config, verbose=verbose)

    clf_name, clf = builder(config)

    #--------------------------------------------------------------------------
    # Return classifier
    if verbose:
        print("classifier loaded: '" + clf_name + "'\n")

    return clf_name, clf
def load_model(filename):
    '''
    Reads back a persisted classifier (pipeline) with joblib.load.

    filename: path handed to joblib.load.

    Returns the loaded object. A loading failure is reported through
    abort_clean together with the text of the underlying exception.

    NOTE(review): joblib files are pickle-based, so only load model files
    from a trusted source.
    '''
    try:
        loaded = joblib.load(filename)
    except Exception as err:
        abort_clean("failed to load the classifier: {}".format(err))

    return loaded
def get_wc2(config=None):
    '''
    Returns a word count (unigram + bigram) vectorizer.

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["extractr_name"] names the extractor and
        config["configuration"] is expanded as CountVectorizer keyword
        arguments, after the adjustments JSON cannot express (tuple
        ngram_range, numpy dtype, tokenizer callable).

    Returns a (extractr_name, extractr) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    extractr = None
    tokenizr = TweetTokenizer(preserve_case=True,
                              strip_handles=True,
                              reduce_len=False)

    if not config:
        extractr_name = "wc2-default"
        # Trailing comments give the library default noted by the author
        extractr = CountVectorizer(
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            preprocessor=None,
            tokenizer=tokenizr.tokenize,  # default: None
            ngram_range=(1, 2),  # default: (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2,  # default: 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)
    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility, applied on a copy so
            # the caller's configuration dict is left untouched.
            params = dict(config["configuration"])
            params["ngram_range"] = tuple(params["ngram_range"])
            params["dtype"] = np.int64
            params["tokenizer"] = tokenizr.tokenize
            extractr = CountVectorizer(**params)
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Features Extractor configuration failed",
                "Configuring " + str(config.get("extractr_type")) +
                " with : " + str(config.get("configuration")))

    return extractr_name, extractr
def load_features_extr(features_str, language=None, verbose=1):
    '''
    Returns a list of features extractors matching the specified features_str.

    Available features extractors are :
        - wc2 : Word count - bigram
        - char_word_ngrams : char and/or word ngrams
        - tfidf : wc2 followed by TF-IDF
        - tfidfv2 : char_word_ngrams followed by TF-IDF
        - lsa : wc2, TF-IDF, then Latent Semantic Analysis

    A features extractor can be specified :
        - by its name --> default extractors are instantiated
        - by a path to a config file --> the file is loaded with
          load_config and handed to load_features_extr_from_file

    features_str: one of the names above, or a config file path.
    language: forwarded to get_wc2 for every chain that starts with wc2.
    verbose: when truthy, progress messages are printed.
    '''
    feat_extractors = []

    #--------------------------------------------------------------------------
    # Get required features_extractor
    if features_str == "wc2":
        feat_extractors.append(get_wc2(language=language, config=None))

    # 'elif' (not a second 'if'): otherwise "wc2" falls through to the final
    # 'else' below and is wrongly treated as a config file path.
    elif features_str == "char_word_ngrams":
        feat_extractors.append(get_char_words_ngrams(None))

    elif features_str == "tfidf":
        feat_extractors.append(get_wc2(language=language, config=None))
        feat_extractors.append(get_tfidf(None))

    elif features_str == "tfidfv2":
        feat_extractors.append(get_char_words_ngrams(None))
        feat_extractors.append(get_tfidf(None))

    elif features_str == "lsa":
        # language is forwarded here too, consistently with "wc2"/"tfidf"
        feat_extractors.append(get_wc2(language=language, config=None))
        feat_extractors.append(get_tfidf(None))
        feat_extractors.append(get_lsa(None))

    else:
        try:
            config = load_config(features_str)
        except Exception:
            abort_clean(
                "Cannot load the extractors configuration",
                "Either extr name is incorrect or path is invalid : " +
                str(features_str))
        # Load the config from a file
        if verbose:
            print("Loading features extractor config from file ")
        feat_extractors = load_features_extr_from_file(config, verbose=verbose)

    #--------------------------------------------------------------------------
    # Return features extractors
    return feat_extractors
def get_mlp(config=None):
    '''
    Returns a Multi-Layered Perceptron classifier.

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["classifier_name"] names the classifier and
        config["configuration"] is expanded as MLPClassifier keyword
        arguments ("hidden_layer_sizes" is converted to a tuple first,
        since JSON only provides lists).

    Returns a (clf_name, clf) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    clf = None

    if not config:
        clf_name = "mlp-default"
        clf = MLPClassifier(hidden_layer_sizes=(100, ),
                            activation="relu",
                            solver='adam',
                            alpha=0.0001,
                            batch_size='auto',
                            learning_rate="constant",
                            learning_rate_init=0.001,
                            power_t=0.5,
                            max_iter=200,
                            shuffle=True,
                            random_state=None,
                            tol=1e-4,
                            verbose=False,
                            warm_start=False,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            early_stopping=False,
                            validation_fraction=0.1,
                            beta_1=0.9,
                            beta_2=0.999,
                            epsilon=1e-8)
    else:
        clf_name = config["classifier_name"]
        try:
            # Work on a copy so the caller's configuration is not modified
            params = dict(config["configuration"])
            params["hidden_layer_sizes"] = tuple(params["hidden_layer_sizes"])
            clf = MLPClassifier(**params)
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + str(config.get("classifier_type")) +
                " with : " + str(config.get("configuration")))

    return clf_name, clf
def get_rfo(config=None):
    '''
    Returns a Random Forest classifier.

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["classifier_name"] names the classifier and
        config["configuration"] is expanded as RandomForestClassifier
        keyword arguments.

    Returns a (clf_name, clf) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    clf = None

    if not config:
        clf_name = "rfo-default"
        # NOTE(review): min_impurity_split and max_features="auto" are not
        # accepted by recent scikit-learn releases - confirm the pinned
        # version before upgrading.
        clf = RandomForestClassifier(
            n_estimators=10,
            criterion="gini",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.,
            max_features="auto",
            max_leaf_nodes=None,
            min_impurity_split=1e-7,
            bootstrap=True,
            oob_score=False,
            n_jobs=-1,  # library default noted by the author: 1
            random_state=None,
            verbose=0,
            warm_start=False,
            class_weight=None)
    else:
        clf_name = config["classifier_name"]
        try:
            clf = RandomForestClassifier(**(config["configuration"]))
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + str(config.get("classifier_type")) +
                " with : " + str(config.get("configuration")))

    return clf_name, clf
def get_svm(config=None):
    '''
    Returns a linear SVM classifier wrapped in CalibratedClassifierCV.

    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["classifier_name"] names the classifier and
        config["configuration"] is expanded as LinearSVC keyword arguments.

    Returns a (clf_name, clf) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    from sklearn.calibration import CalibratedClassifierCV

    clf = None

    if not config:
        clf_name = "svm-default"
        # Trailing comments give the library default noted by the author
        clf = CalibratedClassifierCV(
            LinearSVC(
                C=1.0,
                loss='squared_hinge',
                penalty='l1',  # default: l2
                dual=False,  # default: True
                tol=1e-4,
                multi_class='crammer_singer',  # default: ovr
                fit_intercept=True,
                intercept_scaling=1,
                class_weight=None,
                verbose=0,
                random_state=None,
                max_iter=500))  # default: 1000
    else:
        clf_name = config["classifier_name"]
        try:
            clf = CalibratedClassifierCV(
                LinearSVC(**(config["configuration"])))
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Classifier configuration failed",
                "Configuring " + str(config.get("classifier_type")) +
                " with : " + str(config.get("configuration")))

    return clf_name, clf
def __init__(self,
             min_char_ngrams=3,
             max_char_ngrams=5,
             min_word_ngrams=1,
             max_word_ngrams=2):
    '''
    Stores and validates the character / word ngram bounds.

    The default values are the ones of the PAN17 winner paper.

    Setting both bounds of a family to 0 disables that family:
        - char bounds (0, 0) --> self.onlyWords is True
        - word bounds (0, 0) --> self.onlyChar is True

    abort_clean is called when both families are disabled, when a bound is
    negative, or when a max bound is lower than its min bound.
    '''
    self.onlyWords = (min_char_ngrams == 0) and (max_char_ngrams == 0)
    self.onlyChar = (min_word_ngrams == 0) and (max_word_ngrams == 0)

    bounds = (min_char_ngrams, max_char_ngrams,
              min_word_ngrams, max_word_ngrams)

    # Nothing left to extract if both families are disabled
    if self.onlyWords and self.onlyChar:
        abort_clean("Min and/or Max wrong in ngram configuration file.")
    if any(bound < 0 for bound in bounds):
        abort_clean("Min and/or Max wrong in ngram configuration file.")
    # The word check used to read 'max_word_ngrams < 0 < min_word_ngrams',
    # which can never hold once negative bounds are rejected above; compare
    # max against min, as is done for the char bounds.
    if (max_char_ngrams < min_char_ngrams
            or max_word_ngrams < min_word_ngrams):
        abort_clean("Min and/or Max wrong in ngram configuration file.")

    self.min_char_ngrams = min_char_ngrams
    self.max_char_ngrams = max_char_ngrams
    self.min_word_ngrams = min_word_ngrams
    self.max_word_ngrams = max_word_ngrams
def _char_words_count_vectorizer(ngram_extractor):
    '''
    Builds the CountVectorizer shared by the default and the configured
    setups; tokens are produced by ngram_extractor.analyzer.
    '''
    return CountVectorizer(
        input='content',
        encoding='utf-8',
        decode_error='ignore',
        strip_accents=None,
        analyzer=ngram_extractor.analyzer,
        preprocessor=None,
        tokenizer=None,
        ngram_range=None,
        stop_words=None,
        lowercase=True,
        max_df=1.0,
        min_df=2,  # library default noted by the author: 1
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64)


def get_char_words_ngrams(config=None):
    '''
    Returns a vectorizer based on character and/or word ngrams.

    config: optional dict. When falsy, the default bounds are used
        (char ngrams 3 to 5, word ngrams 1 to 2). Otherwise
        config["extractr_name"] names the extractor and the keys
        "min_char_ngrams", "max_char_ngrams", "min_word_ngrams" and
        "max_word_ngrams" are read from config["configuration"].

    Returns a (extractr_name, extractr) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    extractr = None

    if not config:
        extractr_name = "char_words_ngrams"
        extractr = _char_words_count_vectorizer(Ngram_extractor(3, 5, 1, 2))
    else:
        extractr_name = config["extractr_name"]
        try:
            # Only the ngram bounds are read; the caller's configuration
            # dict is no longer modified (the values that used to be
            # written into it were never used).
            settings = config["configuration"]
            ngram_extractor = Ngram_extractor(
                int(settings["min_char_ngrams"]),
                int(settings["max_char_ngrams"]),
                int(settings["min_word_ngrams"]),
                int(settings["max_word_ngrams"]))
            ngram_extractor.printArgs()
            extractr = _char_words_count_vectorizer(ngram_extractor)
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Features Extractor configuration failed",
                "Configuring " + str(config.get("extractr_type")) +
                " with : " + str(config.get("configuration")))

    return extractr_name, extractr
def get_wc2(language=None, config=None):
    '''
    Returns a word count (unigram + bigram) vectorizer.

    language: selects the tokenizer.
        - None : TweetTokenizer
        - 'ar' / 'en' / 'es' : the matching giovanniScripts cleaner
        Any other value is reported through abort_clean.
    config: optional dict. When falsy, the default setup below is used.
        Otherwise config["extractr_name"] names the extractor and
        config["configuration"] is expanded as CountVectorizer keyword
        arguments, after the adjustments JSON cannot express (tuple
        ngram_range, numpy dtype, tokenizer callable).

    Returns a (extractr_name, extractr) tuple.
    A failing custom configuration is reported through abort_clean.
    '''
    extractr = None

    # The language specific cleaners are imported lazily, only when needed
    if language is None:
        tokenizr = TweetTokenizer(preserve_case=True,
                                  strip_handles=True,
                                  reduce_len=False)
    elif language == 'ar':
        from giovanniScripts.clean_ar_txt import clean_ar_txt
        tokenizr = clean_ar_txt()
    elif language == 'en':
        from giovanniScripts.clean_en_txt import clean_en_txt
        tokenizr = clean_en_txt()
    elif language == 'es':
        from giovanniScripts.clean_es_txt import clean_es_txt
        tokenizr = clean_es_txt()
    else:
        # Without this branch an unknown language left 'tokenizr' unbound
        # and failed later with an unrelated UnboundLocalError.
        # abort_clean is presumably fatal - TODO confirm.
        abort_clean(
            "Features Extractor configuration failed",
            "Unsupported language for wc2 : " + str(language))

    if not config:
        extractr_name = "wc2-default"
        # Trailing comments give the library default noted by the author
        extractr = CountVectorizer(
            input='content',
            encoding='utf-8',
            decode_error='ignore',
            strip_accents=None,
            analyzer='word',
            tokenizer=tokenizr.tokenize,  # default: None
            ngram_range=(1, 2),  # default: (1, 1)
            stop_words=None,
            lowercase=True,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=2,  # default: 1
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.int64)
    else:
        extractr_name = config["extractr_name"]
        try:
            # Adjustments due to JSON incompatibility, applied on a copy so
            # the caller's configuration dict is left untouched.
            params = dict(config["configuration"])
            params["ngram_range"] = tuple(params["ngram_range"])
            params["dtype"] = np.int64
            params["tokenizer"] = tokenizr.tokenize
            extractr = CountVectorizer(**params)
        except Exception:
            # str() is required: concatenating the configuration dict to a
            # string would raise TypeError inside this handler.
            abort_clean(
                "Features Extractor configuration failed",
                "Configuring " + str(config.get("extractr_type")) +
                " with : " + str(config.get("configuration")))

    return extractr_name, extractr
def predict(inputPath,
            classifierPath,
            outputPath=None,
            verbosity_level=1,
            languages=['en', 'es'],
            is_dataset_2019=False):
    '''
    Classifies the authors of every requested language and returns the
    prediction results.

    :param inputPath: Path to the PAN18 dataset; the tweets of a language
        are read from '<inputPath>/<lang>/'
    :param classifierPath: Path to the dir containing the classifiers
        produced by 'text_training.py'; '<classifierPath>/<lang>' is loaded
    :param outputPath: Path to the dir that will contain the prediction
        results (one sub-directory per language). Nothing is written when
        it is None
    :param verbosity_level: when truthy, progress is printed; above 1 the
        author files are saved verbosely
    :param languages: languages to process (only iterated, never modified)
    :param is_dataset_2019: currently unused by this function
    :return outputDic: filled by classify_authors; per the original
        description { userId: [femaleScore, maleScore]}
    '''
    # Local import: keeps this function self-contained whatever the
    # module-level imports are.
    import os

    outputDic = {}

    # PAN 18 specifics
    for lang in languages:

        if verbosity_level:
            print('---------------------------------------')
            print("Language up for classification: '" + lang + "'\n")

        classifier_dir_path = classifierPath + "/" + lang

        # Always bound, so the saving step below cannot hit an unbound name
        # when no output directory was requested.
        output_dir_path = None
        if outputPath is not None:
            # os.path.join inserts the separator only when outputPath does
            # not already end with one (plain concatenation produced
            # e.g. 'outen' for outputPath='out').
            output_dir_path = format_dir_name(os.path.join(outputPath, lang))

        # ----------------------------------------------------------------------
        # Load the tweets features
        Authors = parse_tweets_from_dir(
            input_dir=format_dir_name(inputPath + "/" + lang + "/"),
            label=False,
            verbosity_level=verbosity_level)

        if not Authors:
            abort_clean("Tweets loading failed")

        # ----------------------------------------------------------------------
        # Load the classifiers
        classifiers = load_classifiers(classifier_dir_path=classifier_dir_path,
                                       classification_type='loose',
                                       verbose=verbosity_level)

        # ----------------------------------------------------------------------
        # Start classification
        if verbosity_level:
            print("Starting authors classification ...")
            t0 = time()

        classify_authors(Authors, classifiers, outputDic, int(verbosity_level))

        if verbosity_level:
            print("Classification of '" + lang + "' complete in %.3f seconds" %
                  (time() - t0))
            print('---------------------------------------\n')

        # ----------------------------------------------------------------------
        # Save the results, only when an output directory was requested
        if output_dir_path is not None:
            create_dir(output_dir_path)
            for auth in Authors:
                save_author_file(author=auth,
                                 output_dir=output_dir_path,
                                 verbose=verbosity_level > 1)

        # for memory issues, free the classifiers objects
        gc.collect()

    return outputDic