class Wordk2VecSA:
    """Sentiment-analysis baseline that feeds embedding-based vectors to classifiers."""

    def __init__(self):
        """Load the embedding model and the 'bank_message' dataset."""
        print("Start Word2Vec - Sentiment Analysis")
        self.sp = Support()
        vectors_path = ROOT_DIR_DATA_EMBEDDING + "SBW-vectors-300-min5.txt"
        loaded_vectors = self.sp.load_vectors_from_csv(vectors_path)
        self.model = Word2Vec.load(loaded_vectors)
        dataset = self.sp.process_data(
            filename='bank_message',
            size_msg=3,
            clean=True,
            replace_text=True,
            stemmed=None,
            lemmatize=None,
            spelling=None,
        )
        self.data, self.label = dataset

    def baseline(self,  num_features=300, fold=5, iteration=3):
        """Evaluate every classifier in ``wor2vec_model`` and print its scores.

        For each classifier the data is split, both partitions are passed
        through ``Support.balanced_data``, feature vectors are built with
        ``Support.getAvgFeatureVecs`` and four cross-validated metrics are
        accumulated over ``iteration`` rounds.

        :param num_features: forwarded to ``Support.getAvgFeatureVecs``.
        :param fold: number of cross-validation folds (``cv`` argument).
        :param iteration: number of rounds the per-fold scores are averaged over.
        """
        # Metric name -> scikit-learn scorer, in evaluation order.
        scorers = (
            ('recall', 'recall_macro'),
            ('precision', 'precision_weighted'),
            ('f1', 'f1_weighted'),
            ('accuracy', 'balanced_accuracy'),
        )
        for model_type, classifier in wor2vec_model.items():
            totals = {metric: 0.0 for metric, _ in scorers}
            for _round in range(iteration):
                x_train, x_test, y_train, y_test = train_test_split(
                    self.data, self.label, test_size=0.25, random_state=1000)
                x_train, y_train = self.sp.balanced_data(x_train, y_train)
                x_test, y_test = self.sp.balanced_data(x_test, y_test)

                train_vecs = self.sp.getAvgFeatureVecs(x_train, self.model, num_features)
                test_vecs = self.sp.getAvgFeatureVecs(x_test, self.model, num_features)

                classifier.fit(train_vecs, y_train)
                predict = classifier.predict(test_vecs)

                # Each call returns one score per fold; the per-fold arrays
                # are summed element-wise across rounds.
                for metric, scoring in scorers:
                    totals[metric] += cross_val_score(
                        classifier, train_vecs, y_train, cv=fold, scoring=scoring)

            # Per-fold averages over the rounds (still one value per fold).
            recall = totals['recall'] / iteration
            precision = totals['precision'] / iteration
            f1 = totals['f1'] / iteration
            accuracy = totals['accuracy'] / iteration
            # The predictions and labels reported are those of the last round.
            self.sp.print_score(
                model_type,
                predicted_classes=predict,
                recall=recall,
                precision=precision,
                f1=f1,
                accuracy=accuracy,
                test=y_test,
            )
class SenticNetSA:
    """Sentiment baseline that labels messages with SenticNet polarity values."""

    def __init__(self):
        """Create the helpers and load the corpus, terminology and dataset."""
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message',
                                                size_msg=3,
                                                clean=True,
                                                replace_text=True,
                                                stemmed=None,
                                                lemmatize=None,
                                                spelling=None)

    def baseline(self):
        """Score the SenticNet polarity against the gold labels and print F1.

        The polarity returned by ``self.sn.message_concept`` is mapped to
        ``0.0`` (neutral) when it lies strictly between -0.1 and 0.10, and is
        otherwise compared as-is with the gold label.

        Counting rules:
          * TP: prediction equals the gold label.
          * FP: any mismatch.
          * FN: a mismatch where both the gold label and the prediction are
            one of -1, 0 or 1.

        Precision, recall and F1 fall back to ``0.0`` when their denominator
        is zero instead of raising ``ZeroDivisionError``.
        """
        sentiment_classes = (1.0, 0.0, -1.0)
        TP = 0
        FP = 0
        FN = 0
        # NOTE(review): only the 80% "train" partition is evaluated; the
        # held-out partition is never used here.
        x_train, _x_test, y_train, _y_test = train_test_split(
            self.data, self.label, test_size=0.20, random_state=1000)
        # zip() walks both partitions in lockstep, so it does not depend on
        # positional integer indexing being available.
        for raw_msg, raw_label in zip(x_train, y_train):
            msg = str(raw_msg)
            value = float(raw_label)
            result = self.sn.message_concept(msg)
            polarity_value = float(result['polarity_value'])
            # Neutral band: the bounds must BOTH hold. The previous `or` was
            # true for every number and forced every prediction to 0.0.
            if -0.1 < polarity_value < 0.10:
                polarity_value = 0.0
            if value == polarity_value:
                TP += 1
            else:
                FP += 1
                # value != polarity_value is already known here, so checking
                # that both are valid classes covers every confusion pair.
                if value in sentiment_classes and polarity_value in sentiment_classes:
                    FN += 1

        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        if precision + recall:
            f1 = 2*((precision*recall) / (precision + recall))
        else:
            f1 = 0.0
        print("f1-score : {}%".format(round(f1 * 100, 2)))
 def __init__(self, clean_data=True):
     """Load the 'bank_message' dataset through ``Support.process_data``.

     :param clean_data: forwarded unchanged as the ``clean_data`` option.
     """
     self.data = Support.process_data(
         filename='bank_message',
         size_msg=3,
         clean_data=clean_data,
         replace_text=True,
         stemmed=None,
         lemmatize=None,
         spelling=None,
     )
 def __init__(self):
     """Set up the helper, the polarity lexicon, the CPU count and the dataset."""
     self.sp = Support()
     self.lexicon = self.sp.load_file_polarity()
     # Number of CPUs reported by the multiprocessing module.
     self.cores = multiprocessing.cpu_count()
     self.data = Support.process_data(
         filename='bank_message',
         size_msg=3,
         clean_data=True,
         replace_text=True,
         stemmed=None,
         lemmatize=None,
         spelling=None,
     )
Exemplo n.º 5
0
class DomainOfWord:
    """Baseline that combines bag-of-words, TF-IDF, lexicon and terminology features."""

    # Result key -> scikit-learn scorer name, in evaluation order.
    _SCORERS = (
        ('recall', 'recall_macro'),
        ('precision', 'precision_weighted'),
        ('f1', 'f1_weighted'),
        ('accuracy', 'balanced_accuracy'),
    )

    def __init__(self, clean_data=True):
        """Load the 'bank_message' dataset.

        :param clean_data: forwarded to ``Support.process_data``.
        """
        self.sp = Support()
        self.data = self.sp.process_data(filename='bank_message',
                                         size_msg=3,
                                         clean_data=clean_data,
                                         replace_text=True,
                                         stemmed=None,
                                         lemmatize=None,
                                         spelling=None)

    def _balanced_split(self):
        """Split ``self.data`` 75/25 and pass each partition through ``balanced_data``.

        :return: ``(x_train, x_test, y_train, y_test)`` taken from the
            'msg' and 'label' columns of the balanced frames.
        """
        x_train, x_test, y_train, y_test = train_test_split(
            self.data['msg'],
            self.data['label'],
            test_size=0.25,
            random_state=1000)
        train = self.sp.balanced_data(
            pd.DataFrame({'msg': x_train, 'label': y_train}))
        test = self.sp.balanced_data(
            pd.DataFrame({'msg': x_test, 'label': y_test}))
        return train['msg'], test['msg'], train['label'], test['label']

    @classmethod
    def _cross_validated_scores(cls, estimator, x_train, y_train, fold,
                                iteration):
        """Return each metric's mean cross-validated score as a rounded percentage.

        Per-fold scores are summed over ``iteration`` rounds, divided by
        ``iteration``, averaged over folds and scaled to a percentage with
        two decimals.
        """
        totals = {key: 0.0 for key, _ in cls._SCORERS}
        for i in range(iteration):
            print('Iteration {0} for Cross-Validation'.format(i + 1))
            for key, scoring in cls._SCORERS:
                totals[key] += cross_val_score(estimator,
                                               x_train,
                                               y_train,
                                               cv=fold,
                                               scoring=scoring,
                                               n_jobs=-1)
        return {
            key: round(np.mean(total / iteration) * 100, 2)
            for key, total in totals.items()
        }

    @staticmethod
    def _format_elapsed(start_time):
        """Format the time elapsed since ``start_time`` as hours, minutes, seconds."""
        t_sec = round(time.time() - start_time)
        (t_min, t_sec) = divmod(t_sec, 60)
        (t_hour, t_min) = divmod(t_min, 60)
        return '{} hour:{} min:{} sec'.format(t_hour, t_min, t_sec)

    def baseline(self,
                 model_name,
                 list_classifier,
                 ngram=None,
                 min_df=3,
                 max_features=5000,
                 fold=5,
                 stop_words=None,
                 iteration=3):
        """Cross-validate every requested classifier on the combined features.

        :param model_name: label stored under 'model_name' in each result.
        :param list_classifier: passed to ``ClassifierModels`` to build the
            classifiers to evaluate.
        :param ngram: n-gram range for both vectorizers; defaults to (1, 2).
        :param min_df: ``min_df`` for both vectorizers.
        :param max_features: ``max_features`` for both vectorizers.
        :param fold: number of cross-validation folds.
        :param stop_words: any non-None value enables the Spanish list from
            ``stopwords.words``; the value itself is not used.
        :param iteration: number of cross-validation rounds averaged.
        :return: list with one dict per classifier (classifier_name, recall,
            precision, f1, accuracy, time_processing, model_name), or
            ``None`` when an exception was caught and printed.
        """
        try:
            start_time = time.time()
            result = []
            ngram = (1, 2) if ngram is None else ngram
            # scikit-learn documents stop_words as 'english', a list or None;
            # recent releases reject a set during parameter validation.
            stopw = list(
                stopwords.words("spanish")) if stop_words is not None else None
            models = ClassifierModels(list_classifier).classifiers
            for classifier_name, classifier in models.items():
                # training and sa_test are balanced
                x_train, x_test, y_train, y_test = self._balanced_split()

                vec_bow = CountVectorizer(analyzer='word',
                                          lowercase=True,
                                          encoding='utf-8',
                                          ngram_range=ngram,
                                          min_df=min_df,
                                          max_features=max_features,
                                          stop_words=stopw)

                vec_tfidf = TfidfVectorizer(analyzer='word',
                                            lowercase=True,
                                            encoding='utf-8',
                                            ngram_range=ngram,
                                            min_df=min_df,
                                            max_features=max_features,
                                            stop_words=stopw)

                lexicon = LexiconExtractor()
                terminology = TerminologyExtractor()

                vec_bow.fit(x_train)
                vec_tfidf.fit(x_train)
                lexicon.fit(x_train)
                terminology.fit(x_train)

                pipeline = Pipeline([
                    (
                        'feats',
                        FeatureUnion(
                            [('bow', vec_bow), ('tfidf', vec_tfidf),
                             ('lexicon', lexicon),
                             ('terminology', terminology)],

                            # weight components in FeatureUnion
                            transformer_weights={
                                'bow': 0.2,
                                'tfidf': 0.2,
                                'lexicon': 0.3,
                                'terminology': 0.3
                            })),
                    (classifier_name, classifier)  # classifier
                ])

                pipeline.fit(x_train, y_train)
                # NOTE(review): the held-out predictions are computed but not
                # reported; only the cross-validated scores are.
                pipeline.predict(x_test)

                # Calculated Scores
                dict_data = {'classifier_name': classifier_name}
                dict_data.update(
                    self._cross_validated_scores(pipeline, x_train, y_train,
                                                 fold, iteration))

                # Calculated Time processing (measured from the start of the
                # whole call, so it accumulates across classifiers)
                dict_data['time_processing'] = self._format_elapsed(start_time)

                Support.print_score(dict_data=dict_data)
                dict_data['model_name'] = model_name
                result.append(dict_data)
            return result
        except Exception as e:
            print("\t ERROR baseline DomainOfWord: ", e)
class TFIDFSA:
    """Sentiment baseline that feeds TF-IDF vectors to a set of classifiers."""

    # Result key -> scikit-learn scorer name, in evaluation order.
    _SCORERS = (
        ('recall', 'recall_macro'),
        ('precision', 'precision_weighted'),
        ('f1', 'f1_weighted'),
        ('accuracy', 'balanced_accuracy'),
    )

    def __init__(self, clean_data=True):
        """Load the 'bank_message' dataset.

        :param clean_data: forwarded to ``Support.process_data``.
        """
        self.sp = Support()
        self.data = self.sp.process_data(filename='bank_message',
                                         size_msg=3,
                                         clean_data=clean_data,
                                         replace_text=True,
                                         stemmed=None,
                                         lemmatize=None,
                                         spelling=None)

    def _balanced_split(self):
        """Split ``self.data`` 75/25 and pass each partition through ``balanced_data``.

        :return: ``(x_train, x_test, y_train, y_test)`` taken from the
            'msg' and 'label' columns of the balanced frames.
        """
        x_train, x_test, y_train, y_test = train_test_split(
            self.data['msg'],
            self.data['label'],
            test_size=0.25,
            random_state=1000)
        train = self.sp.balanced_data(
            pd.DataFrame({'msg': x_train, 'label': y_train}))
        test = self.sp.balanced_data(
            pd.DataFrame({'msg': x_test, 'label': y_test}))
        return train['msg'], test['msg'], train['label'], test['label']

    @classmethod
    def _cross_validated_scores(cls, estimator, x_train, y_train, fold,
                                iteration):
        """Return each metric's mean cross-validated score as a rounded percentage.

        Per-fold scores are summed over ``iteration`` rounds, divided by
        ``iteration``, averaged over folds and scaled to a percentage with
        two decimals.
        """
        totals = {key: 0.0 for key, _ in cls._SCORERS}
        for _round in range(iteration):
            for key, scoring in cls._SCORERS:
                totals[key] += cross_val_score(estimator,
                                               x_train,
                                               y_train,
                                               cv=fold,
                                               scoring=scoring)
        return {
            key: round(np.mean(total / iteration) * 100, 2)
            for key, total in totals.items()
        }

    @staticmethod
    def _format_elapsed(start_time):
        """Format the time elapsed since ``start_time`` as hours, minutes, seconds."""
        t_sec = round(time.time() - start_time)
        (t_min, t_sec) = divmod(t_sec, 60)
        (t_hour, t_min) = divmod(t_min, 60)
        return '{} hour:{} min:{} sec'.format(t_hour, t_min, t_sec)

    def baseline(self,
                 model_name,
                 list_classifier,
                 analyzer='word',
                 ngram=None,
                 min_df=3,
                 max_features=None,
                 fold=5,
                 stop_words=None,
                 iteration=3):
        """Cross-validate every requested classifier on TF-IDF features.

        :param model_name: label stored under 'model_name' in each result.
        :param list_classifier: passed to ``ClassifierModels`` to build the
            classifiers to evaluate.
        :param analyzer: ``analyzer`` argument of ``TfidfVectorizer``.
        :param ngram: n-gram range for the vectorizer; defaults to (1, 2).
        :param min_df: ``min_df`` for the vectorizer.
        :param max_features: ``max_features`` for the vectorizer.
        :param fold: number of cross-validation folds.
        :param stop_words: any non-None value enables the Spanish list from
            ``stopwords.words``; the value itself is not used.
        :param iteration: number of cross-validation rounds averaged.
        :return: list with one dict per classifier (classifier_name, recall,
            precision, f1, accuracy, time_processing, model_name), or
            ``None`` when an exception was caught and printed.
        """
        try:
            start_time = time.time()
            result = []
            ngram = (1, 2) if ngram is None else ngram
            # scikit-learn documents stop_words as 'english', a list or None;
            # recent releases reject a set during parameter validation.
            stopw = list(
                stopwords.words("spanish")) if stop_words is not None else None
            models = ClassifierModels(list_classifier).classifiers
            for classifier_name, classifier in models.items():
                # The dataset is split again for every classifier, and both
                # the training and test partitions are balanced.
                x_train, x_test, y_train, y_test = self._balanced_split()

                vectorizer = TfidfVectorizer(analyzer=analyzer,
                                             lowercase=True,
                                             encoding='utf-8',
                                             min_df=min_df,
                                             ngram_range=ngram,
                                             max_features=max_features,
                                             stop_words=stopw)
                vectorizer.fit(x_train)
                x_train = vectorizer.transform(x_train)
                x_test = vectorizer.transform(x_test)
                classifier.fit(x_train, y_train)
                # NOTE(review): the held-out predictions are computed but not
                # reported; only the cross-validated scores are.
                classifier.predict(x_test)

                # Calculated Scores
                dict_data = {'classifier_name': classifier_name}
                dict_data.update(
                    self._cross_validated_scores(classifier, x_train, y_train,
                                                 fold, iteration))

                # Calculated Time processing (measured from the start of the
                # whole call, so it accumulates across classifiers)
                dict_data['time_processing'] = self._format_elapsed(start_time)

                Support.print_score(dict_data=dict_data)
                dict_data['model_name'] = model_name
                result.append(dict_data)
            return result
        except Exception as e:
            print("\t ERROR baseline TFIDF: ", e)