    def generate_embedding(self, size=300, min_count=3, window=5, downsampling=1e-3, negative=3):
        try:
            full_corpus = []
            corpus_ccd = Support.import_corpus_ccd()
            corpus_wikipedia_web = Support.corpus_bank_xml()
            corpus_bank = Support.import_dataset_bank()
            dataset = self.data['msg'].values.tolist()
            full_corpus.extend(corpus_ccd)
            full_corpus.extend(corpus_wikipedia_web)
            full_corpus.extend(corpus_bank)
            full_corpus.extend(dataset)
            print('Transform sentences to vectors ...')
            corpus = self.sp.sentence_vec(full_corpus)

            terminology = self.sp.import_terminology_embedding(filename='bank_terminology')
            print('Generate Embedding...')
            # gensim 3.x API: `size` and `iter` became `vector_size` and `epochs` in gensim 4.x
            model = Word2Vec(corpus, cbow_mean=1, workers=self.cores, size=size, min_count=min_count,
                             window=window, sample=downsampling, negative=negative, iter=30)
            # Continue training on the domain terminology; only words already
            # in the vocabulary get their vectors updated
            model.train(terminology, total_examples=model.corpus_count, epochs=model.epochs)

            # L2-normalize the vectors in place to save memory (deprecated in gensim 4.x)
            model.init_sims(replace=True)
            # Saving the model for later use. Can be loaded using Word2Vec.load()
            model_name = ROOT_DIR_DATA_EMBEDDING + "bank2vec.model"
            model.save(model_name)
            print('Model generated successfully!')
        except Exception as e:
            print("\t ERROR generate_embedding: ", e)
            return None
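A minimal reload sketch, assuming the standard gensim `Word2Vec.load` API and that `ROOT_DIR_DATA_EMBEDDING` resolves as above; the query word is only illustrative:

from gensim.models import Word2Vec

# Reload the persisted embedding and query it (the word is a placeholder)
model = Word2Vec.load(ROOT_DIR_DATA_EMBEDDING + "bank2vec.model")
print(model.wv.most_similar('banco', topn=5))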
Example #3
    def __init__(self, clean_data=True):
        self.sp = Support()
        self.data = self.sp.process_data(filename='bank_message',
                                         size_msg=3,
                                         clean_data=clean_data,
                                         replace_text=True,
                                         stemmed=None,
                                         lemmatize=None,
                                         spelling=None)
class SenticNetSA:

    def __init__(self):
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message',
                                                     size_msg=3,
                                                     clean=True,
                                                     replace_text=True,
                                                     stemmed=None,
                                                     lemmatize=None,
                                                     spelling=None)

    def baseline(self):
        TP = 0
        FP = 0
        FN = 0
        x_train, x_test, y_train, y_test = train_test_split(self.data, self.label, test_size=0.20, random_state=1000)
        for i in range(len(x_train)):
            msg = str(x_train[i])
            value = float(y_train[i])
            result = self.sn.message_concept(msg)
            polarity_value = float(result['polarity_value'])
            # Treat weak polarities in the open interval (-0.1, 0.1) as neutral
            polarity_value = 0.0 if -0.1 < polarity_value < 0.1 else polarity_value
            if value == polarity_value:
                TP += 1
            else:
                FP += 1
                if value == 1 and (polarity_value == 0.0 or polarity_value == -1.0):
                    FN += 1
                elif value == 0.0 and (polarity_value == 1 or polarity_value == -1.0):
                    FN += 1
                elif value == -1.0 and (polarity_value == 0.0 or polarity_value == 1.0):
                    FN += 1

        precision = TP/(TP + FP)
        recall = TP / (TP + FN)
        f1 = 2*((precision*recall) / (precision + recall))
        print("f1-score : {}%".format(round(f1 * 100, 2)))
class Wordk2VecSA:

    def __init__(self):
        print("Start Word2Vec - Sentiment Analysis")
        self.sp = Support()
        file_model = ROOT_DIR_DATA_EMBEDDING + "SBW-vectors-300-min5.txt"
        model_tmp = self.sp.load_vectors_from_csv(file_model)
        self.model = Word2Vec.load(model_tmp)
        self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3, clean=True, replace_text=True,
                                                     stemmed=None, lemmatize=None, spelling=None)


    def baseline(self, num_features=300, fold=5, iteration=3):
        # `wor2vec_model` maps a label to a scikit-learn classifier; an assumed
        # definition is sketched after this method
        for model_type, classifier in wor2vec_model.items():
            sum_recall = 0.0
            sum_precision = 0.0
            sum_f1 = 0.0
            sum_accuracy = 0.0
            for i in range(0, iteration):
                x_train, x_test, y_train, y_test = train_test_split(self.data, self.label, test_size=0.25, random_state=1000)
                x_train, y_train = self.sp.balanced_data(x_train, y_train)
                x_test, y_test = self.sp.balanced_data(x_test, y_test)


                trainDataVecs = self.sp.getAvgFeatureVecs(x_train, self.model, num_features)
                testDataVecs = self.sp.getAvgFeatureVecs(x_test, self.model, num_features)

                classifier.fit(trainDataVecs, y_train)
                predict = classifier.predict(testDataVecs)

                # Recall Scores
                recall_scores = cross_val_score(classifier, trainDataVecs, y_train, cv=fold, scoring='recall_macro')
                sum_recall += recall_scores
                # Precision Score
                precision_score = cross_val_score(classifier, trainDataVecs, y_train, cv=fold, scoring='precision_weighted')
                sum_precision += precision_score
                # F1 Score
                f1_score = cross_val_score(classifier, trainDataVecs, y_train, cv=fold, scoring='f1_weighted')
                sum_f1 += f1_score
                # Accuracy Score
                accuracy_score = cross_val_score(classifier, trainDataVecs, y_train, cv=fold, scoring='balanced_accuracy')
                sum_accuracy += accuracy_score

            recall = sum_recall / iteration
            precision = sum_precision / iteration
            f1 = sum_f1 / iteration
            accuracy = sum_accuracy / iteration
            self.sp.print_score(model_type, predicted_classes=predict, recall=recall, precision=precision, f1=f1,
                                accuracy=accuracy, test=y_test)
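The `wor2vec_model` mapping iterated by the method above is not defined in this excerpt; a plausible shape, with classifier choices that are pure assumptions, is a label-to-estimator dictionary:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Hypothetical definition of the mapping consumed by Wordk2VecSA.baseline();
# any scikit-learn estimators would do here
wor2vec_model = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'LinearSVC': LinearSVC(),
}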
    def baseline(self, model_name, list_classifier, analyzer='word', ngram=None, min_df=3,
                 max_features=None, fold=5, stop_words=None, iteration=3):
        try:
            start_time = time.time()
            result = []
            ngram = (1, 2) if ngram is None else ngram
            stopw = set(stopwords.words("spanish")) if stop_words is not None else None
            models = ClassifierModels(list_classifier).classifiers
            for classifier_name, classifier in models.items():
                dict_data = {}
                x_train, x_test, y_train, y_test = train_test_split(self.data['msg'], self.data['label'], test_size=0.25,
                                                                    random_state=1000)
                # Balance the training and test instances
                df_train = pd.DataFrame({'msg': x_train, 'label': y_train})
                df_test = pd.DataFrame({'msg': x_test, 'label': y_test})

                train = Support.balanced_data(df_train)
                x_train = train['msg']
                y_train = train['label']

                test = Support.balanced_data(df_test)
                x_test = test['msg']
                y_test = test['label']

                vectorizer = CountVectorizer(analyzer=analyzer, lowercase=True, encoding='utf-8', min_df=min_df,
                                             ngram_range=ngram, max_features=max_features, stop_words=stopw)

                vectorizer.fit(x_train)
                x_train = vectorizer.transform(x_train)
                x_test = vectorizer.transform(x_test)
                classifier.fit(x_train, y_train)
                predict = classifier.predict(x_test)

                sum_recall = 0.0
                sum_precision = 0.0
                sum_f1 = 0.0
                sum_accuracy = 0.0
                for i in range(0, iteration):
                    # Recall Scores
                    recall_scores = cross_val_score(classifier, x_train, y_train, cv=fold, scoring='recall_macro')
                    sum_recall += recall_scores
                    # Precision Score
                    precision_score = cross_val_score(classifier, x_train, y_train, cv=fold, scoring='precision_weighted')
                    sum_precision += precision_score
                    # F1 Score
                    f1_score = cross_val_score(classifier, x_train, y_train, cv=fold, scoring='f1_weighted')
                    sum_f1 += f1_score
                    # Accuracy Score
                    accuracy_score = cross_val_score(classifier, x_train, y_train, cv=fold, scoring='balanced_accuracy')
                    sum_accuracy += accuracy_score

                # Calculated scores
                dict_data['classifier_name'] = classifier_name

                recall = sum_recall/iteration
                dict_data['recall'] = round(np.mean(recall) * 100, 2)

                precision = sum_precision/iteration
                dict_data['precision'] = round(np.mean(precision) * 100, 2)

                f1 = sum_f1/iteration
                dict_data['f1'] = round(np.mean(f1) * 100, 2)

                accuracy = sum_accuracy/iteration
                dict_data['accuracy'] = round(np.mean(accuracy) * 100, 2)

                # Compute processing time
                t_sec = round(time.time() - start_time)
                (t_min, t_sec) = divmod(t_sec, 60)
                (t_hour, t_min) = divmod(t_min, 60)
                time_processing = '{} hour:{} min:{} sec'.format(t_hour, t_min, t_sec)
                dict_data['time_processing'] = time_processing

                Support.print_score(dict_data=dict_data)
                dict_data['model_name'] = model_name
                result.append(dict_data)
            return result
        except Exception as e:
            print("\t ERROR baseline BoW: ", e)
    def __init__(self):
        self.sp = Support()
        self.lexicon = self.sp.load_file_polarity()
        self.cores = multiprocessing.cpu_count()
        self.data = Support.process_data(filename='bank_message', size_msg=3, clean_data=True, replace_text=True,
                                         stemmed=None, lemmatize=None, spelling=None)
Example #9
    def baseline(self,
                 model_name,
                 list_classifier,
                 ngram=None,
                 min_df=3,
                 max_features=5000,
                 fold=5,
                 stop_words=None,
                 iteration=3):

        try:
            start_time = time.time()
            result = []
            ngram = (1, 2) if ngram is None else ngram
            stopw = set(
                stopwords.words("spanish")) if stop_words is not None else None
            models = ClassifierModels(list_classifier).classifiers
            for classifier_name, classifier in models.items():
                dict_data = {}
                x_train, x_test, y_train, y_test = train_test_split(
                    self.data['msg'],
                    self.data['label'],
                    test_size=0.25,
                    random_state=1000)
                # training and sa_test are balanced
                df_train = pd.DataFrame({'msg': x_train, 'label': y_train})
                df_test = pd.DataFrame({'msg': x_test, 'label': y_test})
                train = self.sp.balanced_data(df_train)
                x_train = train['msg']
                y_train = train['label']
                test = self.sp.balanced_data(df_test)
                x_test = test['msg']
                y_test = test['label']
                vec_bow = CountVectorizer(analyzer='word',
                                          lowercase=True,
                                          encoding='utf-8',
                                          ngram_range=ngram,
                                          min_df=min_df,
                                          max_features=max_features,
                                          stop_words=stopw)

                vec_tfidf = TfidfVectorizer(analyzer='word',
                                            lowercase=True,
                                            encoding='utf-8',
                                            ngram_range=ngram,
                                            min_df=min_df,
                                            max_features=max_features,
                                            stop_words=stopw)

                lexicon = LexiconExtractor()
                terminology = TerminologyExtractor()

                vec_bow.fit(x_train)
                vec_tfidf.fit(x_train)
                lexicon.fit(x_train)
                terminology.fit(x_train)

                pipeline = Pipeline([
                    (
                        'feats',
                        FeatureUnion(
                            [('bow', vec_bow), ('tfidf', vec_tfidf),
                             ('lexicon', lexicon),
                             ('terminology', terminology)],

                            # weight components in FeatureUnion
                            transformer_weights={
                                'bow': 0.2,
                                'tfidf': 0.2,
                                'lexicon': 0.3,
                                'terminology': 0.3
                            })),
                    (classifier_name, classifier)  # classifier
                ])

                pipeline.fit(x_train, y_train)
                predict = pipeline.predict(x_test)
                sum_recall = 0.0
                sum_precision = 0.0
                sum_f1 = 0.0
                sum_accuracy = 0.0
                for i in range(0, iteration):
                    print('Iteration {0} for Cross-Validation'.format(i + 1))
                    #Recall
                    recall_scores = cross_val_score(pipeline,
                                                    x_train,
                                                    y_train,
                                                    cv=fold,
                                                    scoring='recall_macro',
                                                    n_jobs=-1)
                    sum_recall += recall_scores
                    #Precision
                    precision_score = cross_val_score(
                        pipeline,
                        x_train,
                        y_train,
                        cv=fold,
                        scoring='precision_weighted',
                        n_jobs=-1)
                    sum_precision += precision_score
                    #F1
                    f1_score = cross_val_score(pipeline,
                                               x_train,
                                               y_train,
                                               cv=fold,
                                               scoring='f1_weighted',
                                               n_jobs=-1)
                    sum_f1 += f1_score
                    #Accuracy
                    accuracy_score = cross_val_score(
                        pipeline,
                        x_train,
                        y_train,
                        cv=fold,
                        scoring='balanced_accuracy',
                        n_jobs=-1)
                    sum_accuracy += accuracy_score

                # Calculated Scores
                dict_data['classifier_name'] = classifier_name

                recall = sum_recall / iteration
                dict_data['recall'] = round(np.mean(recall) * 100, 2)

                precision = sum_precision / iteration
                dict_data['precision'] = round(np.mean(precision) * 100, 2)

                f1 = sum_f1 / iteration
                dict_data['f1'] = round(np.mean(f1) * 100, 2)

                accuracy = sum_accuracy / iteration
                dict_data['accuracy'] = round(np.mean(accuracy) * 100, 2)

                # Compute processing time
                t_sec = round(time.time() - start_time)
                (t_min, t_sec) = divmod(t_sec, 60)
                (t_hour, t_min) = divmod(t_min, 60)
                time_processing = '{} hour:{} min:{} sec'.format(
                    t_hour, t_min, t_sec)
                dict_data['time_processing'] = time_processing

                Support.print_score(dict_data=dict_data)
                dict_data['model_name'] = model_name
                result.append(dict_data)
            return result
        except Exception as e:
            print("\t ERROR baseline DomainOfWord: ", e)
Example #10
    def __init__(self):
        sp = Support()
        self.terminology = sp.import_terminology(filename='bank_terminology')
                                             iteration=3)
            result_models.extend(result_tfidf)

            model_name_dow = 'DoW_N@{}_df@{}_fold@{}'.format(ngram, min_df, item['fold'])
            model_name_dow += '_clean_data' if item['clean_data'] else '_raw_data'
            model_name_dow += '_stop_words' if item['stop_words'] else '_without_stop_words'
            print("#" * 10 + '| Model:' + model_name_dow + ' |' + "#" * 10)
            result_dow = dow.baseline(model_name=model_name_dow,
                                      list_classifier=list_classifier,
                                      ngram=(1, ngram),
                                      min_df=min_df,
                                      max_features=5000,
                                      stop_words=item['stop_words'],
                                      fold=item['fold'],
                                      iteration=3)
            result_models.extend(result_dow)

    model_name_b2v = 'Bank2Vec' + '_fold@' + str(item['fold'])
    model_name_b2v += '_clean_data' if item['clean_data'] else '_raw_data'
    print("#" * 10 + '| Model: ' + model_name_b2v + ' |' + "#" * 10)
    result_wv = wv.baseline(model_name=model_name_b2v,
                            list_classifier=list_classifier,
                            fold=item['fold'],
                            iteration=3)
    result_models.extend(result_wv)

Support.save_to_csv(result_models)
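The driver above references `item`, `list_classifier`, `dow`, `wv`, and `result_models` without showing their setup; a plausible reconstruction, purely an assumption, looks like:

# Hypothetical experiment grid for the driver above; every value is assumed
list_classifier = ['svm', 'logistic_regression']
result_models = []
experiments = [
    {'fold': 5, 'clean_data': True, 'stop_words': True},
    {'fold': 10, 'clean_data': False, 'stop_words': False},
]
# dow and wv would be the DomainOfWord and Wordk2VecSA instances, e.g.
# dow = DomainOfWordSA(); wv = Wordk2VecSA()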
Example #12
    def __init__(self):
        sp = Support()
        self.lexicon = sp.load_file_polarity()