def generate_embedding(self, size=300, min_count=3, window=5, downsampling=1e-3, negative=3):
    try:
        # Build the training corpus from every available source.
        full_corpus = []
        corpus_ccd = Support.import_corpus_ccd()
        corpus_wikipedia_web = Support.corpus_bank_xml()
        corpus_bank = Support.import_dataset_bank()
        dataset = self.data['msg'].values.tolist()
        full_corpus.extend(corpus_ccd)
        full_corpus.extend(corpus_wikipedia_web)
        full_corpus.extend(corpus_bank)
        full_corpus.extend(dataset)

        print('Transform sentences to vectors ...')
        corpus = self.sp.sentence_vec(full_corpus)
        terminology = self.sp.import_terminology_embedding(filename='bank_terminology')

        print('Generate Embedding...')
        # gensim 3.x API: `size` and `iter` were renamed to `vector_size` and `epochs` in gensim 4.
        model = Word2Vec(corpus, cbow_mean=1, workers=self.cores, size=size,
                         min_count=min_count, window=window, sample=downsampling,
                         negative=negative, iter=30)
        # Continue training on the domain terminology.
        model.train(terminology, total_examples=model.corpus_count, epochs=model.epochs)
        model.init_sims(replace=True)

        # Saving the model for later use. Can be loaded using Word2Vec.load()
        model_name = ROOT_DIR_DATA_EMBEDDING + "bank2vec.model"
        model.save(model_name)
        print('Model generated successfully!')
    except Exception as e:
        print("\t ERROR generate_embedding: ", e)
        return None
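# A hypothetical usage sketch (not part of the original module): once
# generate_embedding() has written bank2vec.model, the embedding can be
# reloaded and queried through the standard gensim API. The query word
# 'banco' is only an illustrative example.
from gensim.models import Word2Vec

bank2vec = Word2Vec.load(ROOT_DIR_DATA_EMBEDDING + "bank2vec.model")
print(bank2vec.wv.most_similar('banco', topn=5))  # nearest neighbours of a domain term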
def __init__(self): print("Start Word2Vec - Sentiment Analysis") self.sp = Support() file_model = ROOT_DIR_DATA_EMBEDDING + "SBW-vectors-300-min5.txt" model_tmp = self.sp.load_vectors_from_csv(file_model) self.model = Word2Vec.load(model_tmp) self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3, clean=True, replace_text=True, stemmed=None, lemmatize=None, spelling=None)
def __init__(self, clean_data=True):
    self.sp = Support()
    self.data = self.sp.process_data(filename='bank_message', size_msg=3,
                                     clean_data=clean_data, replace_text=True,
                                     stemmed=None, lemmatize=None, spelling=None)
def __init__(self): print("Start SenticNet - Sentiment Analysis") self.sp = Support() self.sn = SenticNet() self.corpus = self.sp.import_corpus_bank() self.terminology = self.sp.import_bank_terminology(filename='bank_terminology') self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3, clean=True, replace_text=True, stemmed=None, lemmatize=None, spelling=None)
class SenticNetSA:

    def __init__(self):
        print("Start SenticNet - Sentiment Analysis")
        self.sp = Support()
        self.sn = SenticNet()
        self.corpus = self.sp.import_corpus_bank()
        self.terminology = self.sp.import_bank_terminology(filename='bank_terminology')
        self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3,
                                                     clean=True, replace_text=True,
                                                     stemmed=None, lemmatize=None,
                                                     spelling=None)

    def baseline(self):
        TP = 0
        FP = 0
        FN = 0
        x_train, x_test, y_train, y_test = train_test_split(self.data, self.label,
                                                            test_size=0.20,
                                                            random_state=1000)
        for i in range(0, len(x_train)):
            msg = str(x_train[i])
            value = float(y_train[i])
            result = self.sn.message_concept(msg)
            polarity_value = float(result['polarity_value'])
            # Polarity scores inside the (-0.1, 0.1) band are treated as neutral.
            polarity_value = 0.0 if -0.1 < polarity_value < 0.10 else polarity_value
            if value == polarity_value:
                TP += 1
            else:
                FP += 1
                if value == 1 and (polarity_value == 0.0 or polarity_value == -1.0):
                    FN += 1
                elif value == 0.0 and (polarity_value == 1 or polarity_value == -1.0):
                    FN += 1
                elif value == -1.0 and (polarity_value == 0.0 or polarity_value == 1.0):
                    FN += 1

        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = 2 * ((precision * recall) / (precision + recall))
        print("f1-score : {}%".format(round(f1 * 100, 2)))
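# A hypothetical driver sketch for the lexicon-based baseline above; it only
# assumes that Support and SenticNet are importable from this project.
if __name__ == '__main__':
    sentic_sa = SenticNetSA()
    sentic_sa.baseline()  # prints the f1-score of the SenticNet polarity baseline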
class Wordk2VecSA:

    def __init__(self):
        print("Start Word2Vec - Sentiment Analysis")
        self.sp = Support()
        file_model = ROOT_DIR_DATA_EMBEDDING + "SBW-vectors-300-min5.txt"
        model_tmp = self.sp.load_vectors_from_csv(file_model)
        self.model = Word2Vec.load(model_tmp)
        self.data, self.label = self.sp.process_data(filename='bank_message', size_msg=3,
                                                     clean=True, replace_text=True,
                                                     stemmed=None, lemmatize=None,
                                                     spelling=None)

    def baseline(self, num_features=300, fold=5, iteration=3):
        # wor2vec_model maps a model name to a scikit-learn classifier and is defined at module level.
        for model_type, classifier in wor2vec_model.items():
            sum_recall = 0.0
            sum_precision = 0.0
            sum_f1 = 0.0
            sum_accuracy = 0.0
            for i in range(0, iteration):
                x_train, x_test, y_train, y_test = train_test_split(self.data, self.label,
                                                                    test_size=0.25,
                                                                    random_state=1000)
                x_train, y_train = self.sp.balanced_data(x_train, y_train)
                x_test, y_test = self.sp.balanced_data(x_test, y_test)
                trainDataVecs = self.sp.getAvgFeatureVecs(x_train, self.model, num_features)
                testDataVecs = self.sp.getAvgFeatureVecs(x_test, self.model, num_features)
                classifier.fit(trainDataVecs, y_train)
                predict = classifier.predict(testDataVecs)

                # Recall Scores
                recall_scores = cross_val_score(classifier, trainDataVecs, y_train,
                                                cv=fold, scoring='recall_macro')
                sum_recall += recall_scores
                # Precision Score
                precision_score = cross_val_score(classifier, trainDataVecs, y_train,
                                                  cv=fold, scoring='precision_weighted')
                sum_precision += precision_score
                # F1 Score
                f1_score = cross_val_score(classifier, trainDataVecs, y_train,
                                           cv=fold, scoring='f1_weighted')
                sum_f1 += f1_score
                # Accuracy Score
                accuracy_score = cross_val_score(classifier, trainDataVecs, y_train,
                                                 cv=fold, scoring='balanced_accuracy')
                sum_accuracy += accuracy_score

            recall = sum_recall / iteration
            precision = sum_precision / iteration
            f1 = sum_f1 / iteration
            accuracy = sum_accuracy / iteration
            self.sp.print_score(model_type, predicted_classes=predict, recall=recall,
                                precision=precision, f1=f1, accuracy=accuracy, test=y_test)
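# Sketch of the averaging helper assumed above (Support.getAvgFeatureVecs is
# defined elsewhere in the project; this is an illustrative reimplementation
# against the gensim 3.x API): each message becomes the mean of the Word2Vec
# vectors of its in-vocabulary tokens.
import numpy as np

def get_avg_feature_vecs(messages, model, num_features):
    vocab = set(model.wv.index2word)  # gensim 3.x vocabulary listing
    feature_vecs = np.zeros((len(messages), num_features), dtype="float32")
    for i, msg in enumerate(messages):
        words = [w for w in msg.split() if w in vocab]
        if words:
            feature_vecs[i] = np.mean([model.wv[w] for w in words], axis=0)
    return feature_vecs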
def baseline(self, model_name, list_classifier, analyzer='word', ngram=None, min_df=3,
             max_features=None, fold=5, stop_words=None, iteration=3):
    try:
        start_time = time.time()
        result = []
        ngram = (1, 2) if ngram is None else ngram
        stopw = set(stopwords.words("spanish")) if stop_words is not None else None
        models = ClassifierModels(list_classifier).classifiers
        for classifier_name, classifier in models.items():
            dict_data = {}
            x_train, x_test, y_train, y_test = train_test_split(self.data['msg'],
                                                                self.data['label'],
                                                                test_size=0.25,
                                                                random_state=1000)
            # training and sa_test instances are balanced
            df_train = pd.DataFrame({'msg': x_train, 'label': y_train})
            df_test = pd.DataFrame({'msg': x_test, 'label': y_test})
            train = Support.balanced_data(df_train)
            x_train = train['msg']
            y_train = train['label']
            test = Support.balanced_data(df_test)
            x_test = test['msg']
            y_test = test['label']

            vectorizer = CountVectorizer(analyzer=analyzer, lowercase=True, encoding='utf-8',
                                         min_df=min_df, ngram_range=ngram,
                                         max_features=max_features, stop_words=stopw)
            vectorizer.fit(x_train)
            x_train = vectorizer.transform(x_train)
            x_test = vectorizer.transform(x_test)
            classifier.fit(x_train, y_train)
            predict = classifier.predict(x_test)

            sum_recall = 0.0
            sum_precision = 0.0
            sum_f1 = 0.0
            sum_accuracy = 0.0
            for i in range(0, iteration):
                # Recall Scores
                recall_scores = cross_val_score(classifier, x_train, y_train, cv=fold,
                                                scoring='recall_macro')
                sum_recall += recall_scores
                # Precision Score
                precision_score = cross_val_score(classifier, x_train, y_train, cv=fold,
                                                  scoring='precision_weighted')
                sum_precision += precision_score
                # F1 Score
                f1_score = cross_val_score(classifier, x_train, y_train, cv=fold,
                                           scoring='f1_weighted')
                sum_f1 += f1_score
                # Accuracy Score
                accuracy_score = cross_val_score(classifier, x_train, y_train, cv=fold,
                                                 scoring='balanced_accuracy')
                sum_accuracy += accuracy_score

            # Calculated Scores
            dict_data['classifier_name'] = classifier_name
            recall = sum_recall / iteration
            dict_data['recall'] = round(np.mean(recall) * 100, 2)
            precision = sum_precision / iteration
            dict_data['precision'] = round(np.mean(precision) * 100, 2)
            f1 = sum_f1 / iteration
            dict_data['f1'] = round(np.mean(f1) * 100, 2)
            accuracy = sum_accuracy / iteration
            dict_data['accuracy'] = round(np.mean(accuracy) * 100, 2)

            # Calculated Time processing
            t_sec = round(time.time() - start_time)
            (t_min, t_sec) = divmod(t_sec, 60)
            (t_hour, t_min) = divmod(t_min, 60)
            time_processing = '{} hour:{} min:{} sec'.format(t_hour, t_min, t_sec)
            dict_data['time_processing'] = time_processing
            Support.print_score(dict_data=dict_data)
            dict_data['model_name'] = model_name
            result.append(dict_data)
        return result
    except Exception as e:
        print("\t ERROR baseline BoW: ", e)
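# Hypothetical call of the BoW baseline (the enclosing class name and the
# classifier identifiers are illustrative; accepted names depend on
# ClassifierModels):
# bow = BagOfWordsSA()
# results = bow.baseline(model_name='BoW_N@2_df@3_fold@5',
#                        list_classifier=['LogisticRegression', 'LinearSVC'],
#                        ngram=(1, 2), min_df=3, fold=5, stop_words=True, iteration=3)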
def __init__(self):
    self.sp = Support()
    self.lexicon = self.sp.load_file_polarity()
    self.cores = multiprocessing.cpu_count()
    self.data = Support.process_data(filename='bank_message', size_msg=3, clean_data=True,
                                     replace_text=True, stemmed=None, lemmatize=None,
                                     spelling=None)
def baseline(self, model_name, list_classifier, ngram=None, min_df=3, max_features=5000,
             fold=5, stop_words=None, iteration=3):
    try:
        start_time = time.time()
        result = []
        ngram = (1, 2) if ngram is None else ngram
        stopw = set(stopwords.words("spanish")) if stop_words is not None else None
        models = ClassifierModels(list_classifier).classifiers
        for classifier_name, classifier in models.items():
            dict_data = {}
            x_train, x_test, y_train, y_test = train_test_split(self.data['msg'],
                                                                self.data['label'],
                                                                test_size=0.25,
                                                                random_state=1000)
            # training and sa_test instances are balanced
            df_train = pd.DataFrame({'msg': x_train, 'label': y_train})
            df_test = pd.DataFrame({'msg': x_test, 'label': y_test})
            train = self.sp.balanced_data(df_train)
            x_train = train['msg']
            y_train = train['label']
            test = self.sp.balanced_data(df_test)
            x_test = test['msg']
            y_test = test['label']

            vec_bow = CountVectorizer(analyzer='word', lowercase=True, encoding='utf-8',
                                      ngram_range=ngram, min_df=min_df,
                                      max_features=max_features, stop_words=stopw)
            vec_tfidf = TfidfVectorizer(analyzer='word', lowercase=True, encoding='utf-8',
                                        ngram_range=ngram, min_df=min_df,
                                        max_features=max_features, stop_words=stopw)
            lexicon = LexiconExtractor()
            terminology = TerminologyExtractor()
            vec_bow.fit(x_train)
            vec_tfidf.fit(x_train)
            lexicon.fit(x_train)
            terminology.fit(x_train)

            pipeline = Pipeline([
                ('feats', FeatureUnion(
                    [('bow', vec_bow), ('tfidf', vec_tfidf),
                     ('lexicon', lexicon), ('terminology', terminology)],
                    # weight components in FeatureUnion
                    transformer_weights={'bow': 0.2, 'tfidf': 0.2,
                                         'lexicon': 0.3, 'terminology': 0.3})),
                (classifier_name, classifier)  # classifier
            ])
            pipeline.fit(x_train, y_train)
            predict = pipeline.predict(x_test)

            sum_recall = 0.0
            sum_precision = 0.0
            sum_f1 = 0.0
            sum_accuracy = 0.0
            for i in range(0, iteration):
                print('Iteration {0} for Cross-Validation'.format(i + 1))
                # Recall
                recall_scores = cross_val_score(pipeline, x_train, y_train, cv=fold,
                                                scoring='recall_macro', n_jobs=-1)
                sum_recall += recall_scores
                # Precision
                precision_score = cross_val_score(pipeline, x_train, y_train, cv=fold,
                                                  scoring='precision_weighted', n_jobs=-1)
                sum_precision += precision_score
                # F1
                f1_score = cross_val_score(pipeline, x_train, y_train, cv=fold,
                                           scoring='f1_weighted', n_jobs=-1)
                sum_f1 += f1_score
                # Accuracy
                accuracy_score = cross_val_score(pipeline, x_train, y_train, cv=fold,
                                                 scoring='balanced_accuracy', n_jobs=-1)
                sum_accuracy += accuracy_score

            # Calculated Scores
            dict_data['classifier_name'] = classifier_name
            recall = sum_recall / iteration
            dict_data['recall'] = round(np.mean(recall) * 100, 2)
            precision = sum_precision / iteration
            dict_data['precision'] = round(np.mean(precision) * 100, 2)
            f1 = sum_f1 / iteration
            dict_data['f1'] = round(np.mean(f1) * 100, 2)
            accuracy = sum_accuracy / iteration
            dict_data['accuracy'] = round(np.mean(accuracy) * 100, 2)

            # Calculated Time processing
            t_sec = round(time.time() - start_time)
            (t_min, t_sec) = divmod(t_sec, 60)
            (t_hour, t_min) = divmod(t_min, 60)
            time_processing = '{} hour:{} min:{} sec'.format(t_hour, t_min, t_sec)
            dict_data['time_processing'] = time_processing
            Support.print_score(dict_data=dict_data)
            dict_data['model_name'] = model_name
            result.append(dict_data)
        return result
    except Exception as e:
        print("\t ERROR baseline DomainOfWord: ", e)
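# A minimal sketch of an sklearn-compatible extractor such as LexiconExtractor
# (the real class lives elsewhere in the project; lexicon loading and scoring
# here are illustrative assumptions). FeatureUnion only requires fit/transform
# returning a 2-D feature matrix.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class LexiconExtractorSketch(BaseEstimator, TransformerMixin):
    def __init__(self, polarity_lexicon=None):
        self.polarity_lexicon = polarity_lexicon or {}  # word -> polarity score

    def fit(self, X, y=None):
        return self  # nothing to learn; the lexicon is fixed

    def transform(self, X):
        # One feature per message: the summed polarity of lexicon hits.
        scores = [sum(self.polarity_lexicon.get(w, 0.0) for w in msg.split()) for msg in X]
        return np.array(scores).reshape(-1, 1)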
def __init__(self): sp = Support() self.terminology = sp.import_terminology(filename='bank_terminology')
# Experiment runner: collect the scores of each configuration and persist them.
# The opening of the preceding baseline(...) call is truncated in the source.
                              iteration=3)
result_models.extend(result_tfidf)

model_name_dow = 'DoW_N@' + str(ngram) + '_df@' + str(min_df) + '_fold@' + str(item['fold'])
model_name_dow += '_clean_data' if item['clean_data'] else '_raw_data'
model_name_dow += '_stop_words' if item['stop_words'] else '_without_stop_words'
print("#" * 10 + '| Model:' + model_name_dow + ' |' + "#" * 10)
result_dow = dow.baseline(model_name=model_name_dow,
                          list_classifier=list_classifier,
                          ngram=(1, ngram), min_df=min_df, max_features=5000,
                          stop_words=item['stop_words'], fold=item['fold'],
                          iteration=3)
result_models.extend(result_dow)

model_name_b2v = 'Bank2Vec' + '_fold@' + str(item['fold'])
model_name_b2v += '_clean_data' if item['clean_data'] else '_raw_data'
print("#" * 10 + '| Model: ' + model_name_b2v + ' |' + "#" * 10)
result_wv = wv.baseline(model_name=model_name_b2v,
                        list_classifier=list_classifier,
                        fold=item['fold'], iteration=3)
result_models.extend(result_wv)

Support.save_to_csv(result_models)
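# Assumed behaviour of Support.save_to_csv (a sketch, not the project's
# implementation; the output filename is an illustrative assumption): each
# result dict carries model_name, classifier_name, recall, precision, f1,
# accuracy and time_processing, so it maps to one CSV row.
import pandas as pd

def save_to_csv(result_models, path='sa_results.csv'):
    pd.DataFrame(result_models).to_csv(path, index=False, encoding='utf-8')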
def __init__(self):
    sp = Support()
    self.lexicon = sp.load_file_polarity()