Example #1
from FileLoader import *
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
from textblob import TextBlob
import warnings
import tkinter
from tkinter import *


warnings.filterwarnings("ignore")

file = FileLoader("reuters_headlines.csv")
data = file.read_file()
data.insert(data.shape[1], 'Sentiment', "Neutral")  # string default so the column holds labels rather than ints

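# Label each article Positive/Negative/Neutral from the sign of TextBlob's polarity score (-1.0 to 1.0).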
for i in range(len(data)):
    corpus = TextBlob(data['Headlines'][i] + ' ' + data['Description'][i])
    if corpus.sentiment.polarity > 0:
        data.loc[i, 'Sentiment'] = "Positive"  # .loc avoids chained assignment, which silently fails in newer pandas
    elif corpus.sentiment.polarity < 0:
        data.loc[i, 'Sentiment'] = "Negative"
    else:
        data.loc[i, 'Sentiment'] = "Neutral"

# print(data.info())
# print(data.Sentiment.value_counts())
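# Keep an untouched copy of the labelled data before any further transformation.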
data_copy = data.copy()
Example #2
import pandas as pd
import matplotlib.pyplot as plt
from FileLoader import *
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score

pd.set_option('display.width', 150)
pd.set_option('display.max_colwidth', 150)

training_data = FileLoader("Covid_train_data.csv")
df_training = training_data.read_file()

testing_data = FileLoader("Covid_test_data.csv")
df_testing = testing_data.read_file()

print(df_training.info())
print(df_testing.info())

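# Visualise the class balance of the training labels before fitting anything.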
df_training["Sentiment"].value_counts().plot(kind='bar')
plt.xlabel("Sentiment")
plt.ylabel("Counts")
plt.title("Proportion of sentiments")
plt.show()

def sentiment_extraction(df, column, label):
Example #3
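# The excerpt begins mid-function in a zero-padding helper; a plausible
# opening (helper name hypothetical, condition inferred from the else branch):
#
#     def pad_corpus(ndarray, list, max_corpus_length):
#         if (max_corpus_length - len(list)) % 2 == 0:
#
# numpy (np), FileLoader and the project-local Preprocessing class are
# assumed to be imported above the excerpt.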
        ndarray = np.pad(ndarray,
                         ((int((max_corpus_length - len(list)) / 2),
                           int((max_corpus_length - len(list)) / 2)), (0, 0)),
                         'constant',
                         constant_values=(0, 0))
    else:
        ndarray = np.pad(ndarray,
                         ((int((max_corpus_length - len(list)) / 2),
                           int((max_corpus_length - len(list)) / 2) + 1),
                          (0, 0)),
                         'constant',
                         constant_values=(0, 0))
    return ndarray


file = FileLoader("Womens Clothing E-Commerce Reviews.csv")
data = file.read_file()
print(data.info())

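# Drop identifier columns and rows without review text.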
data.drop(labels=['Clothing ID', 'Title'], axis=1, inplace=True)
data = data[~data['Review Text'].isnull()]

# ros = RandomOverSampler(random_state=0)
# data_resampled, label_resampled = ros.fit_resample(pd.DataFrame(data['Review Text']), data["Recommended IND"])
# duplicate = data[data["Recommended IND"].isin([0])]
# print(duplicate)
# data = pd.concat([data,duplicate,duplicate])
print(data)

preprocessed_data = Preprocessing(data)
preprocessed_data.error_cleaning("Review Text")
Example #4
File: Main.py Project: jbean21/FHN
import glob
import FileLoader
import OrderParameter
#import CorrelationPlots
#import ContourPlots
#import OrderVsDisorder

print("Starting Analysis")

FileLoader.OPinit()
progress_counter = 0
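# Process every run's CSV output from the data directory in sorted order.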
for i in sorted(
        glob.glob(
            "/media/jay/0FD90FF80FD90FF83/PROJECTDATA/2DPERIODIC_2/Data/*.csv")
):
    #Track progress of analysis
    print(progress_counter)
    #Select runs by the first letter of the filename; index 59 skips the directory path.
    if i[59] == "v":
        df, params = FileLoader.FileLoader(i)
        print(params)
        OrderParameter.OP(i, df, params[0], params[1], params[2], params[3],
                          params[4])

    progress_counter += 1
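# Read back the accumulated order-parameter results for the plotting steps below (currently commented out).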
op_csv = FileLoader.OPread()
#CorrelationPlots.Correlation(op_csv)
#ContourPlots.ContourPlots(op_csv)
#OrderVsDisorder.OrderVsDisorder(op_csv)