def main():
    """Build the per-video corpus and train the Naive Bayes hate-level model.

    Reads ``../output/0.json`` .. ``../output/750.json``, cleans the title,
    description, tags and comments of each video, then trains
    ``NaiveBayesClassifier`` on an 80/20 split and prints the held-out labels
    and the hate levels predicted for them.
    """
    print("Number of processors: ", mp.cpu_count())

    rows = []
    # The context manager releases the worker processes even when a file is
    # missing or malformed half-way through the run.
    with mp.Pool(mp.cpu_count()) as pool:
        for i in range(751):
            with open("../output/" + str(i) + ".json", encoding='utf-8') as f:
                data = json.load(f)
            video = data[0]
            sentiment = video['sentiment']

            meta_text = video["title"] + " " + video["description"]
            if 'tags' in video:
                meta_text += " " + ' '.join(video["tags"])
            otherMetaData = pr.process(meta_text)

            likes = int(video["likeCount"])
            dislikes = int(video["dislikeCount"])
            # A video without dislikes used to raise ZeroDivisionError;
            # clamping leaves every ratio with dislikes >= 1 unchanged.
            likeDislikeRatio = str(float(likes / max(dislikes, 1)))

            results = pool.map(preprocess, video["comments"])
            comment_texts = []
            for result in results:
                # preprocess may return the cleaned text itself or a tuple
                # whose first element is the text; indexing a plain string
                # with [0] would keep only its first character.
                text = result[0] if isinstance(result, (tuple, list)) else result
                if text:
                    comment_texts.append(text)
            # Join with a space so the last word of one comment does not fuse
            # with the first word of the next.
            commentsText = " ".join(comment_texts)

            rows.append(
                [commentsText, otherMetaData, likeDislikeRatio, sentiment])
            print(i)

    df = pd.DataFrame(
        rows,
        columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment'])
    df['sentiment_one_hot'] = df['sentiment'].apply(
        lambda x: 0 if x == 'N' else 1)
    df['data'] = df['comment'] + ' ' + df['otherMetadata']

    x_train, x_test, y_train, y_test = train_test_split(
        df['data'], df['sentiment_one_hot'], test_size=0.2)

    NBModel = NaiveBayesClassifier()
    NBModel.train(x_train, y_train, alpha=1)
    print(y_test)
    levelOfHate = NBModel.getHateLevel(x_test)
    print(levelOfHate)
def main():
    """Build the per-video corpus and evaluate the count-vector hate model.

    Reads ``../output/0.json`` .. ``../output/750.json``, cleans the metadata
    and comments of each video, fits a ``CountVectorizer`` on the combined
    text, trains with ``train`` on 80% of the videos and prints a
    classification report for the remaining 20%.
    """
    print("Number of processors: ", mp.cpu_count())

    rows = []
    # The context manager releases the workers even if loading or cleaning
    # raises, which the trailing pool.close() could not guarantee.
    with mp.Pool(mp.cpu_count()) as pool:
        for i in range(751):
            with open("../output/" + str(i) + ".json", encoding='utf-8') as f:
                data = json.load(f)
            video = data[0]
            sentiment = video['sentiment']

            meta_text = video["title"] + " " + video["description"]
            if 'tags' in video:
                meta_text += " " + ' '.join(video["tags"])
            otherMetaData = pr.process(meta_text)

            likes = int(video["likeCount"])
            dislikes = int(video["dislikeCount"])
            # Clamp so a video without dislikes does not raise
            # ZeroDivisionError; ratios with dislikes >= 1 are unchanged.
            likeDislikeRatio = str(float(likes / max(dislikes, 1)))

            results = pool.map(preprocess, video["comments"])
            comment_texts = []
            for result in results:
                # Accept both "text or None" and "(text, ...)" return shapes;
                # result[0] on a plain string would keep one character only.
                text = result[0] if isinstance(result, (tuple, list)) else result
                if text:
                    comment_texts.append(text)
            # A space separator keeps words of adjacent comments apart.
            commentsText = " ".join(comment_texts)

            rows.append(
                [commentsText, otherMetaData, likeDislikeRatio, sentiment])
            print(i)

    df = pd.DataFrame(
        rows,
        columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment'])
    df['sentiment_one_hot'] = df['sentiment'].apply(
        lambda x: 0 if x == 'N' else 1)
    df['data'] = df['comment'] + ' ' + df['otherMetadata']

    traindf, testdf = train_test_split(df, test_size=0.2)

    # NOTE(review): the vocabulary is fitted on all rows, including the test
    # split — confirm this is intended.
    countVec_comment = CountVectorizer()
    countVec_comment.fit(df['data'])

    negative_score_comment = train(traindf, countVec_comment)
    prediction = predict(testdf, negative_score_comment)
    print(classification_report(testdf['sentiment_one_hot'], prediction))
def preprocess(item):
    """Clean one comment, predict its sentiment and count domain words.

    Args:
        item: mapping with a ``"comment"`` entry.

    Returns:
        ``(processed_comment, sentiment, PCount, RECount, SGCount, wordCount)``
        where the three counts are the numbers of words found in the
        political, religious/ethnic and sex/gender vocabularies, or a tuple of
        six ``None`` when ``pr.process`` yields ``None``, ``""`` or the text
        ``"None"``.
    """
    processed_comment = pr.process(str(item["comment"]))
    if processed_comment is None or processed_comment in ("None", ""):
        return None, None, None, None, None, None

    transformed = comment_feature_mapper.transform([processed_comment])
    # predict() yields one label per input row; take it by index because
    # int() on an array with ndim > 0 is deprecated by NumPy.
    sentiment = int(comment_sentiModel.predict(transformed)[0])

    PCount = 0
    RECount = 0
    SGCount = 0
    words = processed_comment.split()
    for w in words:
        # Only the part before the first "/" is looked up
        # (presumably the rest is a tag added by pr.process — confirm).
        word = w.split('/', 1)[0]
        if word in political_vocabulary:
            PCount += 1
        elif word in relious_ethnic_vocabulary:
            RECount += 1
        elif word in sex_gender_vocabulary:
            SGCount += 1

    return processed_comment, sentiment, PCount, RECount, SGCount, len(words)
def preprocess(item):
    """Return the cleaned text of ``item["comment"]``, or ``None`` if unusable.

    The comment is stringified and passed through ``pr.process``; a result of
    ``None``, ``""`` or the literal text ``"None"`` is reported as ``None``.
    """
    cleaned = pr.process(str(item["comment"]))
    if cleaned is None or cleaned == "None" or cleaned == "":
        return None
    return cleaned
def preprocess(item):
    """Return ``pr.process(str(item))``, or ``None`` when nothing usable remains.

    A result of ``None`` or the literal text ``"None"`` is reported as
    ``None``; any other value (including an empty string) is returned as is.
    """
    text = pr.process(str(item))
    is_usable = text is not None and text != "None"
    return text if is_usable else None
def preprocess(item):
    """Clean one comment with ``pr.process`` and return the resulting text.

    ``item`` is converted with ``str`` before cleaning. ``None`` is returned
    when the cleaner yields ``None`` or the literal text ``"None"``.
    """
    cleaned = pr.process(str(item))
    if cleaned is None or cleaned == "None":
        return None
    return cleaned
def preprocess(item):
    """Clean one comment and predict its sentiment.

    Args:
        item: mapping with a ``"comment"`` entry.

    Returns:
        ``(processed_comment, sentiment)`` with ``sentiment`` as an ``int``
        label from ``comment_sentiModel``, or ``(None, None)`` when
        ``pr.process`` yields ``None`` or the literal text ``"None"``.
    """
    processed_comment = pr.process(str(item["comment"]))
    if processed_comment is None or processed_comment == "None":
        return None, None

    transformed = comment_feature_mapper.transform([processed_comment])
    # One input row gives one prediction; index it instead of calling int()
    # on the whole array, which NumPy deprecates for ndim > 0.
    sentiment = int(comment_sentiModel.predict(transformed)[0])
    return processed_comment, sentiment
def preprocess(item):
    """Clean one comment, mark negated words and predict its sentiment.

    The word immediately before each negation word is prefixed with ``not_``
    in the text handed to the sentiment model, together with the fractions of
    words found in the positive and negative vocabularies.

    Args:
        item: mapping with a ``"comment"`` entry.

    Returns:
        ``(processed_comment, sentiment)`` — the cleaned (un-negated) text and
        the ``int`` label from ``comment_sentiModel`` — or ``(None, None)``
        when the cleaned text is ``None``, ``"None"`` or blank.
    """
    processed_comment = pr.process(str(item["comment"]))
    if (processed_comment is None or processed_comment == "None"
            or processed_comment.strip() == ''):
        return None, None

    tokens = processed_comment.split()
    negated_tokens = list(tokens)
    positive_count = 0
    negative_count = 0
    for i, token in enumerate(tokens):
        # Vocabulary lookups use only the part before the first "/".
        word = token.split('/', 1)[0]
        if word in positive_vocabulary:
            positive_count += 1
        elif word in negative_vocabulary:
            negative_count += 1
        elif word in negation_words and i > 0:
            # Work on tokens rather than str.find(): find() hits the first
            # substring match (possibly inside another word), index -1 wrapped
            # to the last word for a leading negation, and rebuilding from the
            # untouched text dropped every negation but the last.
            negated_tokens[i - 1] = "not_" + tokens[i - 1]

    # Keep the text byte-identical when no negation was applied.
    if negated_tokens == tokens:
        comment_neg = processed_comment
    else:
        comment_neg = ' '.join(negated_tokens)

    word_count = len(tokens)  # >= 1: blank text was rejected above
    comment_df = pd.DataFrame({
        'preprocessed_text': [comment_neg],
        'positive_count': [positive_count / word_count],
        'Negative_count': [negative_count / word_count],
    })
    transformed = comment_feature_mapper.transform(comment_df)
    # Index the single prediction; int() on an ndim > 0 array is deprecated.
    sentiment = int(comment_sentiModel.predict(transformed)[0])
    return processed_comment, sentiment
def image_process():
    """Classify the video named by the ``url`` query parameter.

    Fetches the video's data and thumbnail, cleans its metadata and comments,
    then runs the hate-detection logistic regression and the domain network
    loaded from ``models/``.

    Returns:
        A JSON response with ``sentiment`` (``'Hate'`` / ``'Not Hate'``),
        ``hateLevel`` (rounded to two decimals), ``category`` and
        ``thumbnail_text``; a 400 JSON error when ``url`` is missing.
    """
    vid = request.args.get("url")
    if not vid:
        # Without an id every later step fails with an opaque TypeError.
        return jsonify(error="Missing required query parameter 'url'"), 400
    print(vid)

    # os.path.join keeps the lookup portable; the hard-coded "\\" separator
    # only resolved on Windows.
    # NOTE(review): 'utf-' looks like a typo for 'utf-8'; left as is — confirm.
    keys = pd.read_csv(os.path.join(os.getcwd(), "keys.csv"), encoding='utf-')
    noOfKeys = keys.shape[0]
    print(noOfKeys)
    # The selected API key is deliberately not printed: it is a credential.
    key = keys.iloc[randint(0, noOfKeys - 1), 0]

    vc = VideoData(vid, key)
    # Presumably writes temp/data.json, which is read below — confirm.
    vc.get_video_comments()
    urllib.request.urlretrieve(
        "https://img.youtube.com/vi/" + vid + "/hqdefault.jpg", "temp/img.jpg")
    thumbnail_text = imgprocess()

    with open("temp/data.json", encoding='utf-8') as f:
        data = json.load(f)
    video = data[0]

    meta_text = video["title"] + " " + video["description"]
    if 'tags' in video:
        meta_text += " " + ' '.join(video["tags"])
    otherMetaData = pr.process(meta_text + " " + thumbnail_text)

    likes = int(video["likeCount"])
    dislikes = int(video["dislikeCount"])
    # Clamp so a video without dislikes cannot raise ZeroDivisionError;
    # ratios with dislikes >= 1 are unchanged.
    likeDislikeRatio = str(float(likes / max(dislikes, 1)))

    print("Number of processors: ", mp.cpu_count())
    # The context manager frees the workers even if preprocess raises.
    with mp.Pool(6) as pool:
        results = pool.map(preprocess, video["comments"])

    comment_texts = []
    # Both counters start at 1, so the ratio below is always defined.
    postiveCount = 1
    negativeCount = 1
    pcount = 0
    recount = 0
    sgcount = 0
    wordcount = 0
    for result in results:
        if result[0] is None:
            # Comment rejected by preprocess: contributes nothing.
            continue
        comment_texts.append(result[0])
        if result[1] == 1:
            postiveCount += 1
        else:
            negativeCount += 1
        pcount += int(result[2] or 0)
        recount += int(result[3] or 0)
        sgcount += int(result[4] or 0)
        wordcount += int(result[5] or 0)
    # Same text as the original accumulation: each comment preceded by " ".
    commentsText = "".join(" " + text for text in comment_texts)

    def percentage(count):
        # No usable comment words means a 0% share, not a ZeroDivisionError.
        if wordcount == 0:
            return "0.0"
        return str(float(count * 100 / wordcount))

    posToNegCommentRatio = str(float(postiveCount / negativeCount))
    df = pd.DataFrame(
        [[commentsText, otherMetaData, likeDislikeRatio, posToNegCommentRatio,
          percentage(pcount), percentage(recount), percentage(sgcount)]],
        columns=[
            'comment', 'otherMetadata', 'likeDislikeRatio',
            'posToNegCommentRatio', 'pcount', 'recount', 'sgcount'
        ])

    keras.backend.clear_session()
    HateDetection_feature_mapper = joblib.load(
        "models/HD_featureMapper_LR.pkl")
    HateDetection_Model = joblib.load("models/HateDetection_LR.pkl")
    tranformed = HateDetection_feature_mapper.transform(df)
    hate = HateDetection_Model.predict(tranformed)
    print("Hate Detection :" + str(hate))

    # Logistic sigmoid of the decision value; the hate level is its
    # complement.
    clf = HateDetection_Model
    z = np.dot(clf.coef_, tranformed.T) + clf.intercept_
    hypo = 1 / (1 + np.exp(-z))
    # hypo is a one-element array; extract the scalar explicitly because
    # float() on an ndim > 0 array is deprecated by NumPy.
    hate_level = 1 - float(np.ravel(hypo)[0])
    print("Level of hate =" + str(hate_level))

    df['data'] = df['comment'] + " " + df['otherMetadata']
    keras.backend.clear_session()
    Domain_feature_mapper = joblib.load("models/Domain_feature_mapper.pkl")
    Domain_Classification_Model = tf.keras.models.load_model(
        'models/Domain_ann.pkl')
    Domaintranformed = Domain_feature_mapper.transform(df)
    # predict_classes() was removed from Keras; argmax over the predicted
    # class scores is the documented replacement for multi-class outputs.
    Domain = np.argmax(
        Domain_Classification_Model.predict(Domaintranformed), axis=-1)
    Domain_label_encoder = joblib.load("models/Domain_label_encoder.pkl")
    domain_label = Domain_label_encoder.inverse_transform(Domain)
    print("Domain :" + str(domain_label))

    domain_names = {
        'P': 'Political',
        'SG': 'Sex & Gender',
        'RE': 'Religious & Ethnic',
    }
    domain = domain_names.get(domain_label[0], 'Other')

    sentiment = 'Not Hate' if hate[0] == 1 else 'Hate'

    level_of_hate = float(round(Decimal(hate_level), 2))
    print(level_of_hate)
    return jsonify(sentiment=sentiment,
                   hateLevel=level_of_hate,
                   category=domain,
                   thumbnail_text=thumbnail_text)
def preprocess(item):
    """Clean one comment, predict its sentiment and count domain words.

    Each word is classified by the first vocabulary it matches (positive,
    negative, political, religious/ethnic, sex/gender, negation). The word
    immediately before a negation word is prefixed with ``not_`` in the text
    handed to the sentiment model.

    Args:
        item: mapping with a ``"comment"`` entry.

    Returns:
        ``(processed_comment, sentiment, PCount, RECount, SGCount, wordCount)``
        or a tuple of six ``None`` when the cleaned text is ``None``,
        ``"None"`` or blank.
    """
    processed_comment = pr.process(str(item["comment"]))
    # Blank (whitespace-only) text is rejected too: it has zero words and
    # would otherwise divide by zero below.
    if (processed_comment is None or processed_comment == "None"
            or not processed_comment.strip()):
        return None, None, None, None, None, None

    tokens = processed_comment.split()
    negated_tokens = list(tokens)
    PCount = 0
    RECount = 0
    SGCount = 0
    positive_count = 0
    negative_count = 0
    for i, token in enumerate(tokens):
        # Vocabulary lookups use only the part before the first "/".
        word = token.split('/', 1)[0]
        if word in positive_vocabulary:
            positive_count += 1
        elif word in negative_vocabulary:
            negative_count += 1
        elif word in political_vocabulary:
            PCount += 1
        elif word in relious_ethnic_vocabulary:
            RECount += 1
        elif word in sex_gender_vocabulary:
            SGCount += 1
        elif word in negation_words and i > 0:
            # Token-based marking: str.find() matched the first substring
            # occurrence, index -1 wrapped to the last word for a leading
            # negation, and only the last negation used to survive.
            negated_tokens[i - 1] = "not_" + tokens[i - 1]

    # Keep the text byte-identical when no negation was applied.
    if negated_tokens == tokens:
        comment_neg = processed_comment
    else:
        comment_neg = ' '.join(negated_tokens)

    wordCount = len(tokens)  # >= 1: blank text was rejected above
    comment_df = pd.DataFrame({
        'preprocessed_text': [comment_neg],
        'positive_count': [positive_count / wordCount],
        'Negative_count': [negative_count / wordCount],
    })
    transformed = comment_feature_mapper.transform(comment_df)
    # Index the single prediction; int() on an ndim > 0 array is deprecated.
    sentiment = int(comment_sentiModel.predict(transformed)[0])
    return processed_comment, sentiment, PCount, RECount, SGCount, wordCount
def main():
    """Train and save the logistic-regression hate-detection models.

    Reads ``../output/0.json`` .. ``../output/999.json`` plus the matching
    thumbnail text from ``df_thumbnail``, builds one feature row per video,
    fits a ``DataFrameMapper`` and two ``LogisticRegression`` models on all
    rows (one per label polarity) and dumps the three objects with joblib.
    """
    print("Number of processors: ", mp.cpu_count())

    rows = []
    # The context manager releases the workers even if a file fails to load.
    with mp.Pool(mp.cpu_count()) as pool:
        for i in range(1000):
            with open("../output/" + str(i) + ".json", encoding='utf-8') as f:
                data = json.load(f)
            video = data[0]
            sentiment = video['sentiment']
            thumbnail_text = df_thumbnail['Thumbnail'].iloc[i]

            meta_text = video["title"] + " " + video["description"]
            if 'tags' in video:
                meta_text += " " + ' '.join(video["tags"])
            otherMetaData = pr.process(meta_text + ' ' + thumbnail_text)

            likes = int(video["likeCount"])
            dislikes = int(video["dislikeCount"])
            # Clamp so a video without dislikes cannot raise
            # ZeroDivisionError; ratios with dislikes >= 1 are unchanged.
            likeDislikeRatio = str(float(likes / max(dislikes, 1)))

            results = pool.map(preprocess, video["comments"])
            comment_texts = []
            # Both counters start at 1, so the ratio is always defined.
            postiveCount = 1
            negativeCount = 1
            for result in results:
                if result[0] is None:
                    # Comment rejected by preprocess: contributes nothing.
                    continue
                comment_texts.append(result[0])
                if result[1] == 1:
                    postiveCount += 1
                else:
                    negativeCount += 1
            # Space-separated so adjacent comments do not fuse into one word.
            commentsText = " ".join(comment_texts)
            posToNegCommentRatio = str(float(postiveCount / negativeCount))

            rows.append([commentsText, otherMetaData, likeDislikeRatio,
                         posToNegCommentRatio, sentiment])
            print(str(i) + " : " + posToNegCommentRatio)

    df = pd.DataFrame(rows, columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio',
        'posToNegCommentRatio', 'sentiment'
    ])
    df['sentiment_one_hot'] = df['sentiment'].apply(
        lambda x: 0 if x == 'N' else 1)
    df['sentiment_one_level_hate'] = df['sentiment'].apply(
        lambda x: 1 if x == 'N' else 0)

    mapper = DataFrameMapper([
        (['posToNegCommentRatio'], StandardScaler()),
        ('otherMetadata',
         TfidfVectorizer(ngram_range=(1, 3), max_features=5000)),
        (['likeDislikeRatio'], StandardScaler()),
    ])
    mapper.fit(df)
    label = df['sentiment_one_hot']
    features = mapper.transform(df)

    print("logistic regression")
    clf = LogisticRegression()
    clf.fit(features, label)

    print("logistic regression")
    clf_level = LogisticRegression()
    clf_level.fit(features, df['sentiment_one_level_hate'])

    joblib.dump(clf, "HateDetection_LR.pkl")
    joblib.dump(clf_level, "Level_of Hate_LR.pkl")
    joblib.dump(mapper, "HD_featureMapper_LR.pkl")
def main1():
    """Run the hate and domain classifiers on one hard-coded video.

    Fetches the video's data and thumbnail, cleans its metadata and comments,
    then prints the hate-detection result, the sigmoid score and the
    predicted domain using the models stored under ``models/``.
    """
    # os.path.join keeps the lookup portable; the hard-coded "\\" separator
    # only resolved on Windows.
    # NOTE(review): 'utf-' looks like a typo for 'utf-8'; left as is — confirm.
    keys = pd.read_csv(os.path.join(os.getcwd(), "keys.csv"), encoding='utf-')
    url = "https://www.youtube.com/watch?v=kzNC5163qHk"
    noOfKeys = keys.shape[0]
    vid = parse_qs(urlparse(url).query)["v"][0]
    # Upper bound is noOfKeys - 1: with an inclusive randint (assumed to be
    # random.randint — confirm) noOfKeys itself indexes past the last row.
    # The selected API key is deliberately not printed: it is a credential.
    key = keys.iloc[randint(0, noOfKeys - 1), 0]
    print("_______________________")

    vc = VideoData(vid, key)
    # Presumably writes temp/data.json, which is read below — confirm.
    vc.get_video_comments()
    urllib.request.urlretrieve(
        "https://img.youtube.com/vi/" + vid + "/hqdefault.jpg", "temp/img.jpg")
    imgprocess()

    with open("temp/data.json", encoding='utf-8') as f:
        data = json.load(f)
    video = data[0]

    meta_text = video["title"] + " " + video["description"]
    if 'tags' in video:
        meta_text += " " + ' '.join(video["tags"])
    otherMetaData = pr.process(meta_text)

    likes = int(video["likeCount"])
    dislikes = int(video["dislikeCount"])
    # Clamp so a video without dislikes cannot raise ZeroDivisionError;
    # ratios with dislikes >= 1 are unchanged.
    likeDislikeRatio = str(float(likes / max(dislikes, 1)))

    print("Number of processors: ", mp.cpu_count())
    # The context manager frees the worker even if preprocess raises.
    with mp.Pool(1) as pool:
        results = pool.map(preprocess, video["comments"])

    comment_texts = []
    # Both counters start at 1, so the ratio below is always defined.
    postiveCount = 1
    negativeCount = 1
    pcount = 0
    recount = 0
    sgcount = 0
    wordcount = 0
    for result in results:
        if result[0] is None:
            # Comment rejected by preprocess: contributes nothing.
            continue
        comment_texts.append(result[0])
        if result[1] == 1:
            postiveCount += 1
        else:
            negativeCount += 1
        pcount += int(result[2] or 0)
        recount += int(result[3] or 0)
        sgcount += int(result[4] or 0)
        wordcount += int(result[5] or 0)
    # Same text as the original accumulation: each comment preceded by " ".
    commentsText = "".join(" " + text for text in comment_texts)

    def percentage(count):
        # No usable comment words means a 0% share, not a ZeroDivisionError.
        if wordcount == 0:
            return "0.0"
        return str(float(count * 100 / wordcount))

    posToNegCommentRatio = str(float(postiveCount / negativeCount))
    df = pd.DataFrame(
        [[commentsText, otherMetaData, likeDislikeRatio, posToNegCommentRatio,
          percentage(pcount), percentage(recount), percentage(sgcount)]],
        columns=[
            'comment', 'otherMetadata', 'likeDislikeRatio',
            'posToNegCommentRatio', 'pcount', 'recount', 'sgcount'
        ])
    print(df.loc[0])

    HateDetection_feature_mapper = joblib.load(
        "models/HD_featureMapper_LR.pkl")
    HateDetection_Model = joblib.load("models/HateDetection_LR.pkl")
    tranformed = HateDetection_feature_mapper.transform(df)
    hate = HateDetection_Model.predict(tranformed)
    print("Hate Detection :" + str(hate))

    # Logistic sigmoid of the model's decision value.
    clf = HateDetection_Model
    z = np.dot(clf.coef_, tranformed.T) + clf.intercept_
    hypo = 1 / (1 + np.exp(-z))
    print("Level of hate =" + str(hypo))

    df['data'] = df['comment'] + " " + df['otherMetadata']
    keras.backend.clear_session()
    Domain_feature_mapper = joblib.load("models/Domain_feature_mapper.pkl")
    Domain_Classification_Model = tf.keras.models.load_model(
        'models/Domain_ann.pkl')
    Domaintranformed = Domain_feature_mapper.transform(df)
    # predict_classes() was removed from Keras; argmax over the predicted
    # class scores is the documented replacement for multi-class outputs.
    domain_scores = Domain_Classification_Model.predict(Domaintranformed)
    Domain = np.argmax(domain_scores, axis=-1)
    print(domain_scores)
    Domain_label_encoder = joblib.load("models/Domain_label_encoder.pkl")
    domain_label = Domain_label_encoder.inverse_transform(Domain)
    print("Domain :" + str(domain_label))
    print("Done")
def main():
    """Evaluate a logistic-regression hate detector on the labelled videos.

    Reads ``../../output/0.json`` .. ``../../output/750.json``, builds one
    feature row per video, fits a ``DataFrameMapper`` on all rows, trains
    ``LogisticRegression`` on 80% of them (``random_state=0``) and prints a
    classification report for the remaining 20%.
    """
    print("Number of processors: ", mp.cpu_count())

    rows = []
    # The context manager releases the workers even if loading or cleaning
    # raises, which the trailing pool.close() could not guarantee.
    with mp.Pool(mp.cpu_count()) as pool:
        for i in range(751):
            path = "../../output/" + str(i) + ".json"
            with open(path, encoding='utf-8') as f:
                data = json.load(f)
            video = data[0]
            sentiment = video['sentiment']

            meta_text = video["title"] + " " + video["description"]
            if 'tags' in video:
                meta_text += " " + ' '.join(video["tags"])
            otherMetaData = pr.process(meta_text)

            likes = int(video["likeCount"])
            dislikes = int(video["dislikeCount"])
            # Clamp so a video without dislikes cannot raise
            # ZeroDivisionError; ratios with dislikes >= 1 are unchanged.
            likeDislikeRatio = str(float(likes / max(dislikes, 1)))

            results = pool.map(preprocess, video["comments"])
            comment_texts = []
            # Both counters start at 1, so the ratio is always defined.
            postiveCount = 1
            negativeCount = 1
            for result in results:
                if result[0] is None:
                    # Comment rejected by preprocess: contributes nothing.
                    continue
                comment_texts.append(result[0])
                if result[1] == 1:
                    postiveCount += 1
                else:
                    negativeCount += 1
            # Space-separated so adjacent comments do not fuse into one word.
            commentsText = " ".join(comment_texts)
            posToNegCommentRatio = str(float(postiveCount / negativeCount))

            rows.append([commentsText, otherMetaData, likeDislikeRatio,
                         posToNegCommentRatio, sentiment])
            print(str(i) + " : " + posToNegCommentRatio)

    df = pd.DataFrame(rows, columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio',
        'posToNegCommentRatio', 'sentiment'
    ])
    df['sentiment_one_hot'] = df['sentiment'].apply(
        lambda x: 0 if x == 'N' else 1)

    mapper = DataFrameMapper([
        (['posToNegCommentRatio'], StandardScaler()),
        ('otherMetadata',
         TfidfVectorizer(ngram_range=(1, 3), max_features=5000)),
        (['likeDislikeRatio'], StandardScaler()),
    ])
    mapper.fit(df)
    features = mapper.transform(df)
    label = df['sentiment_one_hot']

    x, x_test, y, y_test = train_test_split(features,
                                            label,
                                            test_size=0.2,
                                            train_size=0.8,
                                            random_state=0)
    clf = LogisticRegression()
    clf.fit(x, y)
    predicted = clf.predict(x_test)
    print(classification_report(y_test, predicted))
def main():
    """Train, evaluate and save the domain-classification network.

    Reads ``../../output/0.json`` .. ``../../output/999.json`` plus the
    matching thumbnail text from ``df_thumbnail``, builds TF-IDF and
    domain-word-share features, oversamples with SMOTE, trains the network
    from ``Build_Model_DNN_Text`` for 20 epochs, prints a classification
    report, plots the loss/accuracy curves and saves the model and the label
    encoder.
    """
    print("Number of processors: ", mp.cpu_count())

    rows = []
    # The context manager releases the workers even if a file fails to load.
    with mp.Pool(mp.cpu_count()) as pool:
        for i in range(1000):
            path = "../../output/" + str(i) + ".json"
            with open(path, encoding='utf-8') as f:
                data = json.load(f)
            video = data[0]
            sentiment = video['sentiment']
            category = video['category']
            print(category)
            thumbnail_text = df_thumbnail['Thumbnail'].iloc[i]

            meta_text = video["title"] + " " + video["description"]
            if 'tags' in video:
                meta_text += " " + ' '.join(video["tags"])
            otherMetaData = pr.process(meta_text + ' ' + thumbnail_text)

            likes = int(video["likeCount"])
            dislikes = int(video["dislikeCount"])
            # Clamp so a video without dislikes cannot raise
            # ZeroDivisionError; ratios with dislikes >= 1 are unchanged.
            likeDislikeRatio = str(float(likes / max(dislikes, 1)))

            results = pool.map(preprocess, video["comments"])
            comment_texts = []
            # Both counters start at 1, so the ratio is always defined.
            postiveCount = 1
            negativeCount = 1
            pcount = 0
            recount = 0
            sgcount = 0
            wordcount = 0
            for result in results:
                if result[0] is None:
                    # Comment rejected by preprocess: contributes nothing.
                    continue
                comment_texts.append(result[0])
                if result[1] == 1:
                    postiveCount += 1
                else:
                    negativeCount += 1
                pcount += int(result[2] or 0)
                recount += int(result[3] or 0)
                sgcount += int(result[4] or 0)
                wordcount += int(result[5] or 0)
            # Same text as the original accumulation: each comment preceded
            # by " ".
            commentsText = "".join(" " + text for text in comment_texts)

            # A video with no usable comment words gets 0% shares instead of
            # raising ZeroDivisionError.
            if wordcount:
                p_count = str(float(pcount * 100 / wordcount))
                re_count = str(float(recount * 100 / wordcount))
                sg_count = str(float(sgcount * 100 / wordcount))
            else:
                p_count = re_count = sg_count = "0.0"
            posToNegCommentRatio = str(float(postiveCount / negativeCount))

            rows.append([commentsText, otherMetaData, likeDislikeRatio,
                         posToNegCommentRatio, sentiment, category,
                         p_count, re_count, sg_count])
            print(str(i) + " : " + posToNegCommentRatio)

    df = pd.DataFrame(rows, columns=[
        'comment', 'otherMetadata', 'likeDislikeRatio',
        'posToNegCommentRatio', 'sentiment', 'category', 'pcount', 'recount',
        'sgcount'
    ])
    df['sentiment_one_hot'] = df['sentiment'].apply(
        lambda x: 0 if x == 'N' else 1)
    df['data'] = df['comment'] + " " + df['otherMetadata']

    mapper = DataFrameMapper([
        ('data', TfidfVectorizer(ngram_range=(1, 3), max_features=5000)),
        (['pcount'], StandardScaler()),
        (['recount'], StandardScaler()),
        (['sgcount'], StandardScaler()),
    ])
    mapper.fit(df)
    data = mapper.transform(df)

    # Synthetic Minority Over-sampling Technique.
    # NOTE(review): the positional argument and fit_sample() depend on the
    # installed imbalanced-learn version — confirm before upgrading.
    smote = SMOTE('minority')
    X_sm, Y_sm = smote.fit_sample(data, df['category'])

    encoder = LabelEncoder()
    encoder.fit(Y_sm)
    encoded_Y = encoder.transform(Y_sm)
    joblib.dump(encoder, "Domain_label_encoder.pkl")
    print(encoded_Y)
    dummy_y = np_utils.to_categorical(encoded_Y)
    print(dummy_y)

    X_train, X_test, y_train, y_test = train_test_split(X_sm,
                                                        dummy_y,
                                                        test_size=0.2)

    model_DNN = Build_Model_DNN_Text(X_train.shape[1], 4)
    history_dropout = model_DNN.fit(X_train,
                                    y_train,
                                    validation_data=(X_test, y_test),
                                    epochs=20,
                                    batch_size=32)
    predicted = model_DNN.predict(X_test)

    # Convert one-hot rows and predicted scores back to class indices.
    y_pred_vector = np.argmax(predicted, axis=1)
    y_test_vector = np.argmax(y_test, axis=1)
    print(classification_report(y_test_vector, y_pred_vector))

    loss = history_dropout.history['loss']
    val_loss = history_dropout.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'y', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # NOTE(review): the history key is 'acc' or 'accuracy' depending on how
    # the metric was named at compile time — confirm.
    acc = history_dropout.history['acc']
    val_acc = history_dropout.history['val_acc']
    plt.plot(epochs, acc, 'y', label='Training acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    model_DNN.save("Domain_ann.pkl")