def run_nlp(self, language):
    # Make sure the server is running properly (as explained in
    # https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK); might need root.
    # English:
    # java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    #     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment \
    #     -status_port 9000 -port 9000 -timeout 15000
    # The German implementation cannot do sentiment analysis, so its predictions do
    # not bear any relevance; keeping the code like this just makes it easier to
    # maybe add some sentiment analysis of the parsed German text in the future.
    # If the service times out, increasing the timeout helps. This usually happens
    # when a sentence is too long to be handled within the given period.
    self.__check_language(language)
    util.time_log("starting NLP...")
    annotator_dict = {"annotators": "sentiment"}
    classifier = CoreNLPParser("http://localhost:9000")
    ret_list = []
    for k_iter in range(0, self.k):
        prediction = []
        for review in self.test_data_text(language, k_iter):
            response_dict = classifier.api_call(
                review, properties=annotator_dict, timeout=500)
            count = 0
            sentiment = 0.0
            for sentence in response_dict["sentences"]:
                count += 1
                sentiment += float(sentence["sentimentValue"])
            avg_sentiment = sentiment / count
            # a lot better results with >= 2 (CoreNLP sentiment values run from
            # 0 = very negative to 4 = very positive, so 2 is neutral)
            prediction.append(1 if avg_sentiment >= 2 else 0)
        ret_list.append(prediction)
    return ret_list

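# Hedged usage sketch (not part of the original class): a minimal standalone
# illustration of the CoreNLP sentiment call used in run_nlp above, assuming a
# local server on port 9000; the review text is a made-up example.
def _demo_corenlp_sentiment():
    from nltk.parse.corenlp import CoreNLPParser
    parser = CoreNLPParser("http://localhost:9000")
    response = parser.api_call("Great product, works perfectly.",
                               properties={"annotators": "sentiment"},
                               timeout=500)
    for sentence in response["sentences"]:
        # each sentence carries a numeric score ("sentimentValue", 0-4) and a label
        print(sentence["sentimentValue"], sentence["sentiment"])
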
def __access_data_reviews(self, language):
    if self.current_language_type_accessed is None or self.current_language_type_accessed != language:
        util.time_log("vectorizing for " + language + " ...")
        self.current_language_type_accessed = language
        learn_test_tuple = self.file_reader.create_training_test_reviews(
            self.languages, language)
        self.k = len(learn_test_tuple)
        self.current_language_type_data = (
            [learn_test_tuple[i][0] for i in range(0, self.k)],
            [learn_test_tuple[i][1] for i in range(0, self.k)])
        self.__create_bow_matrix(language)
        # self.__vectorize_reviews_with_bow(language)
        util.time_log("done vectorizing...")

def run_textblob(self, language):
    self.__check_language(language)
    util.time_log("starting textblob...")
    ret_list = []
    for k_iter in range(0, self.k):
        if self.languages[language] == "english":
            ret_list.append([
                1 if TextBlob_EN(w).polarity > 0 else 0
                for w in self.test_data_text(language, k_iter)
            ])
        else:
            ret_list.append([
                1 if TextBlob_DE(w).polarity > 0 else 0
                for w in self.test_data_text(language, k_iter)
            ])
    return ret_list

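# Hedged note: TextBlob_EN and TextBlob_DE are assumed to alias the textblob and
# textblob-de packages (the actual imports live above this excerpt); a polarity
# greater than 0 is mapped to the positive class in run_textblob above.
def _demo_textblob_polarity():
    from textblob import TextBlob
    from textblob_de import TextBlobDE
    print(TextBlob("Great product!").polarity)     # > 0 -> predicted positive
    print(TextBlobDE("Tolles Produkt!").polarity)  # German counterpart
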
def run_polyglot(self, language):
    self.__check_language(language)
    util.time_log("starting polyglot...")
    lang_code = "en" if self.languages[language] == "english" else "de"
    ret_list = []
    for k_iter in range(0, self.k):
        if self.languages[language] == "english":
            # English: only a strictly positive polarity counts as positive
            ret_list.append([
                1 if Text(x, lang_code).polarity > 0 else 0
                for x in self.test_data_text(language, k_iter)
            ])
        else:
            # German: neutral polarity (0) is also mapped to the positive class
            ret_list.append([
                1 if Text(x, lang_code).polarity >= 0 else 0
                for x in self.test_data_text(language, k_iter)
            ])
    return ret_list

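# Hedged usage sketch: polyglot's Text takes the text plus a language hint and
# exposes a polarity property; the sentiment models are assumed to be downloaded
# beforehand (e.g. via "polyglot download sentiment2.en sentiment2.de").
def _demo_polyglot_polarity():
    from polyglot.text import Text
    print(Text("Great product, works perfectly.", "en").polarity)
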
def run_vader(self, language):
    self.__check_language(language)
    util.time_log("starting VADER...")
    ret_list = []
    for k_iter in range(0, self.k):
        if self.languages[language] == "english":
            classifier = Vader_EN()
            ret_list.append([
                1 if classifier.polarity_scores(w)["compound"] >= 0 else 0
                for w in self.test_data_text(language, k_iter)
            ])
        else:
            classifier = Vader_DE()
            ret_list.append([
                1 if classifier.polarity_scores(w)["compound"] >= 0 else 0
                for w in self.test_data_text(language, k_iter)
            ])
    return ret_list

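# Hedged usage sketch: the shape of VADER's polarity_scores output, which
# run_vader thresholds on. Vader_EN is assumed to alias vaderSentiment's
# SentimentIntensityAnalyzer; Vader_DE is assumed to expose the same interface.
def _demo_vader_scores():
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores("Great product, works perfectly.")
    # scores is a dict like {"neg": ..., "neu": ..., "pos": ..., "compound": ...};
    # compound lies in [-1, 1], and compound >= 0 is mapped to the positive class
    print(1 if scores["compound"] >= 0 else 0)
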
def run_naive_bayes(self, language):
    self.__check_language(language)
    util.time_log("starting nb...")
    ret_list = []
    self.load_data_reviews(language)
    for k_iter in range(0, self.k):
        util.time_log("learning...")
        classifier = NaiveBayesClassifier.train(
            self.training_data_text_vectorized_nb(language, k_iter))
        util.time_log("classifying")
        ret_list.append([
            classifier.classify(x)
            for x in self.test_data_text_vectorized_nb(language, k_iter)
        ])
    return ret_list

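# Hedged usage sketch: nltk's NaiveBayesClassifier trains on (featureset, label)
# pairs; training_data_text_vectorized_nb is assumed to yield data in this shape.
def _demo_naive_bayes():
    from nltk.classify import NaiveBayesClassifier
    train = [({"good": True}, 1), ({"bad": True}, 0)]  # hypothetical featuresets
    clf = NaiveBayesClassifier.train(train)
    print(clf.classify({"good": True}))  # -> 1
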
def run_svm(self, language):
    self.__check_language(language)
    util.time_log("starting svm...")
    ret_list = []
    self.load_data_reviews(language)
    for k_iter in range(0, self.k):
        classifier = svm.SVC(kernel="linear")
        util.time_log("learning...")
        vectorized = self.training_data_text_vectorized_bow(language, k_iter)
        classifier.fit(vectorized, self.training_data_rating(language, k_iter))
        util.time_log("classifying")
        ret_list.append(
            classifier.predict(
                self.test_data_text_vectorized_bow(language, k_iter)))
        # print(language + "," + str(k_iter) + ": " + str(self.bow_size(language, k_iter)))
    return ret_list

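# Hedged usage sketch: the scikit-learn pattern run_svm follows, with
# CountVectorizer standing in for the project's own bag-of-words matrix
# (built in __create_bow_matrix). Texts and labels are made-up examples.
def _demo_linear_svm():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn import svm as sk_svm
    train_texts = ["good product", "bad quality"]
    train_labels = [1, 0]
    vectorizer = CountVectorizer()
    x_train = vectorizer.fit_transform(train_texts)
    clf = sk_svm.SVC(kernel="linear")
    clf.fit(x_train, train_labels)
    print(clf.predict(vectorizer.transform(["really good quality"])))
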
import util
from statistics import median
from randomizeFromFiles import K_fold_data_picker_mixed, K_fold_style_data_picker_specific_tests_on_base, \
    K_fold_parallel_file_reader
from standard_run import RunData


def handle_f1_prints(f1_list):
    accuracy_list = [x.accuracy for x in f1_list]
    print("accuracy-average: " + str(sum(accuracy_list) / len(accuracy_list)))
    print("accuracy-median: " + str(median(accuracy_list)))
    print("accuracy-min: " + str(min(accuracy_list)))
    print("accuracy-max: " + str(max(accuracy_list)))


if __name__ == '__main__':
    util.time_log("starting...")
    filenames_originally_english = [
        "../reviews/originally_english/1_star_reviews_orig_english.txt",
        "../reviews/originally_english/2_star_reviews_orig_english.txt",
        "../reviews/originally_english/4_star_reviews_orig_english.txt",
        "../reviews/originally_english/5_star_reviews_orig_english.txt"
    ]
    filenames_english_uncorrected = [
        "../reviews/original/1-star_translated_mapped.txt",
        "../reviews/original/2-star_translated_mapped.txt",
        "../reviews/original/4-star_translated_mapped.txt",
        "../reviews/original/5-star_translated_mapped.txt"
    ]
    """
    2k, 4k and 6k as the relevant base learning data, in this case from the
    originally English set of data.