def getScore(self, text, typeList=None):
    """Predict Big Five trait scores for a piece of text.

    Features are extracted from ``text`` once, then one pickled predictor
    per requested trait is loaded from the ``model`` directory located next
    to this file and applied to those features.

    Args:
        text: a string of text used to predict the big5 traits.
        typeList: iterable of trait codes to score. When omitted, the five
            codes ['ope', 'con', 'ext', 'agr', 'neu'] are used. Codes that
            are not in ``fullList`` are skipped silently.

    Returns:
        An OrderedDict mapping each accepted trait code to the first element
        of that predictor's output, in the order the codes were requested,
        e.g. {"ope": 4.2, "con": 3.09, "ext": 3.69, "agr": 3.08, "neu": 2.31}.
        Note that this is a mapping, not a serialized JSON string.
    """
    import sys

    # A None sentinel avoids sharing one mutable default list across calls.
    if typeList is None:
        typeList = ['ope', 'con', 'ext', 'agr', 'neu']

    module_dir = os.path.dirname(__file__)
    # HACK: make this work from any directory. Register the directory only
    # once; inserting unconditionally grew sys.path by one entry per call.
    if module_dir not in sys.path:
        sys.path.insert(1, module_dir)
    model_dir = os.path.join(module_dir, "model")

    data = OrderedDict()
    X = np.array(TextProcessing().extractFeature(text))
    for t in typeList:
        if t not in fullList:
            continue
        # NOTE(review): joblib.load unpickles the file, so the model
        # directory must only ever contain trusted files.
        model = joblib.load(os.path.join(model_dir, "Predictor_" + t + ".pkl"))
        y_pred = model.test(X)
        data[t] = y_pred[0]
    return data
def getScore(self, text, typeList=None):
    """Predict Big Five trait scores for a piece of text.

    Features are extracted from ``text`` once, then one pickled predictor
    per requested trait is loaded from the ``model`` directory located next
    to this file and applied to those features.

    Args:
        text: a string of text used to predict the big5 traits.
        typeList: iterable of trait codes to score. When omitted, the five
            codes ['ope', 'con', 'ext', 'agr', 'neu'] are used. Codes that
            are not in ``fullList`` are skipped silently.

    Returns:
        An OrderedDict mapping each accepted trait code to the first element
        of that predictor's output, in the order the codes were requested,
        e.g. {"ope": 4.2, "con": 3.09, "ext": 3.69, "agr": 3.08, "neu": 2.31}.
        Note that this is a mapping, not a serialized JSON string.
    """
    import sys

    # A None sentinel avoids sharing one mutable default list across calls.
    if typeList is None:
        typeList = ['ope', 'con', 'ext', 'agr', 'neu']

    module_dir = os.path.dirname(__file__)
    # HACK: make this work from any directory. Register the directory only
    # once; inserting unconditionally grew sys.path by one entry per call.
    if module_dir not in sys.path:
        sys.path.insert(1, module_dir)
    model_dir = os.path.join(module_dir, "model")

    data = OrderedDict()
    X = np.array(TextProcessing().extractFeature(text))
    for t in typeList:
        if t not in fullList:
            continue
        # NOTE(review): joblib.load unpickles the file, so the model
        # directory must only ever contain trusted files.
        model = joblib.load(os.path.join(model_dir, "Predictor_" + t + ".pkl"))
        y_pred = model.test(X)
        data[t] = y_pred[0]
    return data
def evolve(self):
    """Run the generational search and return the best decoding found.

    A population of ``self.population_size`` individuals is evolved for at
    most ``self.number_of_generations`` rounds. The loop stops early as soon
    as the individual at the head of the population already has the fitness
    reported by ``Population.find_max_fitness``.

    Returns:
        The result of ``decode_text`` called on the fittest individual with
        the encoded text as held by the TextProcessing wrapper.
    """
    global_source = TextProcessing(
        text="", text_file_address="Attachment/global_text.txt")
    encoded_source = TextProcessing(text=self.encoded_text)
    text = encoded_source.text
    global_text = global_source.clean_text()
    encoded_text = encoded_source.clean_text()

    generation = Population(self.population_size, True)
    max_fitness = generation.find_max_fitness(encoded_text, global_text)

    def fitness_of(individual):
        return individual.fitness

    for _ in range(self.number_of_generations):
        # Stop once the head of the population already reached the target.
        if generation.get_individuals()[0].fitness == max_fitness:
            break

        for chromosome in generation.get_individuals():
            chromosome.calculate_fitness(global_text, encoded_text)
        generation.get_individuals().sort(key=fitness_of, reverse=True)

        # The fittest 10% are carried over unchanged.
        elite_count = int(self.population_size * 0.1)
        next_generation = generation.get_individuals()[:elite_count]

        crossover_attempts = int(self.population_size * 0.7)
        for _ in range(crossover_attempts):
            if random.uniform(0, 1) < self.cross_chance:
                # Redraw until the two parents differ. The pool is the head
                # of the list being built, so earlier children can be drawn.
                while True:
                    pool = next_generation[:int(self.population_size * 0.8)]
                    parent1, parent2 = random.choices(pool, k=2)
                    if parent1 != parent2:
                        break
                child1, child2 = self.__crossover(parent1, parent2)
                child1 = self.mutation(child1)
                child2 = self.mutation(child2)
                next_generation.append(child1)
                next_generation.append(child2)

        generation.save_individuals(next_generation)

    # Final scoring pass before picking the winner.
    for chromosome in generation.get_individuals():
        chromosome.calculate_fitness(global_source.get_text(),
                                     encoded_source.get_text())
    generation.get_individuals().sort(key=fitness_of, reverse=True)
    best = generation.get_individuals()[0]
    return best.decode_text(text)
def _evaluate_classifier(header, clf, fe, tp, training_features,
                         training_labels, test_features, test_labels):
    """Train ``clf``, print its test-set reports and classify a sample comment.

    Args:
        header: line printed before the reports (e.g. "SVM results:").
        clf: classifier exposing ``train(features, labels)`` and
            ``predict(features)``.
        fe: fitted feature extractor, used to vectorize the sample comment.
        tp: TextProcessing instance, used to preprocess the sample comment.
        training_features: extracted features of the training comments.
        training_labels: labels matching ``training_features``.
        test_features: extracted features of the held-out comments.
        test_labels: labels matching ``test_features``.
    """
    clf.train(training_features, training_labels)
    print(header)
    # Predict once and reuse the result for both reports.
    predicted = clf.predict(test_features)
    print(classification_report(test_labels, predicted))
    print(confusion_matrix(test_labels, predicted))
    comment = tp.process_comment(
        "Like comment and http://www.google.com subscribe")
    print("Test comment: %s" % comment)
    print("Predicted label: %s" % clf.predict(fe.extract([comment])))


def main():
    """Train and evaluate the SVM and NN classifiers on the YouTube comment data.

    Reads the five comment CSV files, makes a 70/30 train/test split with a
    fixed random state, fits the feature extractor on the training
    comments, then trains each classifier and prints its classification
    report, confusion matrix and the label predicted for a sample comment.
    Finally prints the size of the vectorizer vocabulary.
    """
    tp = TextProcessing()
    tp.read_data([
        "data/Youtube01-Psy.csv",
        "data/Youtube02-KatyPerry.csv",
        "data/Youtube03-LMFAO.csv",
        "data/Youtube04-Eminem.csv",
        "data/Youtube05-Shakira.csv",
    ])
    print("Number of comments: %d" % len(tp.comments))

    training_data, test_data, training_labels, test_labels = train_test_split(
        tp.comments, tp.labels, train_size=0.7, random_state=7)

    fe = FeatureExtraction()
    fe.fit(training_data)
    # Extract each feature matrix once instead of once per report.
    training_features = fe.extract(training_data)
    test_features = fe.extract(test_data)

    _evaluate_classifier("SVM results:", SVMClassifier(), fe, tp,
                         training_features, training_labels,
                         test_features, test_labels)
    _evaluate_classifier("\nNN results:", NNClassifier(), fe, tp,
                         training_features, training_labels,
                         test_features, test_labels)

    print("Number of features: %d" % len(fe.vectorizer.vocabulary_))
from TextProcessing import TextProcessing client = TextProcessing("xslgc26CVzxMaUUF1QhmQASVgjGdY7Tj") def get_sentiment(text): text = text.encode('utf-8').strip() response = client.sentiment(text, language='english') return vars(response)['body'] def mean(l): return float(sum(l)) / len(l) if len(l) > 0 else float('nan') def get_mean_sentiment(tweets): pos = [] neg = [] neutral = [] for tweet in tweets: pos.append(float(tweet.pos)) neg.append(float(tweet.neg)) neutral.append(float(tweet.neutral)) pos = mean(pos) neg = mean(neg) neutral = mean(neutral) if neutral > 0.5: label = 'neutral' prob = neutral else:
'../data/ynet_cars_500_talkbacks.json', '../data/ynet_dating_500_talkbacks.json', '../data/ynet_digital_500_talkbacks.json', '../data/ynet_economy_500_talkbacks.json', '../data/ynet_education_500_talkbacks.json', '../data/ynet_health_500_talkbacks.json', '../data/ynet_national_500_talkbacks.json', '../data/ynet_parents_500_talkbacks.json', '../data/ynet_politics_500_talkbacks.json', ] for file in files: print "file", file for line in open(file, 'rb'): line_dict = json.loads(line) processed_list = TextProcessing( line_dict['title_text']).cleanTextHebrew(exclude_punct=False) data.append(processed_list) word2idf = json.loads(open(TFIDF_FILE, 'rb').read()) vocabulary = word2idf.keys() file = open(OUTPUT_FILE, 'wb') for d_list in data: for d in d_list: if d not in ILLEGAL_PUNCT: if d not in LEGAL_PUNCT and d not in vocabulary: file.write(UNKNOWN) else: file.write(d.encode('utf8')) file.write(' ') file.write('\n')
from DataExtractor import DataExtractor
from TextProcessing import TextProcessing
from TextClassifier import TextClassifier
import pandas as pd

# Load the labelled training set and the unlabelled test set.
train_extractor = DataExtractor('data.csv')
train, target = train_extractor.data_producer('train')
test_extractor = DataExtractor('test.csv')
test = test_extractor.data_producer('test')

# Hold out part of the training data for validation.
text_classifier = TextClassifier()
train_split, validation_split = text_classifier.split_validation_data(
    train, target)
x_train, y_train = train_split
x_validation, y_validation = validation_split

# Every text collection goes through the same cleaning step.
cleaned_x_train = TextProcessing(x_train).clean_text()
cleaned_x_validation = TextProcessing(x_validation).clean_text()
cleaned_x_test = TextProcessing(test).clean_text()

# Fit on the training split, then report on the validation split.
text_classifier.fit(cleaned_x_train, y_train)
text_classifier.evaluate(cleaned_x_validation, y_validation)
text_classifier.confusion(y_validation, cleaned_x_validation)

# Predict the test set and write one 'category' column keyed by 'index'.
result = text_classifier.predict(cleaned_x_test)
output_frame = pd.DataFrame(result, columns=['category'])
output_frame.to_csv('output.csv', index=True, index_label='index')
def getFreqDict(self, text):
    """Wrap ``text`` in a TextProcessing object and return its getFreqDict() result."""
    processor = TextProcessing(text)
    return processor.getFreqDict()
def isEnglish(self, s):
    """Check if the input text ``s`` is in English.

    Returns True when TextProcessing.languageDetection reports 'en' for
    ``s``, and False for any other result.
    """
    detected = TextProcessing().languageDetection(s)
    return detected == 'en'