Пример #1
0
    def getScore(self, text, typeList=['ope','con','ext','agr','neu']):
        """Predict big5 trait scores for a piece of text.

        Args:
            text: a string of text used to predict the big5 traits.
            typeList: the big5 trait codes whose scores should be returned.
                Codes that are not in ``fullList`` are skipped silently.
                The default list is only iterated, never mutated.
        Returns:
            An OrderedDict mapping each accepted trait code to the first
            element of that trait model's prediction, in ``typeList`` order,
            e.g. {"ope": 4.2, "con": 3.09, "ext": 3.69, "agr": 3.08, "neu": 2.31}.
            This is a mapping, not a serialized JSON string; callers that
            need JSON must dump it themselves.
        """
        import sys

        # Hack to make this work from any directory (presumably so that the
        # pickled predictors can import their classes -- confirm). Only add
        # the entry once, otherwise sys.path grows on every call.
        module_dir = os.path.dirname(__file__)
        if module_dir not in sys.path:
            sys.path.insert(1, module_dir)
        model_dir = os.path.join(module_dir, """model/""")

        data = OrderedDict()

        # The features are extracted once and shared by every trait model.
        tp = TextProcessing()
        features = np.array(tp.extractFeature(text))

        for t in typeList:
            if t not in fullList:
                continue
            model_file = os.path.join(model_dir,
                                      """Predictor_""" + t + """.pkl""")
            # NOTE: joblib.load unpickles the file, which can execute
            # arbitrary code; the model directory must contain trusted files.
            model = joblib.load(model_file)
            y_pred = model.test(features)
            data[t] = y_pred[0]

        return data
Пример #2
0
    def getScore(self, text, typeList=['ope', 'con', 'ext', 'agr', 'neu']):
        """Return the predicted big5 trait scores for ``text``.

        Args:
            text: a string of text used to predict the big5 traits.
            typeList: big5 trait codes to score; one model file is loaded
                per code. Codes missing from ``fullList`` are ignored.
                The default list is read-only here.
        Returns:
            An OrderedDict keyed by trait code (in ``typeList`` order) whose
            values are element 0 of the corresponding model's prediction,
            e.g. {"ope": 4.2, "con": 3.09, "ext": 3.69, "agr": 3.08, "neu": 2.31}.
            The result is a mapping object, not a JSON string.
        """
        import sys

        # Hack to make this work from any directory. Guarded so repeated
        # calls do not keep inserting the same entry into sys.path.
        here = os.path.dirname(__file__)
        if here not in sys.path:
            sys.path.insert(1, here)
        model_path = os.path.join(here, """model/""")

        data = OrderedDict()

        tp = TextProcessing()
        X = np.array(tp.extractFeature(text))

        for t in typeList:
            if t not in fullList:
                continue
            model_name = """Predictor_""" + t + """.pkl"""
            # NOTE: joblib.load is pickle-based and may run arbitrary code
            # while loading; only trusted model files belong in model_path.
            model = joblib.load(os.path.join(model_path, model_name))
            data[t] = model.test(X)[0]

        return data
Пример #3
0
    def evolve(self):
        """Run the generational search and return the best decoding found.

        Builds one TextProcessing object for the reference text file
        ("Attachment/global_text.txt") and one for ``self.encoded_text``,
        creates a Population of ``self.population_size`` individuals, and
        iterates for at most ``self.number_of_generations`` generations.
        The loop stops early once the first individual's fitness equals the
        value returned by ``generation.find_max_fitness``.

        Returns:
            Whatever ``decode_text`` of the top-ranked individual returns when
            applied to the ``text`` attribute of the encoded TextProcessing
            object.
        """
        my_text_object_global = TextProcessing(
            text="", text_file_address="Attachment/global_text.txt")
        # my_text_object_encoded = TextProcessing(text="", text_file_address="Attachment/encoded_text.txt")
        my_text_object_encoded = TextProcessing(text=self.encoded_text)
        # Keep the `text` attribute as read here; it is what gets decoded at
        # the very end.
        text = my_text_object_encoded.text
        global_text = my_text_object_global.clean_text()
        encoded_text = my_text_object_encoded.clean_text()
        generation = Population(self.population_size, True)

        # NOTE(review): argument order here is (encoded, global) while
        # calculate_fitness below takes (global, encoded) -- confirm both
        # signatures.
        max_fitness = generation.find_max_fitness(encoded_text, global_text)
        for j in range(self.number_of_generations):
            # NOTE(review): on the first pass this reads `.fitness` before
            # this method has called calculate_fitness -- presumably it is
            # initialised by Population/chromosome construction; verify.
            if generation.get_individuals()[0].fitness != max_fitness:

                # print("Generation number: ", j)
                for chromosome in generation.get_individuals():
                    chromosome.calculate_fitness(global_text, encoded_text)

                # Rank best-first. This only affects later reads if
                # get_individuals() returns the population's own list rather
                # than a copy -- TODO confirm.
                generation.get_individuals().sort(key=lambda x: x.fitness,
                                                  reverse=True)

                # Seed the next generation with the first 10% of the ranking.
                new_generation = generation.get_individuals(
                )[:int(self.population_size * 0.1)]
                # Up to 70% of population_size attempts; each attempt breeds
                # two children with probability cross_chance.
                for i in range(int(self.population_size * 0.7)):
                    if random.uniform(0, 1) < self.cross_chance:
                        # Draw (with replacement) until the two parents
                        # compare unequal. The pool is a slice of
                        # new_generation itself, which grows as children are
                        # appended below.
                        # NOTE(review): random.choices raises IndexError on an
                        # empty pool, and this loop cannot exit if the pool
                        # holds fewer than two unequal individuals (e.g. a
                        # small population_size) -- confirm the expected
                        # minimum size.
                        while True:
                            parent1, parent2 = random.choices(
                                new_generation[:int(self.population_size *
                                                    0.8)],
                                k=2)
                            if parent1 != parent2:
                                break
                        child1, child2 = self.__crossover(parent1, parent2)
                        child1 = self.mutation(child1)
                        child2 = self.mutation(child2)
                        new_generation.append(child1)
                        new_generation.append(child2)

                generation.save_individuals(new_generation)
                # print("Report Best Fitness: ", generation.get_individuals()[0].fitness)
                # print("Population number is:", len(generation.get_individuals()))
            else:
                break
        # Final scoring pass before picking the winner.
        # NOTE(review): this uses get_text() on both objects instead of the
        # clean_text() results used inside the loop -- confirm that is
        # intended.
        for chromosome in generation.get_individuals():
            chromosome.calculate_fitness(my_text_object_global.get_text(),
                                         my_text_object_encoded.get_text())
        generation.get_individuals().sort(key=lambda x: x.fitness,
                                          reverse=True)

        return generation.get_individuals()[0].decode_text(text)
Пример #4
0
def _train_and_report(title, clf, fe, tp, train_features, training_labels,
                      test_features, test_labels):
    """Train ``clf`` and print its report, confusion matrix and a sample prediction."""
    clf.train(train_features, training_labels)

    print(title)
    # Predict once and reuse the result for both metrics.
    predicted = clf.predict(test_features)
    print(classification_report(test_labels, predicted))
    print(confusion_matrix(test_labels, predicted))

    comment = tp.process_comment(
        "Like comment and http://www.google.com subscribe")
    print("Test comment: %s" % comment)
    print("Predicted label: %s" % clf.predict(fe.extract([comment])))


def main():
    """Train and evaluate an SVM and an NN classifier on the YouTube comment CSVs.

    Reads the five CSV files, splits comments/labels 70/30 with a fixed
    random_state, fits the feature extractor on the training comments, then
    trains each classifier and prints its classification report, its
    confusion matrix and its prediction for one sample comment. Finally
    prints the size of the fitted vocabulary.
    """
    tp = TextProcessing()
    tp.read_data([
        "data/Youtube01-Psy.csv", "data/Youtube02-KatyPerry.csv",
        "data/Youtube03-LMFAO.csv", "data/Youtube04-Eminem.csv",
        "data/Youtube05-Shakira.csv"
    ])

    print("Number of comments: %d" % len(tp.comments))
    training_data, test_data, training_labels, test_labels = train_test_split(
        tp.comments, tp.labels, train_size=0.7, random_state=7)
    fe = FeatureExtraction()
    fe.fit(training_data)

    # Extract the features of each split once; both classifiers share them.
    train_features = fe.extract(training_data)
    test_features = fe.extract(test_data)

    _train_and_report("SVM results:", SVMClassifier(), fe, tp,
                      train_features, training_labels,
                      test_features, test_labels)
    _train_and_report("\nNN results:", NNClassifier(), fe, tp,
                      train_features, training_labels,
                      test_features, test_labels)

    print("Number of features: %d" % len(fe.vectorizer.vocabulary_))
Пример #5
0
from TextProcessing import TextProcessing

# Module-level client shared by get_sentiment() below.
# NOTE(review): the string argument looks like a hard-coded API credential;
# if so it should not live in source control -- confirm and move it to
# configuration / an environment variable.
client = TextProcessing("xslgc26CVzxMaUUF1QhmQASVgjGdY7Tj")


def get_sentiment(text):
    """Send ``text`` to the module-level client and return the response body.

    The text is UTF-8 encoded and stripped of surrounding whitespace before
    the request; the ``body`` attribute of the response object is returned.
    """
    payload = text.encode('utf-8').strip()
    return vars(client.sentiment(payload, language='english'))['body']


def mean(l):
    """Return the arithmetic mean of ``l`` as a float, or NaN when ``l`` is empty."""
    count = len(l)
    if count > 0:
        return float(sum(l)) / count
    return float('nan')


def get_mean_sentiment(tweets):
    pos = []
    neg = []
    neutral = []
    for tweet in tweets:
        pos.append(float(tweet.pos))
        neg.append(float(tweet.neg))
        neutral.append(float(tweet.neutral))
    pos = mean(pos)
    neg = mean(neg)
    neutral = mean(neutral)

    if neutral > 0.5:
        label = 'neutral'
        prob = neutral
    else:
Пример #6
0
    '../data/ynet_cars_500_talkbacks.json',
    '../data/ynet_dating_500_talkbacks.json',
    '../data/ynet_digital_500_talkbacks.json',
    '../data/ynet_economy_500_talkbacks.json',
    '../data/ynet_education_500_talkbacks.json',
    '../data/ynet_health_500_talkbacks.json',
    '../data/ynet_national_500_talkbacks.json',
    '../data/ynet_parents_500_talkbacks.json',
    '../data/ynet_politics_500_talkbacks.json',
]

for file in files:
    print "file", file
    for line in open(file, 'rb'):
        line_dict = json.loads(line)
        processed_list = TextProcessing(
            line_dict['title_text']).cleanTextHebrew(exclude_punct=False)
        data.append(processed_list)

word2idf = json.loads(open(TFIDF_FILE, 'rb').read())
vocabulary = word2idf.keys()

file = open(OUTPUT_FILE, 'wb')
for d_list in data:
    for d in d_list:
        if d not in ILLEGAL_PUNCT:
            if d not in LEGAL_PUNCT and d not in vocabulary:
                file.write(UNKNOWN)
            else:
                file.write(d.encode('utf8'))
            file.write(' ')
    file.write('\n')
Пример #7
0
from DataExtractor import DataExtractor
from TextProcessing import TextProcessing
from TextClassifier import TextClassifier
import pandas as pd

# Load the training data with its targets, and the test data.
train, target = DataExtractor('data.csv').data_producer('train')
test = DataExtractor('test.csv').data_producer('test')

text_classifier = TextClassifier()

# Split the training data into a training part and a validation part.
train_split, validation_split = text_classifier.split_validation_data(
    train, target)
x_train, y_train = train_split
x_validation, y_validation = validation_split

# Clean the three text collections in the same way: train, validation, test.
cleaned_x_train, cleaned_x_validation, cleaned_x_test = [
    TextProcessing(part).clean_text()
    for part in (x_train, x_validation, test)
]

# Fit on the training part, then report on the validation part.
text_classifier.fit(cleaned_x_train, y_train)
text_classifier.evaluate(cleaned_x_validation, y_validation)
text_classifier.confusion(y_validation, cleaned_x_validation)

# Predict the test data and save it as a CSV with an 'index' column.
result = text_classifier.predict(cleaned_x_test)
predictions = pd.DataFrame(result, columns=['category'])
predictions.to_csv('output.csv', index=True, index_label='index')
Пример #8
0
 def getFreqDict(self, text):
     '''delegate to TextProcessing(text).getFreqDict() and return its result '''
     processor = TextProcessing(text)
     return processor.getFreqDict()
Пример #9
0
 def isEnglish(self, s):
     '''return True when the language detected for s is 'en', else False '''
     detected = TextProcessing().languageDetection(s)
     return detected == 'en'
Пример #10
0
 def isEnglish(self, s):
     '''tell whether TextProcessing detects the text s as English ('en') '''
     processor = TextProcessing()
     if processor.languageDetection(s) != 'en':
         return False
     return True