def __init__(self, **kwargs):
        """Copy ``kwargs`` onto the instance and load the classifiers.

        Every name listed in ``required_kwargs`` must be passed with a
        non-None value. All keyword arguments (required or not) become
        instance attributes.

        Raises:
            AttributeError: a required keyword argument was not supplied.
            AssertionError: a required keyword argument is None.
        """
        required_kwargs = [
            'reviews', 'eatery_name', 'category', 'total_noun_phrases',
            'word_tokenization_algorithm_name', 'noun_phrases_algorithm_name',
            'pos_tagging_algorithm_name', 'tag_analysis_algorithm_name',
            'sentiment_analysis_algorithm_name',
            'np_clustering_algorithm_name', 'ner_algorithm_name',
            'with_celery', "do_sub_classification"
        ]
        self.__dict__.update(kwargs)
        for kwarg in required_kwargs:
            # getattr replaces the former eval("self.<name>") lookup; identity
            # comparison avoids surprises from objects overriding "!=".
            assert getattr(self, kwarg) is not None, \
                "keyword argument {0!r} must not be None".format(kwarg)

        # The tag classifier file name is supplied by the caller.
        self.tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers, self.tag_analysis_algorithm_name))

        #self.sub_tag_classifier = joblib.load("{0}/{1}".format(path_in_memory_classifiers, "svm_linear_kernel_classifier_food_sub_tags.lib"))
        self.sub_tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_food_sub_tags_8May.lib"))

        self.sentiment_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_sentiment_new_dataset_30April.lib"))

        self.sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

        # Filled in later by the processing steps.
        self.clustered_nps = list()
        self.normalized_sent_sentiment_nps = list()
 def sentence_tokenization(self):
         """
         Split self.text into sentences with a freshly created
         SentenceTokenizationOnRegexOnInterjections and store the outcome
         on self.tokenized_sents. Returns None.
         """
         self.tokenized_sents = \
             SentenceTokenizationOnRegexOnInterjections().tokenize(self.text)
    def __init__(self, **kwargs):
        """Copy ``kwargs`` onto the instance and load the classifiers.

        ``reviews`` is expected in the form
        [(review_id, review), (review_id, review), ...].

        Every name listed in ``required_kwargs`` must be passed with a
        non-None value. All keyword arguments (required or not) become
        instance attributes.

        Raises:
            AttributeError: a required keyword argument was not supplied.
            AssertionError: a required keyword argument is None.
        """
        required_kwargs = [
            'reviews', 'eatery_name', 'category', 'total_noun_phrases',
            'word_tokenization_algorithm_name', 'noun_phrases_algorithm_name',
            'pos_tagging_algorithm_name', 'tag_analysis_algorithm_name',
            'sentiment_analysis_algorithm_name',
            'np_clustering_algorithm_name', 'ner_algorithm_name', 'with_celery'
        ]

        self.__dict__.update(kwargs)
        for kwarg in required_kwargs:
            # getattr replaces the former eval("self.<name>") lookup; identity
            # comparison avoids surprises from objects overriding "!=".
            assert getattr(self, kwarg) is not None, \
                "keyword argument {0!r} must not be None".format(kwarg)

        self.sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

        # The tag classifier file name is supplied by the caller.
        self.tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers, self.tag_analysis_algorithm_name))

        self.ambience_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_ambience.lib"))
        self.sentiment_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_sentiment_new_dataset.lib"))

        # Filled in later by the processing steps.
        self.sentences = list()
        self.clustered_nps = list()
class FoodWordCloudApiHelper:
    """Build the noun-phrase word cloud for one category of an eatery's reviews.

    ``run`` drives the pipeline: tokenize the reviews into sentences, tag
    every sentence, keep the sentences tagged ``self.category``, optionally
    keep only those sub-tagged "dishes", predict sentiment, extract noun
    phrases, cluster them and attach likeness scores. The outcome is stored
    on ``self.result``.
    """

    def __init__(self, **kwargs):
        """Copy ``kwargs`` onto the instance and load the classifiers.

        Every name listed in ``required_kwargs`` must be passed with a
        non-None value. ``reviews`` is read as a sequence whose items carry
        the review id at index 0 and the review text at index 1.

        Raises:
            AttributeError: a required keyword argument was not supplied.
            AssertionError: a required keyword argument is None.
        """
        required_kwargs = [
            'reviews', 'eatery_name', 'category', 'total_noun_phrases',
            'word_tokenization_algorithm_name', 'noun_phrases_algorithm_name',
            'pos_tagging_algorithm_name', 'tag_analysis_algorithm_name',
            'sentiment_analysis_algorithm_name',
            'np_clustering_algorithm_name', 'ner_algorithm_name',
            'with_celery', "do_sub_classification"
        ]
        self.__dict__.update(kwargs)
        for kwarg in required_kwargs:
            # getattr replaces the former eval("self.<name>") lookup; identity
            # comparison avoids surprises from objects overriding "!=".
            assert getattr(self, kwarg) is not None, \
                "keyword argument {0!r} must not be None".format(kwarg)

        # The tag classifier file name is supplied by the caller.
        self.tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers, self.tag_analysis_algorithm_name))

        #self.sub_tag_classifier = joblib.load("{0}/{1}".format(path_in_memory_classifiers, "svm_linear_kernel_classifier_food_sub_tags.lib"))
        self.sub_tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_food_sub_tags_8May.lib"))

        self.sentiment_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_sentiment_new_dataset_30April.lib"))

        self.sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

        # Filled in later by the processing steps.
        self.clustered_nps = list()
        self.normalized_sent_sentiment_nps = list()

    def print_execution(func):
        """Decorator printing when ``func`` starts and how long it ran.

        The wrapped function's return value is passed through unchanged.
        """
        # Local import so the decorator does not rely on a file-level import.
        from functools import wraps

        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            print("{0} Now {1} have started executing {2}".format(
                bcolors.OKBLUE, func.__name__, bcolors.RESET))
            result = func(*args, **kwargs)
            print("{0} Total time taken by {1} for execution is --<<{2}>>--{3}\n".format(
                bcolors.OKGREEN, func.__name__, (time.time() - start_time),
                bcolors.RESET))

            return result

        return wrapper

    def get_args(self):
        """Print the instance attribute dictionary."""
        print(self.__dict__)

    def _set_empty_result(self):
        """Record that no sentence survived filtering."""
        self.c_review_ids, self.c_sentences, self.c_predicted_tags = (), (), ()
        self.result = []

    def run(self):
        """Run the whole pipeline and store the outcome on ``self.result``.

        When tokenization or filtering leaves no sentence, ``self.result`` is
        set to an empty list instead of failing while unpacking an empty
        ``zip``. Returns None.
        """
        self.sent_tokenize_reviews()  # makes self.review_ids, self.sentences
        if not self.sentences:
            self._set_empty_result()
            return

        self.predict_tags()  # makes self.predicted_tags

        self.filtered_list = [
            e
            for e in zip(self.review_ids, self.sentences, self.predicted_tags)
            if e[2] == self.category
        ]

        if self.do_sub_classification and self.filtered_list:
            # Classify the category sentences further into
            # 'dishes', 'food-null', 'menu-food', 'null-food', 'overall-food',
            # 'place-food', 'sub-food' and keep only the "dishes" ones.
            self.review_ids, self.sentences, self.predicted_tags = zip(
                *self.filtered_list)
            self.food_sub_tag_classification()
            self.filtered_list = [
                e for e in zip(self.review_ids, self.sentences,
                               self.predicted_sub_tags) if e[2] == "dishes"
            ]

        if not self.filtered_list:
            self._set_empty_result()
            return

        self.c_review_ids, self.c_sentences, self.c_predicted_tags = zip(
            *self.filtered_list)

        self.predict_sentiment()  # makes self.c_predicted_sentiment

        self.extract_noun_phrases()  # makes self.sent_sentiment_nps
        self.normalize_sentiments()  # makes self.normalized_sent_sentiment_nps

        self.do_clustering()  # makes self.clustered_nps
        self.result = self.make_result()

    #@print_execution
    def food_sub_tag_classification(self):
        """Predict a sub tag for every sentence in ``self.sentences``.

        Stores and returns ``self.predicted_sub_tags``.
        """
        self.predicted_sub_tags = self.sub_tag_classifier.predict(
            self.sentences)
        return self.predicted_sub_tags

    #@print_execution
    def sent_tokenize_reviews(self):
        """Split every review into sentences.

        Produces two parallel tuples, ``self.review_ids`` and
        ``self.sentences``; both are empty when no sentence was produced.
        """
        pairs = list()
        for review in self.reviews:
            for sentence in self.sent_tokenizer.tokenize(review[1]):
                pairs.append(
                    (review[0], SolveEncoding.preserve_ascii(sentence)))

        if pairs:
            self.review_ids, self.sentences = zip(*pairs)
        else:
            # zip(*[]) yields nothing and cannot be unpacked into two names.
            self.review_ids, self.sentences = (), ()
        return

    #@print_execution
    def predict_tags(self):
        """Predict a tag for every sentence made by ``sent_tokenize_reviews``.

        Stores and returns ``self.predicted_tags``.
        """
        self.predicted_tags = self.tag_classifier.predict(self.sentences)
        return self.predicted_tags

    #@print_execution
    def predict_sentiment(self):
        """Predict the sentiment of ``self.c_sentences``.

        Stores the predictions on ``self.c_predicted_sentiment``.
        """
        self.c_predicted_sentiment = self.sentiment_classifier.predict(
            self.c_sentences)
        return

    #@print_execution
    def extract_noun_phrases(self):
        """Extract noun phrases for every sentence in ``self.c_sentences``.

        Stores and returns ``self.sent_sentiment_nps``, a list of
        (sentence, sentiment, noun_phrases) tuples such as
        ('the only good part was the coke , thankfully it was outsourced ',
         u'positive', [u'good part']). Sentences without noun phrases are
        left out.
        """
        # NOTE(review): this overwrites the required
        # noun_phrases_algorithm_name keyword argument - confirm it is
        # intentional.
        self.noun_phrases_algorithm_name = "topia"
        extractor = NounPhrases(
            self.c_sentences,
            default_np_extractor=self.noun_phrases_algorithm_name)

        self.sent_sentiment_nps = [
            triple for triple in zip(
                self.c_sentences, self.c_predicted_sentiment,
                extractor.noun_phrases[self.noun_phrases_algorithm_name])
            if triple[2]
        ]

        return self.sent_sentiment_nps

    #@print_execution
    def normalize_sentiments(self, ignore_super=False):
        """Fold "super-<sentiment>" labels into plain sentiments.

        For each (sentence, sentiment, noun_phrases) entry of
        ``self.sent_sentiment_nps`` a [sentence, sentiment, noun_phrases]
        list is appended to ``self.normalized_sent_sentiment_nps``. A
        sentiment starting with "super" is replaced by the part after the
        first "-".

        ignore_super:
            if True, a "super" sentiment counts like the plain one;
            otherwise its noun phrases are listed twice so that it weighs
            double.

        Returns ``self.normalized_sent_sentiment_nps``.
        """
        for (sentence, sentiment, noun_phrases) in self.sent_sentiment_nps:
            nouns = list(noun_phrases)
            if sentiment.startswith("super"):
                sentiment = sentiment.split("-")[1]
                if not ignore_super:
                    nouns.extend(noun_phrases)
            self.normalized_sent_sentiment_nps.append(
                [sentence, sentiment, nouns])

        return self.normalized_sent_sentiment_nps

    #@print_execution
    def do_clustering(self):
        """Cluster the noun phrases with HeuristicClustering.

        Input is ``self.normalized_sent_sentiment_nps``, e.g.
            [('the only good part was the coke , thankfully it was outsourced ',
              u'positive', [u'good part']), ...]
        The clustering result is expected to be a list of dicts such as
            {"name": "ferror rocher shake", "positive": 20, "negative": 10,
             "neutral": 3, "similar": [...],
             "sentences": [("i luv ferror rocher shake", "positive"), ...]}
        which is stored on ``self.clustered_nps`` sorted by
        positive + negative, highest first, and returned.
        """
        clustering = HeuristicClustering(self.normalized_sent_sentiment_nps,
                                         self.c_sentences, self.eatery_name)
        self.clustered_nps = sorted(
            clustering.result,
            reverse=True,
            key=lambda x: x.get("positive") + x.get("negative"))
        return self.clustered_nps

    #@print_execution
    def convert_sentences(self, __object):
        """Turn a (sentence, sentiment) pair into a dict."""
        return {"sentence": __object[0], "sentiment": __object[1]}

    #@print_execution
    def result_lambda(self, __dict):
        """Add likeness scores to one cluster dict, in place.

        Converts the "sentences" pairs to dicts and sets "i_likeness" (share
        of positives within the cluster, in percent) and "o_likeness"
        (weighted against ``self.total_positive``, ``self.total_negative``
        and ``self.total``). Both are formatted strings.
        """
        __dict.update({
            "sentences":
            [self.convert_sentences(pair) for pair in __dict.get("sentences")]
        })
        positive = __dict.get("positive")
        negative = __dict.get("negative")
        try:
            i_likeness = "%.2f" % (float(positive * 100) /
                                   (negative + positive))
        except ZeroDivisionError:
            i_likeness = '100'

        try:
            o_likeness = "%.2f" % (float(positive * self.total_positive +
                                         negative * self.total_negative) /
                                   self.total)
        except ZeroDivisionError:
            # self.total is zero only when no cluster has any positive or
            # negative count, so the numerator is zero as well.
            o_likeness = "0.00"
        __dict.update({"i_likeness": i_likeness})
        __dict.update({"o_likeness": o_likeness})

    #@print_execution
    def make_result(self):
        """Score every cluster and return them sorted by total mentions.

        Sets ``self.total_positive``, ``self.total_negative`` and
        ``self.total``, updates every dict of ``self.clustered_nps`` in place
        and returns a new list sorted by negative + positive + neutral,
        highest first.
        """
        self.total_positive = sum(
            cluster.get("positive") for cluster in self.clustered_nps)
        self.total_negative = sum(
            cluster.get("negative") for cluster in self.clustered_nps)
        self.total = self.total_positive + self.total_negative

        # Explicit loop: the call is made for its side effect only.
        for cluster in self.clustered_nps:
            self.result_lambda(cluster)

        final_result = sorted(self.clustered_nps,
                              reverse=True,
                              key=lambda x: x.get("negative") + x.get(
                                  "positive") + x.get("neutral"))

        return final_result
# Example #5
# 0
class ServiceWordCloudApiHelper:
    """Summarise the sentiment of an eatery's review sentences per service sub tag.

    ``run`` drives the pipeline: tokenize the reviews into sentences, tag
    every sentence, keep the sentences tagged ``self.category``, predict
    their sentiment and service sub tag, group them by sub tag and attach
    likeness scores. The outcome is stored on ``self.result``.
    """

    def __init__(self, **kwargs):
        """Copy ``kwargs`` onto the instance and load the classifiers.

        Every name listed in ``required_kwargs`` must be passed with a
        non-None value. ``reviews`` is read as a sequence whose items carry
        the review id at index 0 and the review text at index 1.

        Raises:
            AttributeError: a required keyword argument was not supplied.
            AssertionError: a required keyword argument is None.
        """
        required_kwargs = [
            'reviews', 'eatery_name', 'category', 'total_noun_phrases',
            'word_tokenization_algorithm_name', 'noun_phrases_algorithm_name',
            'pos_tagging_algorithm_name', 'tag_analysis_algorithm_name',
            'sentiment_analysis_algorithm_name',
            'np_clustering_algorithm_name', 'ner_algorithm_name', 'with_celery'
        ]

        self.__dict__.update(kwargs)
        for kwarg in required_kwargs:
            # getattr replaces the former eval("self.<name>") lookup; identity
            # comparison avoids surprises from objects overriding "!=".
            assert getattr(self, kwarg) is not None, \
                "keyword argument {0!r} must not be None".format(kwarg)

        self.sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

        # The tag classifier file name is supplied by the caller.
        self.tag_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers, self.tag_analysis_algorithm_name))

        self.service_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_service.lib"))
        self.sentiment_classifier = joblib.load("{0}/{1}".format(
            path_in_memory_classifiers,
            "svm_linear_kernel_classifier_sentiment_new_dataset.lib"))

        # Filled in later by the processing steps.
        self.sentences = list()
        self.clustered_nps = list()

    def print_execution(func):
        """Decorator printing when ``func`` starts and how long it ran.

        The wrapped function's return value is passed through unchanged.
        """
        # Local import so the decorator does not rely on a file-level import.
        from functools import wraps

        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            print("{0} Now {1} have started executing {2}".format(
                bcolors.OKBLUE, func.__name__, bcolors.RESET))
            result = func(*args, **kwargs)
            print("{0} Total time taken by {1} for execution is --<<{2}>>--{3}\n".format(
                bcolors.OKGREEN, func.__name__, (time.time() - start_time),
                bcolors.RESET))

            return result

        return wrapper

    def get_args(self):
        """Print the instance attribute dictionary."""
        print(self.__dict__)

    def _set_empty_result(self):
        """Record that no sentence survived filtering."""
        self.c_review_ids, self.c_sentences, self.c_tags = (), (), ()
        self.result = []

    def sent_tokenize_reviews(self):
        """Split every review into sentences.

        Produces two parallel tuples, ``self.review_ids`` and
        ``self.sentences``; both are empty when no sentence was produced.
        """
        pairs = list()
        for review in self.reviews:
            for sentence in self.sent_tokenizer.tokenize(review[1]):
                pairs.append(
                    (review[0], SolveEncoding.preserve_ascii(sentence)))

        if pairs:
            self.review_ids, self.sentences = zip(*pairs)
        else:
            # zip(*[]) yields nothing and cannot be unpacked into two names.
            self.review_ids, self.sentences = (), ()
        return

    @print_execution
    def predict_tags(self):
        """Predict a tag per sentence; stores and returns ``self.predicted_tags``."""
        self.predicted_tags = self.tag_classifier.predict(self.sentences)
        return self.predicted_tags

    @print_execution
    def predict_sentiment(self):
        """Predict the sentiment of ``self.c_sentences`` into ``self.predicted_sentiment``."""
        self.predicted_sentiment = self.sentiment_classifier.predict(
            self.c_sentences)
        return

    @print_execution
    def predict_sub_tags(self):
        """Predict the service sub tag of ``self.c_sentences`` into ``self.service_tags``."""
        # The message used to say "ambience", a leftover from the class this
        # one was copied from.
        print("Going to predict service sub tags")
        self.service_tags = self.service_classifier.predict(self.c_sentences)
        return

    @print_execution
    def make_sentences_dict(self):
        """Group sentences and sentiments by service sub tag.

        Builds ``self.sentences_dict`` from ``self.c_sentences``,
        ``self.predicted_sentiment`` and ``self.service_tags`` in the form

            {<sub tag>: {"sentences": [(sentence, sentiment), ...],
                         "similar": None,
                         "sentiment": [sentiment, ...]},
             ...}
        """
        self.sentences_dict = dict()
        for sentence, sentiment, category in zip(self.c_sentences,
                                                 self.predicted_sentiment,
                                                 self.service_tags):
            entry = self.sentences_dict.setdefault(category, {
                "sentences": [],
                "similar": None,
                "sentiment": [],
            })
            entry["sentences"].append((sentence, sentiment))
            entry["sentiment"].append(sentiment)
        return

    @print_execution
    def normalize_sentiments(self):
        """Count sentiments per sub tag and append a cluster dict for each.

        A sentiment starting with "super" is counted twice under the part
        after the first "-". One dict per sub tag with the keys "name",
        "sentences", "similar", "positive", "negative" and "neutral" is
        appended to ``self.clustered_nps``.
        """
        for category in self.sentences_dict.keys():
            normalized_sentiments = list()
            for label in self.sentences_dict[category]["sentiment"]:
                if label.startswith("super"):
                    # A "super" sentiment weighs double.
                    normalized_sentiments.extend([label.split('-')[1]] * 2)
                else:
                    normalized_sentiments.append(label)

            sentiments = Counter(normalized_sentiments)
            print(category)
            print("{0} \n\n".format(sentiments))
            self.clustered_nps.append({
                "name": category,
                "sentences": self.sentences_dict[category]["sentences"],
                "similar": [],
                # Missing sentiments count as zero.
                "positive": sentiments.get("positive", 0),
                "negative": sentiments.get("negative", 0),
                "neutral": sentiments.get("neutral", 0),
            })

    @print_execution
    def convert_sentences(self, __object):
        """Turn a (sentence, sentiment) pair into a dict."""
        return {"sentence": __object[0], "sentiment": __object[1]}

    @print_execution
    def result_lambda(self, __dict):
        """Add likeness scores to one cluster dict, in place.

        Converts the "sentences" pairs to dicts and sets "i_likeness" (share
        of positives within the cluster, in percent) and "o_likeness"
        (weighted against ``self.total_positive``, ``self.total_negative``
        and ``self.total``). Both are formatted strings.
        """
        __dict.update({
            "sentences":
            [self.convert_sentences(pair) for pair in __dict.get("sentences")]
        })
        positive = __dict.get("positive")
        negative = __dict.get("negative")
        try:
            i_likeness = "%.2f" % (float(positive * 100) /
                                   (negative + positive))
        except ZeroDivisionError:
            i_likeness = '100'

        try:
            o_likeness = "%.2f" % (float(positive * self.total_positive +
                                         negative * self.total_negative) /
                                   self.total)
        except ZeroDivisionError:
            # self.total is zero only when no cluster has any positive or
            # negative count, so the numerator is zero as well.
            o_likeness = "0.00"
        __dict.update({"i_likeness": i_likeness})
        __dict.update({"o_likeness": o_likeness})

    @print_execution
    def make_result(self):
        """Score every cluster and return them sorted by total mentions.

        Sets ``self.total_positive``, ``self.total_negative`` and
        ``self.total``, updates every dict of ``self.clustered_nps`` in place
        and returns a new list sorted by negative + positive + neutral,
        highest first.
        """
        self.total_positive = sum(
            cluster.get("positive") for cluster in self.clustered_nps)
        self.total_negative = sum(
            cluster.get("negative") for cluster in self.clustered_nps)
        self.total = self.total_positive + self.total_negative

        # Explicit loop: the call is made for its side effect only.
        for cluster in self.clustered_nps:
            self.result_lambda(cluster)

        final_result = sorted(self.clustered_nps,
                              reverse=True,
                              key=lambda x: x.get("negative") + x.get(
                                  "positive") + x.get("neutral"))

        return final_result

    @print_execution
    def run(self):
        """Run the whole pipeline and store the outcome on ``self.result``.

        When tokenization or filtering leaves no sentence, ``self.result`` is
        set to an empty list instead of failing while unpacking an empty
        ``zip``. Returns None.
        """
        self.sent_tokenize_reviews()  # makes self.review_ids, self.sentences
        if not self.sentences:
            self._set_empty_result()
            return

        self.predict_tags()  # makes self.predicted_tags

        self.filtered_list = [
            e
            for e in zip(self.review_ids, self.sentences, self.predicted_tags)
            if e[2] == self.category
        ]
        if not self.filtered_list:
            self._set_empty_result()
            return

        self.c_review_ids, self.c_sentences, self.c_tags = zip(
            *self.filtered_list)
        self.predict_sentiment()
        self.predict_sub_tags()
        self.make_sentences_dict()
        self.normalize_sentiments()
        self.result = self.make_result()
# Example #6
# 0
class PerReview:
    """Run (or re-run) the classification steps for a single review.

    ``run`` asks MongoScripts which steps are pending for the review and
    executes them, writing the outcome back through MongoScripts.
    """

    # One tokenizer shared by all instances.
    sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

    def __init__(self, review_id, review_text, eatery_id):
        """Remember the review id, its text and the id of its eatery."""
        self.review_id = review_id
        self.review_text = review_text
        self.eatery_id = eatery_id

    def print_execution(func):
        """Decorator printing when ``func`` starts and how long it ran.

        The wrapped function's return value is passed through unchanged.
        """
        # Local import so the decorator does not rely on a file-level import.
        from functools import wraps

        @wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            print("{0} Now {1} have started executing {2}".format(
                bcolors.OKBLUE, func.__name__, bcolors.RESET))
            result = func(*args, **kwargs)
            print("{0} Total time taken by {1} for execution is --<<{2}>>--{3}\n".format(
                bcolors.OKGREEN, func.__name__, (time.time() - start_time),
                bcolors.RESET))

            return result

        return wrapper

    def get_args(self):
        """Print the instance attribute dictionary."""
        print(self.__dict__)

    @print_execution
    def run(self):
        """Execute every step flagged in the stored review result.

        The flags read from the result are
        "rerun_food_sub_tag_classification",
        "rerun_cost_sub_tag_classification",
        "rerun_service_sub_tag_classification",
        "rerun_ambience_sub_tag_classification", "rerun_noun_phrases" and
        "rerun_all_algorithms". Afterwards the review is marked as processed
        for its eatery. Returns None.
        """
        result = self.__get_review_result()
        print("this is the result %s" % result)
        if not bool(result):
            # NOTE(review): a falsy result is reported as "already found" and
            # the review is not marked as processed - presumably
            # get_review_result returns nothing when no step is pending;
            # confirm against MongoScripts.
            print("{0}Result for the review_id --<<{1}>>-- has alredy been found{2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))
            return

        if result.get("rerun_food_sub_tag_classification"):
            print("{0}Doing FOOD sub classification again for review_id --<<{1}>>-- {2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))
            self.food = MongoScripts.get_tag_sentences(self.review_id, "food")
            self.__food_sub_tag_classification()
            self.__extract_noun_phrases()  # makes self.all_food_with_nps
            MongoScripts.update_food_sub_tag_sentences(
                self.review_id, self.all_food_with_nps)

        if result.get("rerun_cost_sub_tag_classification"):
            print("{0}Doing COST sub classification again for review_id --<<{1}>>-- {2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))
            self.cost = MongoScripts.get_tag_sentences(self.review_id, "cost")
            self.__cost_sub_tag_classification()
            MongoScripts.update_cost_sub_tag_sentences(
                self.review_id, self.all_cost, TAG_CLASSIFY_ALG_NME,
                SENTI_CLSSFY_ALG_NME, COST_SB_CLSSFY_ALG_NME)

        if result.get("rerun_service_sub_tag_classification"):
            print("{0}Doing SERVICE sub classification again for review_id --<<{1}>>-- {2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))
            self.service = MongoScripts.get_tag_sentences(
                self.review_id, "service")
            self.__service_sub_tag_classification()
            MongoScripts.update_service_sub_tag_sentences(
                self.review_id, self.all_service, TAG_CLASSIFY_ALG_NME,
                SENTI_CLSSFY_ALG_NME, SERV_SB_CLSSFY_ALG_NME)

        if result.get("rerun_ambience_sub_tag_classification"):
            print("{0}Doing AMBIENCE sub classification again for review_id --<<{1}>>-- {2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))
            self.ambience = MongoScripts.get_tag_sentences(
                self.review_id, "ambience")
            self.__ambience_sub_tag_classification()
            MongoScripts.update_ambience_sub_tag_sentences(
                self.review_id, self.all_ambience, TAG_CLASSIFY_ALG_NME,
                SENTI_CLSSFY_ALG_NME, AMBI_SB_CLSSFY_ALG_NME)

        if result.get("rerun_noun_phrases"):
            print("{0}Doing noun phrases again for review_id --<<{1}>>-- {2}".format(
                bcolors.OKBLUE, self.review_id, bcolors.RESET))

            self.food = MongoScripts.get_tag_sentences(self.review_id, "food")
            self.__food_sub_tag_classification()
            self.__extract_noun_phrases()  # makes self.all_food_with_nps
            # The id used to be passed as the bare name "review_id", which is
            # not defined in this method.
            # NOTE(review): extraction uses NOUN_PHSE_ALGORITHM_NAME while
            # this call records NOUN_PHRASES_ALGORITHM_NAME - confirm both
            # names exist and which one is intended.
            MongoScripts.update_noun_phrases(
                self.review_id, self.all_food_with_nps, TAG_CLASSIFY_ALG_NME,
                SENTI_CLSSFY_ALG_NME, FOOD_SB_CLSSFY_ALG_NME,
                NOUN_PHRASES_ALGORITHM_NAME)

        if result.get("rerun_all_algorithms"):
            print("{0} No results found for review id --<<{1}>>--{2}".format(
                bcolors.FAIL, self.review_id, bcolors.RESET))
            self.__sent_tokenize_review()  # makes self.sentences
            self.__predict_tags()  # makes self.tags
            self.__predict_sentiment()  # makes self.sentiments

            # A list, because __filter_on_category iterates it several times.
            self.all_sent_tag_sentiment = list(
                zip(self.sentences, self.tags, self.sentiments))

            # makes self.food, self.cost, self.ambience, self.service,
            # self.null, self.overall
            self.__filter_on_category()

            self.__food_sub_tag_classification()
            self.__service_sub_tag_classification()
            self.__cost_sub_tag_classification()
            self.__ambience_sub_tag_classification()

            self.__extract_noun_phrases()  # makes self.all_food_with_nps
            self.__update_review_result()

        MongoScripts.update_processed_reviews_list(self.eatery_id,
                                                   self.review_id)
        return

    def _with_sub_tags(self, classifier, tagged_sentences):
        """Predict sub tags and append them to (sent, tag, sentiment) triples.

        Returns (sub_tags, rows) where every row is
        (sent, tag, sentiment, sub_tag).
        """
        sub_tags = classifier.predict([row[0] for row in tagged_sentences])
        rows = [(sent, tag, sentiment, sub_tag)
                for ((sent, tag, sentiment), sub_tag)
                in zip(tagged_sentences, sub_tags)]
        return sub_tags, rows

    @print_execution
    def __food_sub_tag_classification(self):
        """Sub-classify ``self.food`` into ``self.food_sub_tags`` / ``self.all_food``."""
        self.food_sub_tags, self.all_food = self._with_sub_tags(
            FOOD_SB_TAG_CLASSIFIER_LIB, self.food)
        return

    @print_execution
    def __service_sub_tag_classification(self):
        """Sub-classify ``self.service`` into ``self.service_sub_tags`` / ``self.all_service``."""
        self.service_sub_tags, self.all_service = self._with_sub_tags(
            SERV_SB_TAG_CLASSIFIER_LIB, self.service)
        return

    @print_execution
    def __cost_sub_tag_classification(self):
        """Sub-classify ``self.cost`` into ``self.cost_sub_tags`` / ``self.all_cost``.

        self.all_cost = [(sent, "cost", sentiment, "cost-overall",), .....]
        """
        self.cost_sub_tags, self.all_cost = self._with_sub_tags(
            COST_SB_TAG_CLASSIFIER_LIB, self.cost)
        return

    @print_execution
    def __ambience_sub_tag_classification(self):
        """Sub-classify ``self.ambience`` into ``self.ambience_sub_tags`` / ``self.all_ambience``."""
        self.ambience_sub_tags, self.all_ambience = self._with_sub_tags(
            AMBI_SB_TAG_CLASSIFIER_LIB, self.ambience)
        return

    @print_execution
    def __sent_tokenize_review(self):
        """Tokenize ``self.review_text`` into ``self.sentences``."""
        self.sentences = self.sent_tokenizer.tokenize(self.review_text)
        return

    @print_execution
    def __predict_tags(self):
        """Predict a tag for every entry of ``self.sentences`` into ``self.tags``."""
        self.tags = TAG_CLASSIFIER_LIB.predict(self.sentences)
        return

    @print_execution
    def __predict_sentiment(self):
        """Predict a sentiment for every entry of ``self.sentences`` into ``self.sentiments``."""
        self.sentiments = SENTI_CLASSIFIER_LIB.predict(self.sentences)
        return

    @print_execution
    def __filter_on_category(self):
        """Split ``self.all_sent_tag_sentiment`` by tag.

        Fills self.food, self.cost, self.ambience, self.service, self.null
        and self.overall with the (sent, tag, sentiment) triples carrying the
        matching tag.
        """
        triples = self.all_sent_tag_sentiment

        def with_tag(wanted):
            return [(sent, tag, sentiment)
                    for (sent, tag, sentiment) in triples if tag == wanted]

        self.food = with_tag("food")
        self.cost = with_tag("cost")
        self.ambience = with_tag("ambience")
        self.service = with_tag("service")
        self.null = with_tag("null")
        self.overall = with_tag("overall")

    @print_execution
    def __extract_noun_phrases(self):
        """Extract noun phrases for every sentence of ``self.all_food``.

        Stores and returns ``self.all_food_with_nps``, a list of
        (sent, tag, sentiment, sub_tag, noun_phrases) tuples.
        """
        extractor = NounPhrases([row[0] for row in self.all_food],
                                default_np_extractor=NOUN_PHSE_ALGORITHM_NAME)

        self.all_food_with_nps = [
            (sent, tag, sentiment, sub_tag, nps)
            for ((sent, tag, sentiment, sub_tag,), nps) in zip(
                self.all_food,
                extractor.noun_phrases[NOUN_PHSE_ALGORITHM_NAME])
        ]

        return self.all_food_with_nps

    @print_execution
    def __get_review_result(self):
        """Fetch the stored result for this review from MongoScripts."""
        result = MongoScripts.get_review_result(review_id=self.review_id)
        return result

    @print_execution
    def __update_review_result(self):
        """Store everything computed for this review through MongoScripts."""
        MongoScripts.update_review_result_collection(
            review_id=self.review_id,
            eatery_id=self.eatery_id,
            food=self.food,
            cost=self.cost,
            ambience=self.ambience,
            null=self.null,
            overall=self.overall,
            service=self.service,
            food_result=self.all_food_with_nps,
            service_result=self.all_service,
            cost_result=self.all_cost,
            ambience_result=self.all_ambience, )
        return
class PerReview:
    sent_tokenizer = SentenceTokenizationOnRegexOnInterjections()

    def __init__(self, review_id, review_text, review_time, eatery_id):
        """
                Lowering the review text
                """
        self.review_id, self.review_text, self.review_time, self.eatery_id = review_id, \
                SolveEncoding.to_unicode_or_bust(review_text.lower().replace("&nbsp;&nbsp;\n", "")), review_time, eatery_id

        print self.review_time, self.review_text, self.review_id, self.eatery_id
        self.cuisine_name = list()
        self.places_names = list()
        self.np_extractor = extract.TermExtractor()

    def print_execution(func):
        "This decorator dumps out the arguments passed to a function before calling it"
        argnames = func.func_code.co_varnames[:func.func_code.co_argcount]
        fname = func.func_name

        def wrapper(*args, **kwargs):
            start_time = time.time()
            print "{0} Now {1} have started executing {2}".format(
                bcolors.OKBLUE, func.func_name, bcolors.RESET)
            result = func(*args, **kwargs)
            print "{0} Total time taken by {1} for execution is --<<{2}>>--{3}\n".format(
                bcolors.OKGREEN, func.func_name, (time.time() - start_time),
                bcolors.RESET)

            return result

        return wrapper

    def get_args(self):
        print self.__dict__

    @print_execution
    def run(self):
        print "{0} Now processing review id --<<{1}>>--{2}".format(bcolors.FAIL, \
                        self.review_id, bcolors.RESET)
        self.__sent_tokenize_review(
        )  #Tokenize reviews, makes self.reviews_ids, self.sentences
        self.__predict_tags()  #Predict tags, makes self.predict_tags
        self.__predict_sentiment()  #makes self.predicted_sentiment

        self.all_sent_tag_sentiment = zip(self.sentences, self.tags,
                                          self.sentiments)

        self.__filter_on_category(
        )  #generates self.food, self.cost, self.ambience, self.service

        self.__food_sub_tag_classification()
        self.__service_sub_tag_classification()
        self.__cost_sub_tag_classification()
        self.__ambience_sub_tag_classification()
        self.__extract_places()
        self.__extract_cuisines()
        self.__extract_noun_phrases()  #makes self.noun_phrases
        self.__append_time_to_overall()
        self.__append_time_to_menu()
        self.__update_cuisine_places()
        self.__update_review_result()
        return

    @print_execution
    def __sent_tokenize_review(self):
        """
                Tokenize self.reviews tuples of the form (review_id, review) to sentences of the form (review_id, sentence)
                and generates two lists self.review_ids and self.sentences
                """
        self.sentences = self.sent_tokenizer.tokenize(self.review_text)
        return

    @print_execution
    def __predict_tags(self):
        """
                Predict tags of the sentence which were being generated by self.sent_tokenize_reviews
                """
        self.tags = tag_classifier.predict(self.sentences)
        return

    @print_execution
    def __predict_sentiment(self):
        """
                Predict sentiment of self.c_sentences which were made by filtering self.sentences accoring to 
                the specified category
                """
        self.sentiments = sentiment_classifier.predict(self.sentences)
        return

    def __filter_on_category(self):
        """
		Right now there are 
		[u'cuisine', u'service', u'food', u'overall', u'cost', u'place', u'ambience', u'null']
		main categories for the classification of the sentences

		all that has already been stored in all_sent_tag_sentiment alongwith the tag, sentiment
		of the sentences, 
		this function unzip these categpries and make class variables for these categories

		"""
        __filter = lambda tag: [(sent, __tag, sentiment) for (sent, __tag, sentiment) in \
                                                                                      self.all_sent_tag_sentiment if __tag== tag ]
        self.food, self.cost, self.ambience, self.service, self.null, self.overall, self.places, self.cuisine, self.menu = \
                               __filter("food"),  __filter("cost"), __filter("ambience"), __filter("service"),\
          __filter("null"),  __filter("overall"), __filter("place"), __filter("cuisine"), __filter("menu")

        return

    @print_execution
    def __food_sub_tag_classification(self):
        """
                This deals with the sub classification of fodd sub tags
                """
        self.food_sub_tags = food_sb_classifier.predict(
            [sent for (sent, tag, sentiment) in self.food])
        self.all_food = [[sent, tag, sentiment, sub_tag] for ((sent, tag, sentiment), sub_tag)\
                in zip(self.food, self.food_sub_tags)]

        return

    @print_execution
    def __service_sub_tag_classification(self):
        """
                This deals with the sub classification of service sub tags
		and generates self.all_service with an element in the form 
		(sent, tag, sentiment, sub_tag_service)
                """
        self.service_sub_tags = service_sb_classifier.predict(
            [sent for (sent, tag, sentiment) in self.service])
        self.all_service = [[sent, tag, sentiment, sub_tag] for ((sent, tag, sentiment), sub_tag) \
                in zip(self.service, self.service_sub_tags)]

        map(lambda __list: __list.append(self.review_time), self.all_service)
        return

    @print_execution
    def __cost_sub_tag_classification(self):
        """
                This deals with the sub classification of cost sub tags
                
                self.all_cost = [(sent, "cost", sentiment, "cost-overall",), .....]
                """

        self.cost_sub_tags = cost_sb_classifier.predict(
            [sent for (sent, tag, sentiment) in self.cost])
        self.all_cost = [[sent, tag, sentiment, sub_tag] for ((sent, tag, sentiment), sub_tag) \
                in zip(self.cost, self.cost_sub_tags)]

        map(lambda __list: __list.append(self.review_time), self.all_cost)
        return

    @print_execution
    def __ambience_sub_tag_classification(self):
        """
                This deals with the sub classification of fodd sub tags
                """
        self.ambience_sub_tags = ambience_sb_classifier.predict(
            [sent for (sent, tag, sentiment) in self.ambience])
        self.all_ambience = [[sent, tag, sentiment, sub_tag] for ((sent, tag, sentiment), sub_tag) \
                in zip(self.ambience, self.ambience_sub_tags)]

        map(lambda __list: __list.append(self.review_time), self.all_ambience)
        return

    @print_execution
    def __append_time_to_overall(self):
        self.overall = [list(e) for e in self.overall]
        map(lambda __list: __list.append(self.review_time), self.overall)
        return

    @print_execution
    def __append_time_to_menu(self):
        self.menu = [list(e) for e in self.menu]
        map(lambda __list: __list.append(self.review_time), self.menu)
        return

    @print_execution
    def __extract_places(self):
        """
		This function filters all the places mentioned in self.places variable
		it generates a list of places mentioned in the self.places wth the help
		of stanford core nlp
		"""
        def filter_places(__list):
            location_list = list()
            i = 0
            for __tuple in __list:
                if __tuple[1] == "LOCATION":
                    location_list.append([__tuple[0], i])
                i += 1

            i = 0
            try:
                new_location_list = list()
                [first_element, i] = location_list.pop(0)
                new_location_list.append([first_element])
                for element in location_list:
                    if i == element[1] - 1:
                        new_location_list[-1].append(element[0])

                    else:
                        new_location_list.append([element[0]])
                    i = element[1]

                return list(
                    set([" ".join(element) for element in new_location_list]))
            except Exception as e:
                return None

        for (sent, sentiment, tag) in self.places:
            try:
                result = loads(corenlpserver.parse(sent))
                __result = [(e[0], e[1].get("NamedEntityTag"))
                            for e in result["sentences"][0]["words"]]
                self.places_names.extend(filter_places(__result))

            except Exception as e:
                print e, "__extract_place", self.review_id
                pass
        return

    @print_execution
    def __extract_cuisines(self):
        """
		This extracts the name of the cuisines fromt he cuisines sentences
		"""

        for (sent, tag, sentiment) in self.cuisine:
            self.cuisine_name.extend(self.np_extractor(sent))

        self.cuisine_name = [np[0] for np in self.cuisine_name if np[0]]
        print self.cuisine_name
        return

    @print_execution
    def __extract_noun_phrases(self):
        """
                Extarct Noun phrases for the self.c_sentences for each sentence and outputs a list 
                self.sent_sentiment_nps which is of the form 
                [('the only good part was the coke , thankfully it was outsourced ', 
                                            u'positive', [u'good part']), ...]
                """
        __nouns = list()
        for (sent, tag, sentiment, sub_tag) in self.all_food:
            __nouns.append([e[0] for e in self.np_extractor(sent)])

        self.all_food_with_nps = [[sent, tag, sentiment, sub_tag, nps] for ((
            sent,
            tag,
            sentiment,
            sub_tag,
        ), nps) in zip(self.all_food, __nouns)]

        map(lambda __list: __list.append(self.review_time),
            self.all_food_with_nps)
        print __nouns
        return

    @print_execution
    def __get_review_result(self):
        result = MongoScripts.get_review_result(review_id=self.review_id)
        return result

    @print_execution
    def __update_review_result(self):
        MongoScriptsReviews.update_review_result_collection(
            review_id=self.review_id,
            eatery_id=self.eatery_id,
            food=self.food,
            cost=self.cost,
            ambience=self.ambience,
            null=self.null,
            overall=self.overall,
            service=self.service,
            place_sentences=self.places,
            cuisine_sentences=self.cuisine,
            food_result=self.all_food_with_nps,
            service_result=self.all_service,
            menu_result=self.menu,
            cost_result=self.all_cost,
            ambience_result=self.all_ambience,
            places_result=self.places_names,
            cuisine_result=self.cuisine_name)
        return

    @print_execution
    def __update_cuisine_places(self):
        """
                update cuisine and places to the eatery
                """
        MongoScriptsReviews.update_eatery_places_cusines(
            self.eatery_id, self.places_names, self.cuisine_name)
        return
Пример #8
0
#!/usr/bin/env python


import pymongo
import ConfigParser
from sklearn.externals import joblib
import os
from topia.termextract import extract 
import jsonrpclib
from simplejson import loads
from elasticsearch import Elasticsearch, helpers
from Text_Processing.Sentence_Tokenization.Sentence_Tokenization_Classes import SentenceTokenizationOnRegexOnInterjections
# Module level helpers, created once at import time.
sentence_tokenizer = SentenceTokenizationOnRegexOnInterjections()
noun_phrase_extractor = extract.TermExtractor()


# Directory of this file; the classifier directory is resolved against it.
this_file_path = os.path.dirname(os.path.abspath(__file__))

# NOTE: "variables.cfg" is a relative path, so it is looked up in the current
# working directory of the process.
config = ConfigParser.RawConfigParser()
config.read("variables.cfg")

ELASTICSEARCH_IP = config.get("elasticsearch", "ip")

path_for_classifiers = "{0}/Text_Processing/PrepareClassifiers/InMemoryClassifiers/newclassifiers".format(this_file_path)


def _load_classifier(option_name):
    """Load the joblib dump whose file name is stored under [algorithms] option_name."""
    library_name = config.get("algorithms", option_name)
    return joblib.load("{0}/{1}".format(path_for_classifiers, library_name))


sentiment_classifier = _load_classifier("sentiment_classification_library")
tag_classifier = _load_classifier("tag_classification_library")
food_sb_classifier = _load_classifier("food_algorithm_library")
ambience_sb_classifier = _load_classifier("ambience_algorithm_library")
service_sb_classifier = _load_classifier("service_algorithm_library")
cost_sb_classifier = _load_classifier("cost_algorithm_library")