def __alter_repeat_for_dont_think_SV(fixed_df):
    """Rephrase a "(dont) think S V" sentence as a hedged repeat.

    Looks at the clause after "think"; when that clause itself carries a
    negation the reply uses "would" (double negative), otherwise "wouldnt".
    Returns a one-element list with the reply, or [] on any failure.
    """
    try:
        # TODO see if its neccesary to care about should and cant
        think_idx = Nlp_util.get_idx_list_of_word("think", fixed_df["base_form"])[0]
        tail_df = fixed_df.loc[think_idx + 1:, :].reset_index(drop=True)
        verbs = Nlp_util.make_verb_list(tail_df, type="normal")
        nouns = Nlp_util.make_noun_list(tail_df)
        # possibly bug happen here since amount of verbs are different in cant do/dont do
        has_negation = Df_util.anything_isin(["not", "never"], tail_df.loc[:, "base_form"])
        # can add possibly or likely(when its negative)
        head_options = ["so ", "so probably ", "probably ", "so maybe ", "maybe "]
        head = head_options[randint(0, len(head_options) - 1)]
        subject = nouns["word"].iloc[0]
        if has_negation:
            # take the subject, insert "would", then the base-form verb that
            # follows the negation, then keep the rest of the clause as-is
            negation_idx = Nlp_util.get_idx_list_of_word_list(
                ["not", "never"], tail_df.loc[:, "base_form"])[0]
            verb_row = verbs.loc[negation_idx:, :].iloc[0]
            rest = WordFormatter.Series2Str(tail_df.loc[verb_row.name + 1:, "word"])
            return [head + subject + " would " + verb_row.base_form + " " + rest]
        else:
            first_verb = verbs["base_form"].iloc[0]
            rest = WordFormatter.Series2Str(tail_df.loc[verbs.index[0] + 1:, "word"])
            return [head + subject + " wouldnt " + first_verb + " " + rest]
    except:
        logging.exception('')
        return []
def __alter_repeat_for_need_sent(fixed_df):
    """Build a sympathetic two-message reply for a sentence containing "need".

    Picks the first noun/pronoun after "need" (keeping a preceding
    adjective/possessive/determiner when present) and plugs it into one of
    three canned reply pairs. Falls back to a plain repeat on failure.
    """
    try:
        need_idx = Nlp_util.get_idx_list_of_word("need", fixed_df["base_form"])[0]
        first_noun_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
            fixed_df, Nlp_util.pos_NOUNs + Nlp_util.pos_PRPs, need_idx,
            column_name="pos").iloc[0]
        prev_pos = fixed_df.loc[first_noun_row.name - 1, "pos"]
        if prev_pos in Nlp_util.pos_ADJECTIVEs + ["PRP$", "DT"]:
            # keep the modifier directly in front of the noun ("my job", "a friend")
            noun = WordFormatter.Series2Str(
                fixed_df.loc[first_noun_row.name - 1:first_noun_row.name, "word"])
        else:
            noun = fixed_df.loc[first_noun_row.name, "word"]
        noun_nominative = Nlp_util.convert_objective_noun_to_nominative(noun)
        options = [
            ["so " + noun_nominative + " is very important thing for you..",
             "and sounds its kinda hard to get it now right😢"],
            ["so its like its not easy to get " + noun + " now but you really want..",
             "and it can frustrate you😞"],
            ["sounds you really want " + noun + "..",
             "might be tough time for you to seek for it now😓"],
        ]
        return options[randint(0, len(options) - 1)]
    except:
        logging.exception('')
        return [WordFormatter.Df2Str(fixed_df)[1:]]
def __correct_short(cls, w_toks):
    """Expand shorthand spellings in tokenized sentences via the SFDF table.

    Also normalizes PRP short forms and wanna-type abbreviations first,
    strips apostrophes (they confuse word_tokenize), and rewrites a
    sentence-initial "cause" to "because". Returns the input unchanged on error.
    """
    try:
        w_toks = cls.__correct_prp_short_form(w_toks)
        w_toks = cls.__correct_wanna_type_abbreviations(w_toks)
        text = WordFormatter.WToks2Str(w_toks)
        # remove ' because it make word_tokenize messy
        w_toks = WordFormatter.Str2WToks(text.replace('\'', ''))
        corrected = []
        for sent in w_toks:
            fixed_sent = []
            for idx, word in enumerate(sent):
                if word in SFDF.short.values:
                    replacement = SFDF[SFDF.short == word].normal.values[0]
                    fixed_sent.extend(word_tokenize(replacement))
                elif idx == 0 and word == "cause":
                    fixed_sent.append("because")
                else:
                    fixed_sent.append(word)
            corrected.append(fixed_sent)
        return corrected
    except:
        logging.exception('')
        return w_toks
def request_to_apiai(df):
    """Classify the message intent via the api.ai text endpoint.

    Sends the message text, maps a recognized `action` to an Intent member,
    falls back to HAHA/NORMAL heuristics, and returns Intent.NORMAL on any
    transport or parsing failure.
    """
    try:
        message = WordFormatter.Df2Str(df)
        ai = ApiAI(os.environ.get('client_access_token', None))
        request = ai.text_request()
        request.session_id = os.environ.get('session_id', None)
        request.query = message
        response = json.loads(request.getresponse().read().decode('utf-8'))
        try:
            if response is not None and 'action' in response['result'].keys():
                candidate = response['result']['action']
                if Intent.has_value(candidate):
                    return Intent(candidate)
        except:
            logging.exception('')
        return Intent.HAHA if is_haha_intent(message) else Intent.NORMAL
    except:
        logging.exception('')
        return Intent.NORMAL
def __call__(self, message, user):
    """Run the preprocessing pipeline over an incoming message.

    Populates message.original_df, text_df, intent_list, text_kw_df and
    sentiment_score_df in order. Admin commands bypass the pipeline.
    Returns the (possibly partially processed) message on any failure.
    """
    try:
        if self.__exists_admin_command(message, user):
            return message
        w_toks = WordFormatter.MsgDict2WToks(message.message_dicts)
        message.original_df = OriginalDFGenerator.create_original_df_by_w_toks(
            w_toks)
        message_normalizer = MessageNormalizer()
        message.text_df = message_normalizer(message.message_dicts,
                                             user.sender_id)
        intent_checker = IntentChecker()
        message.intent_list = intent_checker(message.text_df)
        text_kw_df_generator = TextKwDFGenerator()
        # BUG FIX: previously passed message.text_kw_df (still unset at this
        # point); TextKwDFGenerator.__call__ expects the normalized text_df,
        # as at every other call site.
        message.text_kw_df = text_kw_df_generator(message.text_df)
        sentiment_score_df_generator = SentimentScoreDFGenerator()
        message.sentiment_score_df = sentiment_score_df_generator(
            message.text_df, message.text_kw_df)
        return message
    except:
        logging.exception('')
        return message
def __call__(self, message_dicts: List[Dict[str, str]], sender_id, from_preprocessor=True):
    """Normalize raw message dicts (or pre-tokenized sentences) into a DataFrame.

    When called from the preprocessor the dicts are converted and tokenized
    first; otherwise message_dicts is assumed to already be word tokens.
    Returns the normalized df (sidx, widx, word, pos) or None on failure.
    """
    try:
        if from_preprocessor:
            message_dicts = self.__convert_attachment(message_dicts)
            message_dicts = self.__normalize_apostrophe(message_dicts)
            w_toks = WordFormatter.MsgDict2WToks(message_dicts)
        else:
            w_toks = message_dicts
        print('\nword_tokenized\n', w_toks)
        normalized = self.normalize_message_by_w_toks(w_toks)
        send_typing_on(sender_id)
        # make original_df with sidx, widx, word, pos tag
        df = OriginalDFGenerator.create_original_df_by_w_toks(normalized)
        return self.normalize_message_by_df(df)
    except:
        logging.exception('')
        return None
def __create_response_for_what_to_V(df):
    """Compose a reply for a "what to V" / "how to V" sentence.

    Takes the words after the idiom, pairs one random sympathy line with
    one random encouragement line, and returns them as a two-message list.
    """
    idiom_idx = Nlp_util.get_idx_list_of_idiom_list(
        ["what to", "how to"], df["base_form"])[0]
    words_after_what_to = WordFormatter.Df2Str(df.loc[idiom_idx + 2:, :])
    cmp = [
        ["it must be not easy to find how to" + words_after_what_to],
        ["now you are looking for the way to" + words_after_what_to],
        ["should be not that easy to find how to" + words_after_what_to],
    ]
    encourage = [
        [
            "but i am sure that thinking about it and speaking out it helps you🤗"
        ],
        [
            "eventho its always tough to find the right way, you try to figure it out. Thats impressing me😊"
        ],
        [
            "plz let me know any idea comes to your mind now. it might help you figuring it out☺️"
        ],
        [
            "tell me if you have any little idea. It could help you finding ur way😊"
        ],
    ]
    return (cmp[randint(0, len(cmp) - 1)]
            + encourage[randint(0, len(encourage) - 1)])
def __alter_repeat_for_wish(fixed_df):
    """Rephrase an "S wish ..." sentence as a want-statement plus compassion.

    Extracts the subject and verb after "wish"; "you" subjects get first-person
    phrasings, others get objective-case phrasings. Returns a two-message list.
    """
    wish_idx = Nlp_util.get_idx_list_of_word("wish", fixed_df["base_form"])[0]
    subj_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
        fixed_df, Nlp_util.pos_NOUNs + Nlp_util.pos_PRPs, wish_idx,
        column_name="pos").iloc[0]
    verb_row = Nlp_util.get_wordsDF_of_wordlist_after_idx(
        fixed_df, Nlp_util.pos_VERBs, subj_row.name, column_name="pos").iloc[0]
    subj = subj_row.word
    verb = verb_row.word
    after_verb = WordFormatter.Series2Str(fixed_df.loc[verb_row.name + 1:, "word"])
    objective_subj = Nlp_util.convert_nominative_noun_to_objective(subj)
    if subj == "you":
        repeat_list = [
            ["you really want to " + verb + " " + after_verb],
            ["so you seriously hope to " + verb + " " + after_verb],
            ["so you are dying to " + verb + " " + after_verb],
        ]
    else:
        repeat_list = [
            ["you really want " + objective_subj + " to " + verb + " " + after_verb],
            ["you really wanna have " + objective_subj + " " + verb + " " + after_verb],
            ["you really wanna make " + objective_subj + " " + verb + " " + after_verb],
        ]
    cmp_list = [
        ["but sounds you feel bit too much to expect that now..?"],
        ["and sounds you feel like its impossible..?"],
        ["and seems like you dont think it never happen😓"],
    ]
    return (repeat_list[randint(0, len(repeat_list) - 1)]
            + cmp_list[randint(0, len(cmp_list) - 1)])
def __alter_repeat_for_because_sent(df, repeat_text):
    """Rework the repeat text for a sentence driven by because/since.

    A leading because/since just gets "its " prefixed; a mid-sentence
    "because of" (without a leading "it is"/"that is") is split into two
    messages. Raises UnknownError when neither pattern applies.
    """
    if df["base_form"].iloc[0] in ["because", "since"]:
        return ["its " + repeat_text]
    elif Df_util.anything_isin(["because of"], df.loc[2:, "base_form"]) \
            and not Df_util.anything_isin(["it is", "that is"],
                                          df.loc[:3, "base_form"]):
        because_of_idx = Nlp_util.get_idx_list_of_idiom(
            "because of", df["base_form"])[0]
        head = WordFormatter.Df2Str(df.loc[:because_of_idx - 1, :])
        tail = "and its" + WordFormatter.Df2Str(df.loc[because_of_idx:, :])
        return [head, tail]
    else:
        raise UnknownError
def get_idx_list_of_idiom(idiom, series):
    """Return every start index at which the multi-word idiom occurs in series.

    The idiom is tokenized and matched against same-length n-grams built
    from the joined series text.
    """
    idiom_toks = nltk.word_tokenize(idiom)
    series_ngrams = Nlp_util.create_ngrams(
        WordFormatter.Series2Str(series), len(idiom_toks))
    matches = []
    for idx, ngram in enumerate(series_ngrams):
        if ngram == idiom_toks:
            matches.append(idx)
    return matches
def test_wtok2str(self):
    """wtok2str joins tokens into a sentence; None and [] pass through."""
    cases = [
        (('i', 'am', 'sad'), "i am sad."),
        (('thank', 'you'), "thank you."),
        (None, None),
        ([], []),
    ]
    for given, expected in cases:
        with self.subTest(x=given):
            self.assertEqual(expected, WordFormatter.wtok2str(given))
def __alter_repeat_for_want_to(cls, repeat_df):
    """Turn a "want to V ..." sentence into one of several empathy replies.

    Uses the words following "want to"; falls back to a plain repeat of the
    whole sentence on any failure.
    """
    try:
        want_idx = Nlp_util.get_idx_list_of_idiom("want to", repeat_df.word)[0]
        words_after_wanna = WordFormatter.Df2Str(repeat_df[want_idx + 2:])[1:]
        options = [
            [words_after_wanna, "That's what you wanna do"],
            ["So you'd be happy if you can " + words_after_wanna + "🤔"],
            ["So there is something makes you can't " + words_after_wanna + "😢"],
            ["So now it's hard for you to " + words_after_wanna + "😓"],
        ]
        return options[randint(0, len(options) - 1)]
    except:
        logging.exception('')
        return [WordFormatter.Df2Str(repeat_df)[1:]]
def __get_sidx_of_normal_and_too_long_sent(cls, df):
    """Return the sidx values of non-special sentences longer than 75 chars.

    Special-type sentences are always kept regardless of length.
    """
    delete_sidx_list = []
    for sidx in set(df.sidx.values):
        sent_df = df[df.sidx == sidx].copy().reset_index(drop=True)
        if cls.__is_special_type(sent_df):
            continue
        if len(WordFormatter.Series2Str(sent_df.word)) > 75:
            delete_sidx_list.append(sidx)
    return delete_sidx_list
def __call__(self):
    """Build a 'cmp' response from the user's previous message.

    Re-normalizes the stored previous message, derives keyword and sentiment
    DataFrames, stores the generated responses under response_data['regular'],
    and returns response_data.
    """
    previous_msg = models.Message.fetch_previous_msg(self.user.id)
    w_toks = WordFormatter.stoks2wtoks([previous_msg])
    normalizer = MessageNormalizer()
    df = normalizer(w_toks, self.user.sender_id, from_preprocessor=False)
    kw_generator = TextKwDFGenerator()
    text_kw_df = kw_generator(df)
    score_generator = SentimentScoreDFGenerator()
    sentiment_score_df = score_generator(df, text_kw_df)
    self.response_data['regular'] = self.create_cmp(
        df, sentiment_score_df, self.response_data)
    return self.response_data
def anything_isin(word_list, series):
    """Return True if any word or idiom in word_list occurs in series.

    Single-token entries are matched element-wise against the series;
    multi-token idioms are matched as n-grams over the joined series text.

    :param word_list: list of words/idioms (each may be multi-word)
    :param series: pandas Series of single-word tokens
    """
    series_text = None  # joined series text, built lazily for idiom matching
    for word in word_list:
        tokenized_word = nltk.word_tokenize(word)
        if len(tokenized_word) == 1:
            # BUG FIX: previously tested `series.isin(word_list)` — i.e. the
            # whole list (including multi-token idioms) — instead of just the
            # current word. Test only this word.
            if series.isin([word]).any():
                return True
        else:
            if series_text is None:
                # hoisted out of the loop: the joined text never changes
                series_text = WordFormatter.Series2Str(series)
            ngrams = Nlp_util.create_ngrams(series_text, len(tokenized_word))
            if tokenized_word in ngrams:
                return True
    return False
def __is_previous_msg_cmp_makeable(cls, user_id):
    """Return True when the user's previous message can seed a 'cmp' reply.

    Requires a stored previous message whose verbs/adverbs/adjectives include
    at least one base form listed in WORD_LIST_FOR_CMP.
    """
    previous_msg = models.Message.fetch_previous_msg(user_id)
    if len(previous_msg) == 0:
        return False
    w_toks = WordFormatter.stoks2wtoks([previous_msg])
    normalizer = MessageNormalizer()
    df = normalizer(w_toks, None, from_preprocessor=False)
    target_pos = (Nlp_util.pos_VERBs + Nlp_util.pos_ADVERBs
                  + Nlp_util.pos_ADJECTIVEs)
    candidates = df[df.pos.isin(target_pos)]
    return any(candidates.base_form.isin(WORD_LIST_FOR_CMP.word.tolist()))
def __remove_stickers(w_toks):
    """Strip known sticker substrings from every sentence, then re-tokenize.

    Each occurrence is removed one at a time (re-scanning from the start, so
    occurrences created by a removal are caught too). Returns the input
    unchanged on any failure.
    """
    try:
        stickers = STICKER_DF.sticker.tolist()
        for sidx, sent in enumerate(w_toks):
            s_text = WordFormatter.SingleWToks2Str(sent)
            for sticker in stickers:
                while sticker in s_text:
                    cut = s_text.index(sticker)
                    s_text = s_text[:cut] + s_text[cut + len(sticker):]
            w_toks[sidx] = word_tokenize(s_text)
        return w_toks
    except:
        logging.exception('')
        return w_toks
def __call__(self, text_df):
    # Build a keyword DataFrame (columns sidx, widx, word, Target) from the
    # base-form tokens of text_df. Returns None when no actual keyword
    # (Target '-') was matched, or on any failure.
    w_toks = WordFormatter.Df2WToks(text_df, column_name="base_form")
    try:
        matched_list = []
        for sidx, sent in enumerate(w_toks):
            for widx, word in enumerate(sent):
                # csv lookup classifies each token as EMPHASIS / KEYWORD / other
                kw_type = self.__find_keywords_from_csv(
                    text_df, sidx, widx, word)
                if kw_type == 'EMPHASIS':
                    matched_list.append([sidx, widx, word, 'emphasis'])
                elif kw_type == 'KEYWORD':
                    matched_list.append([sidx, widx, word, '-'])
        # NOTE: also True for an empty matched_list — emphasis-only (or no)
        # matches mean there is nothing to score, so bail out.
        if all('-' not in i for i in matched_list):
            return None
    except Exception:
        logging.exception('Error at: ' + str(__name__))
        return None
    try:
        text_kw_df = pd.DataFrame(matched_list)
        text_kw_df.columns = ['sidx', 'widx', 'word', 'Target']
        # attach emphasis markers that precede/follow the matched keywords
        text_kw_df = self.__detect_following_emphasis(text_df, text_kw_df)
        text_kw_df = self.__detect_prior_emphasis(text_df, text_kw_df)
        text_kw_df.sort_values(['sidx', 'widx'],
                               ascending=[True, True],
                               inplace=True)
        text_kw_df = text_kw_df.reset_index(drop=True)
        text_kw_df = self.__add_points_text_kw_df(text_df, text_kw_df)
        return text_kw_df
    except:
        logging.exception('')
        return None
def __call__(self, text_df):
    """
    This method creates a df with information of keywords in message.
    columns are sidx, widx, word, target, emphasis, ng, iscore, kw_type, special
    Returns None when no actual keyword (Target '-') was matched.
    :param text_df: normalized message df with a base_form column
    :return: keyword df, or None
    """
    w_toks = WordFormatter.df2wtoks(text_df, column_name="base_form")
    matched_list = []
    for sidx, sent in enumerate(w_toks):
        for widx, word in enumerate(sent):
            # csv lookup classifies each token as EMPHASIS / KEYWORD / other
            kw_type = self.__find_keywords_from_csv(
                text_df, sidx, widx, word)
            if kw_type == 'EMPHASIS':
                matched_list.append([sidx, widx, word, 'emphasis'])
            elif kw_type == 'KEYWORD':
                matched_list.append([sidx, widx, word, '-'])
    # NOTE: also True for an empty matched_list — emphasis-only (or no)
    # matches mean there is nothing to score, so bail out.
    if all('-' not in i for i in matched_list):
        return None
    text_kw_df = pd.DataFrame(matched_list)
    text_kw_df.columns = ['sidx', 'widx', 'word', 'Target']
    # attach emphasis markers that precede/follow the matched keywords
    text_kw_df = self.__detect_following_emphasis(text_df, text_kw_df)
    text_kw_df = self.__detect_prior_emphasis(text_df, text_kw_df)
    text_kw_df.sort_values(['sidx', 'widx'],
                           ascending=[True, True],
                           inplace=True)
    text_kw_df = text_kw_df.reset_index(drop=True)
    text_kw_df = self.__add_points_text_kw_df(text_df, text_kw_df)
    return text_kw_df
def __generate_repeat(cls, text_df, text_kw_df, sidx_to_repeat):
    # Build the list of "repeat" reply messages, one group per sentence index
    # in sidx_to_repeat. Each sentence is matched against a prioritized chain
    # of sentence patterns; the first matching pattern produces the reply.
    repeat_list = []
    for idx, sidx in enumerate(sidx_to_repeat):
        target_df = text_df[text_df.sidx == sidx].copy().reset_index(
            drop=True)
        # word replacements (e.g. pronoun flips) applied from a csv table
        fixed_df = cls.__replace_word_by_csv(target_df)
        # TODO fix the convert_df_to_str() so that it would not need [1:] part.
        repeat_text = WordFormatter.Df2Str(fixed_df)
        # TODO here has to be same structure as message_type_filter since
        # TODO one sentence can have "want to" and despising word at the same time.
        if len(target_df) == 1 and \
                (target_df["pos"].iloc[0] in Nlp_util.pos_ADJECTIVEs
                 or target_df["word"].iloc[0] in SENTIMENTAL_NON_ADJ_WORDS.word.tolist()):
            # single-word sentiment sentence ("sad", "tired", ...)
            repeat_list += cls.create_special_repeat_for_only_one_adj_word_sent(
                target_df)
        elif cls.__mean_no_friends(target_df):
            repeat_list += cls.__create_response_for_no_friends()
        elif cls.__has_what_to_do(target_df):
            repeat_list += cls.__create_response_for_what_to_V(fixed_df)
        elif cls.__is_despising_himself(target_df):
            repeat_list += cls.__alter_repeat_euphemistic(repeat_text)
        elif cls.__has_nobody_V(target_df):
            repeat_list += cls.__alter_repeat_euphemistic(repeat_text)
        elif cls.__does_user_feel_useless(target_df):
            repeat_list += cls.__create_response_for_healing_useless()
        elif cls.__has_say_plus_bad_word(target_df):
            repeat_list += cls.__create_response_for_S_said_bad_word(
                fixed_df)
        elif cls.__exists_want_to(target_df):
            repeat_list += cls.__alter_repeat_for_want_to(fixed_df)
        elif cls.__exists_make_S_feel_ADJ(target_df):
            repeat_list += cls.__alter_repeat_for_make_S_feel_ADJ(
                target_df)
        elif cls.__has_because(target_df):
            repeat_list += cls.__alter_repeat_for_because_sent(
                fixed_df, repeat_text)
        elif cls.__exists_third_person_BeVerb_pair(target_df):
            repeat_list += cls.__alter_repeat_for_third_person_BeVerb_pair(
                repeat_text)
        elif cls.__has_dont_think_SV_sent(target_df):
            repeat_list += cls.__alter_repeat_for_dont_think_SV(fixed_df)
        elif cls.__has_wish_S_V(target_df):
            repeat_list += cls.__alter_repeat_for_wish(fixed_df)
        elif cls.__has_need_NN(target_df):
            repeat_list += cls.__alter_repeat_for_need_sent(fixed_df)
        elif cls.__exists_keyword(text_kw_df):
            # keyword-based repeat; the last sentence gets special wording
            is_last_sentence = idx == len(sidx_to_repeat) - 1
            repeat_list += cls.__alter_repeat_for_keyword(
                text_df, text_kw_df, idx, repeat_text,
                is_last_sentence=is_last_sentence)
        else:
            # no pattern matched: plain echo of the (word-replaced) sentence
            repeat_list += cls.__alter_repeat_for_plain_repeat(
                repeat_text, idx)
    print('**************111111************\n', repeat_list)
    return repeat_list