def make_response(self, text, user_info=DEFAULT_USER, global_context=None):
    """Build a reply dict for *text*.

    Tries each responder in RESPONDING_METHODS in order and returns the
    first candidate that has not been used before (not in the user's
    past replies nor in the shared global context).

    Args:
        text: raw input text; normalized before matching.
        user_info: per-user state; must expose a 'replies' collection.
        global_context: previously issued replies shared across users.
            Defaults to an empty list (was a mutable default argument).

    Returns:
        dict with at least a 'text' key, after %name substitution.
    """
    if global_context is None:
        global_context = []  # avoid the shared mutable-default pitfall
    text = normalize.normalize(text)
    response = ''
    for method in RESPONDING_METHODS:
        for response in method(text, user_info):
            if isinstance(response, dict):
                # Clean the dict in place so the returned dict is cleaned too.
                response['text'] = response.get('text', '').strip()
                candidate = response['text']
            else:
                # NOTE: str responses are only rstripped (dicts are stripped),
                # preserving the original asymmetry.
                response = response.rstrip()
                candidate = response
            # Accept only replies that have not been produced before.
            if candidate not in user_info['replies'] and candidate not in global_context:
                break
        else:
            continue  # this responder produced nothing fresh; try the next one
        break  # a fresh response was found
    if not response:
        response = {'text': 'ああ(;´Д`)'}
    elif isinstance(response, str):
        response = {'text': response}
    response['text'] = self.replace_name(response['text'], user_info)
    return response
def run(self):
    """Return a senryu built from the best-scored post of the last hour.

    Posts are cleaned (repeat-normalized, HTML tags and URLs removed)
    before 5-7-5 extraction; returns None implicitly when nothing fits.
    """
    hour_filter = kuzuha.build_hour_filter(1)
    for post in kuzuha.search(_filter=hour_filter, sort=[('_score', 'desc')]):
        cleaned = normalize.normalize(post['text'], repeat=4)
        cleaned = regex.re_html_tag.sub('', cleaned)
        cleaned = regex.re_url.sub('', cleaned)
        senryu = self.extract575(cleaned)
        if senryu:
            return senryu + ' #みさお川柳'
def _extract_oshiete_answer(query, posts):
    """Yield answer snippets for *query* found in *posts*.

    For each case-marking particle (most specific first) a pattern
    '<query><noun suffix><particle><2+ chars>' is compiled and matched
    against every normalized post text; sane-length matches without
    NG substrings are yielded.

    Args:
        query: topic string interpolated into the pattern.
        posts: iterable of dicts with a 'text' key.

    Yields:
        str: candidate answers, 4..119 characters long.
    """
    # NOTE(review): *query* is interpolated unescaped — regex metacharacters
    # in it would alter the pattern; consider re.escape(query) if queries
    # are always literal text.
    posts = list(posts)  # we iterate once per particle; support generator input
    for case_marking_particle in (u'って', u'は', u'の', ''):
        extract_rule = re.compile('(%s%s%s.{2,})' % (query, NOUN_SUFFIXES, case_marking_particle))
        for post in posts:
            text = normalize.normalize(post['text'].strip())
            match = extract_rule.search(text)  # was searched twice; do it once
            if not match:
                continue
            answer = match.group(1)
            if not answer or any(w in answer for w in NG_SUBSTRS):
                continue
            if 3 < len(answer) < 120:
                yield answer
def _get_title(self, url):
    """Derive a short, normalized title for *url*.

    Image URLs are titled from Google image-search keywords; other web
    resources from their HTML <title>. URLs with ignored extensions keep
    the empty default. The result is normalized and shortened.

    NOTE(review): the original indentation was lost; normalize/shorten
    are applied to both branches here — confirm against history.
    """
    title = ''
    root, ext = os.path.splitext(url)
    if ext in image_extensions:
        time.sleep(3)  # for avoiding to be treated as spam by Google
        logger.info('Search by google: %s', url)  # lazy %-args, not eager '%'
        results = google_image.search(url, best_kwds_max_length=18)
        # Drop purely numeric keywords; they rarely describe the image.
        keywords = (kwd for kwd in results['best_keywords'] if not kwd.isdigit())
        title = ''.join(keywords)
    elif ext not in ignore_extensions:
        logger.info('Retrieve web resource: %s', url)
        html = web.open_url(url)
        soup = BeautifulSoup(html, "html5lib")
        if soup.title and soup.title.string:
            title = soup.title.string
    title = normalize.normalize(title)
    title = self._shorten_title(title)
    return title
def run(self, interval=60, min_length=40):
    """Generate a Markov-chain sentence from posts of the last *interval* minutes.

    Each post is split into lines; every line becomes a BOS..EOS token
    sequence of 'surface,features' strings produced by MeCab. Posts
    containing 'アニメ時報' are skipped.
    """
    generator = markov.MarkovGenerater()
    tagger = mecab.MeCabWrapper()
    date_filter = kuzuha.build_date_filter_by_range({'minutes': interval})
    posts = kuzuha.search('', _filter=date_filter, sort=[])
    corpus = []
    for post in posts:
        if 'text' not in post:
            continue
        text = normalize.normalize(regex.re_a_tag.sub('', post['text']))
        if 'アニメ時報' in text:
            continue
        tokens = []
        for line in text.splitlines():
            tokens.append(BOS)
            for node in tagger.parse_to_node(line):
                tokens.append('%s,%s' % (node.surface, ''.join(node.feature.split(',')[:5])))
            tokens.append(EOS)
        corpus.append(tokens)
    return generator.generate(corpus, min_length)
def give_present(*arg):
    """Pick a random 'present' sentence, fetch a matching image, build a reply.

    Returns:
        dict: reply text containing a '%name' placeholder plus the path
        of the downloaded image for the media attachment.
    """
    present_list = file_io.read('present.txt', data=True)
    sentence = misc.choice(present_list)
    # Re-draw until the sentence looks like a usable '<something>を...' phrase.
    while ('集計' in sentence or 'シュウケイ' in sentence or 'を' not in sentence
           or sentence.endswith('萌え') or len(sentence) < 3):
        sentence = misc.choice(present_list)
    # Strip noise so the remaining phrase works as an image-search query.
    present = normalize.remove_emoticon(sentence)
    present = present.replace('!', '').replace('!', '')
    present = present.replace('漏れの', '').replace('俺の', '').replace('俺が', '')
    present = present[:-1] if present.endswith('を') else present
    search_result = google_image.search(present)
    if 'images' in search_result:
        for url in search_result['images']:
            if url.endswith(('.jpg', '.gif', '.png')):
                try:
                    web.download(url, '/tmp/present')
                    break
                except Exception:  # was bare except: don't swallow SystemExit/KeyboardInterrupt
                    continue  # best-effort: try the next candidate URL
    sentence = normalize.normalize(sentence)
    return {'text': '%nameに' + sentence, 'media[]': '/tmp/present'}
def test_normalize():
    """Runs of repeated characters are collapsed by normalize."""
    source = u'あいぼんのおまんこを指で開いてクチュクチュしたいよおおーう'
    expected = u'あいぼんのおまんこを指で開いてクチュクチュしたいよーう'
    assert normalize.normalize(source) == expected
def prepare_for_counting(text):
    """Strip anchor tags and normalize *text* (no emoticon removal, repeat=3)."""
    without_anchors = regex.re_a_tag.sub('', text)
    return normalize.normalize(without_anchors, emoticon=False, repeat=3)
def cleansing(text):
    """Flatten raw post text: strip, drop newlines/anchor tags/emoticons, normalize."""
    flattened = text.strip().replace('\n', '')
    without_anchors = regex.re_a_tag.sub('', flattened)
    without_emoticons = normalize.remove_emoticon(without_anchors)
    return normalize.normalize(without_emoticons, repeat=3)