def run(self, hour_range=HOUR_RANGE):
    """Yield tweet bodies summarizing URLs posted within the last ``hour_range`` hours.

    Twitter status URLs are retweeted directly (unless ``self.debug``) and
    skipped; every other URL is rendered via ``TWEET_FORMAT`` (title, URL,
    mention count) and packed into tweets up to ``MAX_TWEET_LENGTH`` minus
    room for ``HASH_TAG``.

    :param hour_range: size of the search window in hours.
    :return: generator of tweet strings.
    """
    date_range = kuzuha.build_date_filter_by_range({'hours': hour_range})
    posts = kuzuha.search('http', _filter=date_range, sort=[])
    tweet = ''
    for (url, count) in self._count_url(posts).most_common():
        if url.startswith('https://twitter.com/'):
            tweet_id = self.extract_tweet_id(url)
            if tweet_id:
                logger.info('RT: id=%s (%s)' % (tweet_id, url))
                if not self.debug:
                    try:
                        self.twitter.api.statuses.retweet(id=tweet_id)
                    except TwitterHTTPError as e:
                        # logger.warn() is deprecated since Python 3.3;
                        # use the canonical warning() instead.
                        logger.warning('%s %s' % (type(e), str(e)))
                # Retweeted (or would be); do not also list it in the digest.
                continue
        title = self._get_title(url)
        new_url_info = TWEET_FORMAT % (title, url, count)
        expected_length = self.calc_tweet_length(tweet, title, count)
        if expected_length < (MAX_TWEET_LENGTH - len(HASH_TAG)):
            tweet += new_url_info
        else:
            # Current tweet is full: strip the trailing delimiter, close it
            # with the hash tag, and start a fresh tweet with this URL.
            tweet = tweet[:-len(DELIMITER)] + HASH_TAG
            if tweet != HASH_TAG:
                tweet = tweet.replace('\n', '').replace('\r', '')
                yield tweet
            tweet = new_url_info
    if tweet:
        # Flush the final partial tweet.
        if tweet.endswith(DELIMITER):
            tweet = tweet[:-len(DELIMITER)]
        tweet = tweet.replace('\n', '').replace('\r', '')
        yield tweet + HASH_TAG
def morphological_analysis():
    """Render ma.html; on POST, analyse a one-hour window of posts.

    The window starts ``hours`` hours ago (from the form) and spans one
    hour. Word frequencies and the lines containing each word are fed to
    ``dump_all_words`` / ``suspected_word`` for display.
    """
    if request.method != 'POST':
        return render_template('ma.html')
    hours = int(request.form.get('hours'))
    window_start = datetime.now() - timedelta(hours=hours)
    window_end = window_start + timedelta(hours=1)
    dt_range = DT_RANGE.copy()
    dt_range['range']['dt']['from'] = window_start.strftime('%Y-%m-%dT%H:%M:%S')
    dt_range['range']['dt']['to'] = window_end.strftime('%Y-%m-%dT%H:%M:%S')
    posts = kuzuha.search('', field='text', _filter=dt_range, sort=[], size=10000)
    word_counter = Counter()
    word_contain_posts = defaultdict(list)
    for post in posts:
        text = post.get('text')
        if not text:
            continue
        for line in text.splitlines():
            line_words = Counter(count_words(line))
            word_counter += line_words
            for word in line_words:
                word_contain_posts[word].append(line)
    all_result = dump_all_words(word_counter, word_contain_posts)
    suspects = suspected_word(word_counter, word_contain_posts)
    return render_template('ma.html', all_result=Markup(all_result),
                           suspects=Markup(suspects), hours=hours)
def run(self, interval=20):
    """Detect 'ome' (duplicate) responses among recent posts and yield reports.

    :param interval: search window in minutes.
    :return: generator of report messages, one per parent post that has
        more than one mutually-duplicated response.
    """
    date_filter = kuzuha.build_date_filter_by_range({'minutes': interval})
    posts = kuzuha.search('', _filter=date_filter)
    for parent, responses in self.get_post_res_pairs(posts).items():
        if len(responses) < 2:
            continue
        ome_posts = set()
        logger.info('MENTIONED POST: %s' % parent)
        for lhs, rhs in itertools.combinations(responses, 2):
            logger.info('Compare "%s" with "%s"' % (lhs, rhs))
            if lhs and rhs and self.is_ome(lhs, rhs):
                logger.info('"%s" and "%s" are OME' % (lhs, rhs))
                ome_posts.update((lhs, rhs))
        if len(ome_posts) <= 1:
            continue
        num_posts = len(ome_posts) + 1  # children plus the parent post
        # Budget per post: total body length minus 2 bracket chars each.
        max_length = (body_length - num_posts * 2) // num_posts
        quoted = ''.join('「%s」' % self.shorten(p, max_length)
                         for p in sorted(ome_posts))
        yield '%s『%s』%s%s' % (PREFIX, self.shorten(parent, max_length),
                              quoted, HASH_TAG)
def _get_log(self, hours=1):
    """Fetch posts from the last ``hours`` hours and record the window on self.

    Side effects: sets ``self.year``/``month``/``day`` (strings) and
    ``self.start_hour``/``end_hour`` (ints) from the filter's dt bounds.
    """
    _filter = kuzuha.build_hour_filter(hours)
    gte = _filter['range']['dt']['gte']
    lte = _filter['range']['dt']['lte']
    self.year = gte[0:4]
    self.month = gte[5:7]
    self.day = gte[8:10]
    self.start_hour = int(gte[11:13])
    self.end_hour = int(lte[11:13]) + 1
    return kuzuha.search(_filter=_filter)
def _get_log(self, hours=1):
    """Fetch posts (unsorted) from the last ``hours`` hours; record the window on self.

    Side effects: sets ``self.year``/``month``/``day`` (strings) and
    ``self.start_hour``/``end_hour`` (ints) from the filter's dt bounds.
    """
    _filter = kuzuha.build_hour_filter(hours)
    gte = _filter['range']['dt']['gte']
    lte = _filter['range']['dt']['lte']
    self.year = gte[0:4]
    self.month = gte[5:7]
    self.day = gte[8:10]
    self.start_hour = int(gte[11:13])
    self.end_hour = int(lte[11:13]) + 1
    return kuzuha.search(_filter=_filter, sort=[])
def run(self):
    """Scan the last hour of posts and return the first extractable 5-7-5 senryu.

    Returns ``None`` implicitly when no post yields a senryu.
    """
    hour_filter = kuzuha.build_hour_filter(1)
    for post in kuzuha.search(_filter=hour_filter, sort=[('_score', 'desc')]):
        text = normalize.normalize(post['text'], repeat=4)
        text = regex.re_html_tag.sub('', text)
        text = regex.re_url.sub('', text)
        senryu = self.extract575(text)
        if senryu:
            return senryu + ' #みさお川柳'
def respond_oshiete(text):
    """Yield answers for an 'oshiete' (tell-me) question found in ``text``.

    Ends with a not-found message so callers always receive at least one
    candidate; returns None (empty generator) when the pattern is absent.
    """
    matched = re_oshiete.search(text)
    if not matched:
        return None
    query = matched.group('query')
    posts = list(kuzuha.search(query, field='text', size=5))
    yield from _extract_oshiete_answer(query, posts)
    yield NOT_FOUND_MESSAGE % query
def find(self, lemma, query, date_range):
    """Return the first clause containing ``lemma`` from posts matching ``query``.

    The clause has stray right-parenthesis artifacts normalized via
    ``re_right``. Returns '' when nothing matches.
    """
    posts = kuzuha.search(query, field='text', _filter=date_range, sort=[])
    for post in posts:
        clause = self.find_clause(lemma, cabocha.parse(post['text']), query[0])
        if clause:
            return re_right.sub(')', clause)
    return ''
def respond_oshiete(text, *args):
    """Yield answers for an 'oshiete' (tell-me) question found in ``text``.

    Extra positional args are accepted for responder-interface
    compatibility and ignored. Ends with a not-found message so callers
    always receive at least one candidate.
    """
    matched = re_oshiete.search(text)
    if not matched:
        return None
    query = matched.group('query')
    posts = list(kuzuha.search(query, field='text', size=5))
    yield from _extract_oshiete_answer(query, posts)
    yield NOT_FOUND_MESSAGE % query
def _extract_response_by_search(query, or_flag):
    """Yield validated responses from posts matching ``query``, shortest q1 first.

    :param or_flag: when truthy, search terms are OR-combined; otherwise AND.
    """
    validated_query = _validate_query(query)
    if not validated_query:
        return
    operator = 'or' if or_flag else 'and'
    posts = kuzuha.search(validated_query, _operator=operator, size=200)
    for post in sorted(posts, key=lambda p: len(p['q1'])):
        response = _validate_post(post)
        if response:
            yield response
def _extract_response_by_search(query, or_flag):
    """Yield validated responses from the mirai/qwerty/misao indices, shortest q1 first.

    :param or_flag: when truthy, search terms are OR-combined; otherwise AND.
    """
    validated_query = _validate_query(query)
    if not validated_query:
        return
    operator = 'or' if or_flag else 'and'
    posts = kuzuha.search(validated_query, _operator=operator, size=200,
                          indices=['mirai', 'qwerty', 'misao'])
    for post in sorted(posts, key=lambda p: len(p['q1'])):
        response = _validate_post(post)
        if response:
            yield response
def respond_what_who(text, *args):
    """
    何がXXX? -> YYYがXXX
    e.g. 何がおかしい? -> 頭がおかしい
    (Answers "what/who is X?" by searching past posts for "...がX"/"...はX".)
    """
    predicate = _build_what_who_query(text)
    if not predicate:
        return
    query = 'が%s は%s' % (predicate, predicate)
    posts = kuzuha.search(query, field='text', sort=[('dt', 'desc')],
                          _operator='or', size=50)
    for post in posts:
        body = post['text']
        # Keep answers of moderate length that contain no demonstratives.
        if 4 < len(body) < 120 and not mecab.has_demonstrative(body):
            yield body
def respond_what_who(text):
    """
    何がXXX? -> YYYがXXX
    e.g. 何がおかしい? -> 頭がおかしい
    (Answers "what/who is X?" by searching past posts for "...がX"/"...はX".)
    """
    predicate = _build_what_who_query(text)
    if not predicate:
        return
    query = 'が%s は%s' % (predicate, predicate)
    posts = kuzuha.search(query, field='text', sort=[('dt', 'desc')],
                          _operator='or', size=50)
    for post in posts:
        body = post['text']
        # Keep answers of moderate length that contain no demonstratives.
        if 4 < len(body) < 120 and not mecab.has_demonstrative(body):
            yield body
def _extract_response_by_search(query, or_flag):
    """Yield validated responses from quoted posts matching ``query``.

    Only posts that have been quoted at least once (script filter on
    ``log.quoted_by``) are considered; candidates come shortest-q1 first.
    """
    validated_query = _validate_query(query)
    if not validated_query:
        return
    operator = 'or' if or_flag else 'and'
    quoted_only = {
        "script": {
            "script": "doc['log.quoted_by'].size() > 0",
        }
    }
    posts = kuzuha.search(validated_query, _operator=operator,
                          _filter=quoted_only, size=200)
    for post in sorted(posts, key=lambda p: len(p['q1'])):
        response = _validate_post(post)
        if response:
            yield response
def run(self, interval=60, min_length=40):
    """Build a Markov-chain sentence from posts of the last ``interval`` minutes.

    Each post becomes a token list of BOS/EOS-delimited lines where every
    token is 'surface,POS-features' (first five MeCab feature fields).
    Posts containing 'アニメ時報' are skipped.

    :param min_length: minimum length passed to the generator.
    """
    m_generator = markov.MarkovGenerater()
    tagger = mecab.MeCabWrapper()
    date_filter = kuzuha.build_date_filter_by_range({'minutes': interval})
    posts = kuzuha.search('', _filter=date_filter, sort=[])
    words = []
    for post in posts:
        if 'text' not in post:
            continue
        text = regex.re_a_tag.sub('', post['text'])
        text = normalize.normalize(text)
        if 'アニメ時報' in text:
            continue
        tokens = []
        for line in text.splitlines():
            tokens.append(BOS)
            for node in tagger.parse_to_node(line):
                tokens.append('%s,%s' % (node.surface,
                                         ''.join(node.feature.split(',')[:5])))
            tokens.append(EOS)
        words.append(tokens)
    return m_generator.generate(words, min_length)
def run(self, interval=20):
    """Detect 'ome' (duplicate) responses among recent posts and yield reports.

    :param interval: search window in minutes.
    :return: generator of report messages, one per parent post that has
        more than one mutually-duplicated response.
    """
    date_filter = kuzuha.build_date_filter_by_range({'minutes': interval})
    posts = kuzuha.search('', _filter=date_filter)
    for parent, responses in self.get_post_res_pairs(posts).items():
        if len(responses) < 2:
            continue
        ome_posts = set()
        logger.info('MENTIONED POST: %s' % parent)
        for lhs, rhs in itertools.combinations(responses, 2):
            logger.info('Compare "%s" with "%s"' % (lhs, rhs))
            if lhs and rhs and self.is_ome(lhs, rhs):
                logger.info('"%s" and "%s" are OME' % (lhs, rhs))
                ome_posts.update((lhs, rhs))
        if len(ome_posts) <= 1:
            continue
        num_posts = len(ome_posts) + 1  # children plus the parent post
        # Budget per post: total body length minus 2 bracket chars each.
        max_length = (body_length - num_posts * 2) // num_posts
        quoted = ''.join('「%s」' % self.shorten(p, max_length)
                         for p in sorted(ome_posts))
        yield '%s『%s』%s%s' % (PREFIX, self.shorten(parent, max_length),
                              quoted, HASH_TAG)
def get_log(target_dt):
    """Return all posts dated on ``target_dt``'s day (through 23:59), oldest first."""
    end_of_day = target_dt.replace(hour=23, minute=59)
    date_range = kuzuha.build_date_filter(target_dt, end_of_day)
    return list(kuzuha.search('', _filter=date_range,
                              sort=[('dt', 'asc')], _id=True))
def _get_log(self, hours=1):
    """Fetch posts from the last ``hours`` hours; record the hour window on self.

    Side effects: sets ``self.start_hour``/``end_hour`` (ints) from the
    filter's dt bounds.
    """
    _filter = kuzuha.build_hour_filter(hours)
    gte = _filter['range']['dt']['gte']
    lte = _filter['range']['dt']['lte']
    self.start_hour = int(gte[11:13])
    self.end_hour = int(lte[11:13]) + 1
    return kuzuha.search(_filter=_filter)
def respond(text, *args):
    """Yield replies built from the longest common substrings of matching logs.

    Words are extracted from ``text``, matching posts are cleansed and
    concatenated, and each non-empty common substring is yielded with a
    trailing kaomoji.
    """
    logs = kuzuha.search(mecab.extract_word(text))
    joined = ''.join(cleansing(log.get('text', '')) for log in logs)
    for message in get_longest_common_substring(joined):
        if message:
            yield message + '(;´Д`)'