def _phrase_pattern(text):
    """Compile the regex used to pull the matched phrase out of a hit.

    The pattern captures, as group ``phrase``, the first two whitespace-
    separated words of *text* (or the single word, if there is only one)
    together with the surrounding run of letters, whitespace and
    apostrophes. Returns ``None`` when *text* contains no words.
    """
    terms = text.split()
    if not terms:
        return None
    # The haystack is lower-cased by the caller, so lower-case the needles
    # too; escape them so punctuation in a word is matched literally instead
    # of being interpreted as regex syntax.
    # Only the first two words anchor the pattern (unchanged behaviour for
    # longer texts).
    anchors = [re.escape(term.lower()) for term in terms[:2]]
    return re.compile(
        u"(?P<phrase>[a-z\\s'\u2019]*" + u".*".join(anchors) + u"[a-z\\s']*)"
    )


def phrase_search(topics, bibleverses, start, end, ts_field='created_at_date'):
    """Look up an example phrase for every topic term and rank the terms.

    For each term dict in each topic, a phrase match query (slop 50) on the
    ``text`` field is run, filtered to documents whose *ts_field* lies
    between *start* and *end* (``include_upper=False`` is passed to the
    range) and whose ``bibleverse`` is one of *bibleverses*. At most one hit
    is requested. When the hit's lower-cased text matches the term's phrase
    pattern, the term dict gains the keys ``es_phrase``, ``es_score``,
    ``final_score`` (``weight * es_score``), ``tweet_text`` and
    ``bibleverse``.

    If ``has_spam_text`` flags any extracted phrase of a topic, every term
    of that topic is marked with ``is_spam = True``.

    Args:
        topics: iterable of topics; each topic is an iterable of dicts that
            carry at least ``text`` and ``weight``.
        bibleverses: values passed to the ``bibleverse`` terms filter.
        start: object with ``strftime``; lower bound of the range filter.
        end: object with ``strftime``; upper bound of the range filter.
        ts_field: name of the document field the range filter applies to.

    Returns:
        A list with one entry per topic: a deep copy of the topic's term
        dicts sorted by ``final_score`` descending (terms without a score
        sort as ``0.0``).

    Note:
        The term dicts inside *topics* are updated in place; only the
        returned lists are copies. Terms whose ``text`` is empty are skipped
        without querying.
    """
    conn = get_es_connection()
    es_settings = ESSettings()

    # The date window is identical for every query, so format it once.
    from_day = start.strftime("%Y-%m-%d")
    to_day = end.strftime("%Y-%m-%d")

    sorted_topics = []
    for topic in topics:
        is_spam = False
        for topic_term in topic:
            # Build the extraction regex once per term rather than per hit.
            pattern = _phrase_pattern(topic_term['text'])
            if pattern is None:
                continue

            filters = [
                RangeFilter(qrange=ESRange(field=ts_field,
                                           from_value=from_day,
                                           to_value=to_day,
                                           include_upper=False)),
                TermsFilter(field="bibleverse", values=bibleverses),
            ]
            q = MatchQuery('text', topic_term['text'], type='phrase', slop=50)
            q = FilteredQuery(q, ANDFilter(filters))
            q = q.search(size=1)
            resultset = conn.search(indices=es_settings.search_index,
                                    doc_types=[es_settings.search_es_type],
                                    query=q,
                                    size=1)

            for r in resultset:
                ma = pattern.search(r.text.lower())
                if not ma:
                    continue
                topic_term['es_phrase'] = ma.group('phrase').strip()
                topic_term['es_score'] = r._meta.score
                topic_term['final_score'] = topic_term['weight'] * topic_term['es_score']
                topic_term['tweet_text'] = r.text.encode('ascii', 'ignore')
                topic_term['bibleverse'] = r['bibleverse']
                # One spammy phrase is enough to flag the whole topic.
                if not is_spam and has_spam_text(topic_term['es_phrase']):
                    is_spam = True

        sorted_topic = sorted(topic, key=lambda x: x.get('final_score', 0.0), reverse=True)
        if is_spam:
            # sorted_topic holds the same dict objects, so the flag is
            # visible in the copy made below.
            for topic_term in topic:
                topic_term['is_spam'] = True
        sorted_topics.append(copy.deepcopy(sorted_topic))
    return sorted_topics