示例#1
0
def _build_phrase_regex(terms):
    """Build the regex that extracts the phrase surrounding ``terms`` in a hit.

    The pattern captures, in the named group ``phrase``, a run of lowercase
    letters / whitespace / apostrophes, then the first term, then (when a
    second term exists) anything followed by the second term, then another
    run of lowercase letters / whitespace / straight apostrophes.

    Only the first two terms take part in the pattern; any further terms
    are ignored. ``terms`` must contain at least one element.
    """
    # Backslashes are doubled so the regex engine receives ``\s`` without
    # depending on an unrecognised string escape.
    # NOTE(review): the leading class accepts U+2019 but the trailing one does
    # not -- kept as found; confirm whether the asymmetry is intentional.
    prefix = u"(?P<phrase>[a-z\\s'\u2019]*"
    suffix = u"[a-z\\s']*)"
    # Terms are escaped so that metacharacters in a topic term ("c++", "(a)")
    # are matched literally instead of corrupting the pattern.
    core = re.escape(terms[0])
    if len(terms) > 1:
        core += u".*" + re.escape(terms[1])
    return prefix + core + suffix


def phrase_search(topics, bibleverses, start, end, ts_field='created_at_date'):
    """Look up an example phrase in the search index for every topic term.

    For each term of each topic a phrase match query (slop 50) is run on the
    ``text`` field, filtered to documents whose ``ts_field`` lies between
    ``start`` and ``end`` (upper bound excluded) and whose ``bibleverse`` is
    one of ``bibleverses``. At most one hit is requested. When the hit's
    lowercased text matches the phrase regex, the term dict is updated
    **in place** with ``es_phrase``, ``es_score``, ``final_score``
    (``weight * es_score``), ``tweet_text`` and ``bibleverse``.

    If any extracted phrase of a topic is reported by ``has_spam_text``,
    every term of that topic gets ``is_spam = True``.

    Args:
        topics: iterable of topics; each topic is an iterable of dicts that
            provide at least the keys ``'text'`` and ``'weight'``.
        bibleverses: values passed to the ``bibleverse`` terms filter.
        start: object with ``strftime``; lower bound of the range filter.
        end: object with ``strftime``; upper bound (excluded) of the filter.
        ts_field: name of the field the range filter is applied to.

    Returns:
        A list with one entry per topic: a deep copy of the topic's terms
        sorted by ``final_score`` descending (terms without a score count
        as 0.0).
    """
    conn = get_es_connection()
    es_settings = ESSettings()
    sorted_topics = []

    for topic in topics:
        is_spam = False
        for topic_term in topic:
            # The terms depend only on the topic term, so split once here
            # rather than once per hit.
            terms = topic_term['text'].split()
            if not terms:
                # A blank term has nothing to search for and no phrase to
                # extract; skip it instead of failing on the regex later.
                continue

            filters = []
            filters.append(
                RangeFilter(qrange=ESRange(field=ts_field,
                                           from_value=start.strftime("%Y-%m-%d"),
                                           to_value=end.strftime("%Y-%m-%d"),
                                           include_upper=False))
            )
            filters.append(TermsFilter(field="bibleverse", values=bibleverses))
            q = MatchQuery('text', topic_term['text'], type='phrase', slop=50)
            q = FilteredQuery(q, ANDFilter(filters))
            q = q.search(size=1)
            resultset = conn.search(indices=es_settings.search_index,
                                    doc_types=[es_settings.search_es_type],
                                    query=q,
                                    size=1)

            for r in resultset:
                # Works for one-word terms as well as multi-word ones.
                regex = _build_phrase_regex(terms)
                ma = re.search(regex, r.text.lower())
                if not ma:
                    continue
                topic_term['es_phrase'] = ma.group('phrase').strip()
                topic_term['es_score'] = r._meta.score
                topic_term['final_score'] = topic_term['weight'] * topic_term['es_score']
                # Drop every non-ASCII character from the stored text.
                topic_term['tweet_text'] = r.text.encode('ascii', 'ignore')
                topic_term['bibleverse'] = r['bibleverse']

                # Once a topic is flagged there is no need to test again.
                if not is_spam and has_spam_text(topic_term['es_phrase']):
                    is_spam = True

        sorted_topic = sorted(topic,
                              key=lambda x: x.get('final_score', 0.0),
                              reverse=True)
        if is_spam:
            # sorted_topic holds the same dict objects as topic, so the flag
            # set here is included in the deep copy below.
            for topic_term in topic:
                topic_term['is_spam'] = True
        sorted_topics.append(copy.deepcopy(sorted_topic))
    return sorted_topics