Exemplo n.º 1
0
def save_es(en_name, result):
    """Bulk-index every hit in ``result`` into the ``en_name`` index.

    Each item of ``result`` is read as a mapping with ``'_id'`` and
    ``'_source'`` keys. Items are turned into action/source pairs and
    sent through ``es_event.bulk`` 1000 documents at a time; whatever is
    left over after the loop is sent in one final request.

    ``es_event`` and ``event_type`` are not defined here; they are looked
    up as globals and must be provided by the enclosing module.

    Args:
        en_name: Name of the target index, passed as ``index=`` to
            ``es_event.bulk``.
        result: Iterable of hits to index.

    Returns:
        Always ``1``.
    """
    bulk_action = []
    count = 0
    tb = time.time()
    for weibos in result:
        source = weibos['_source']
        action = {'index': {'_id': weibos['_id']}}
        # The bulk body alternates an action line with its document.
        bulk_action.extend([action, source])
        count += 1
        if count % 1000 != 0:
            continue

        # A full batch of 1000 documents has accumulated: send and reset.
        es_event.bulk(bulk_action,
                      index=en_name,
                      doc_type=event_type,
                      timeout=100)
        bulk_action = []
        print(count)
        if count % 10000 == 0:
            te = time.time()
            print("index 10000 per %s second" % (te - tb))
            # Start the next timing window where this one ended. This has
            # to be `te`: no `ts` exists in this function, so `tb = ts`
            # would raise NameError on the 10000th document.
            tb = te
    print("all done")
    # Send the final partial batch (fewer than 1000 documents), if any.
    if bulk_action:
        es_event.bulk(bulk_action,
                      index=en_name,
                      doc_type=event_type,
                      timeout=100)
    return 1
Exemplo n.º 2
0
def find_flow_texts_scan(start_ts, end_ts, topic, en_name, keywords, mid):
    """Scan the flow-text indices for matching documents and copy them to ``en_name``.

    The query is chosen from the arguments:

    * ``keywords`` and ``mid`` both empty: wildcard match of ``topic`` on
      the ``text`` field;
    * ``mid`` of length 16: term match on ``root_mid``;
    * otherwise: a ``bool``/``should`` query with one ``keywords_string``
      term per keyword and ``minimum_should_match`` of ``'60%'``.

    Every hit returned by ``scan`` gets ``en_name`` stored in its source
    under the ``'en_name'`` key and is bulk-indexed through ``es_event``
    in batches of 1000. Scanning stops once the running count exceeds
    100000 or the scan is exhausted.

    ``get_day_zero``, ``scan``, ``es_flow_text``, ``es_event``,
    ``flow_text_index_name_pre``, ``flow_text_index_type`` and
    ``event_text_type`` are resolved from the enclosing module.

    Returns:
        Always ``1``.
    """
    day_suffixes = get_day_zero(start_ts, end_ts)

    keyword_count = len(keywords)
    mid_length = len(mid)
    if keyword_count == 0 and mid_length == 0:
        query_body = {'query': {'wildcard': {'text': '*' + topic + '*'}}}
    elif mid_length == 16:
        query_body = {'query': {'term': {'root_mid': mid}}}
    else:
        should_clauses = []
        for keyword in keywords:
            print(keyword)
            should_clauses.append({'term': {'keywords_string': keyword}})
        query_body = {
            'query': {
                'bool': {
                    'should': should_clauses,
                    'minimum_should_match': '60%'
                }
            }
        }

    print(query_body)

    # One source index per entry returned by get_day_zero.
    source_indices = [flow_text_index_name_pre + suffix
                      for suffix in day_suffixes]
    hits = scan(es_flow_text,
                index=source_indices,
                doc_type=flow_text_index_type,
                query=query_body)

    pending = []
    indexed = 0
    window_start = time.time()
    # The cap is checked before each fetch, so no hit is pulled from the
    # scan once the count has passed 100000.
    while indexed <= 100000:
        try:
            hit = next(hits)
            doc_id = hit['_id']
            document = hit['_source']
            document['en_name'] = en_name
            # The bulk body alternates an action line with its document.
            pending.extend([{"index": {"_id": doc_id}}, document])
            indexed += 1
            if indexed % 1000 != 0:
                continue

            # A full batch of 1000 documents has accumulated: send and reset.
            es_event.bulk(pending,
                          index=en_name,
                          doc_type=event_text_type,
                          timeout=100)
            pending = []
            print(indexed)
            if indexed % 10000 == 0:
                window_end = time.time()
                print("index 10000 per %s second" % (window_end - window_start))
                window_start = window_end
        except StopIteration:
            print("all done")
            break

    # Send the final partial batch, if any.
    if pending:
        es_event.bulk(pending,
                      index=en_name,
                      doc_type=event_text_type,
                      timeout=100)

    return 1