Пример #1
0
def save_es(en_name, result):
    """Bulk-index a sequence of hits into the ``en_name`` index.

    Documents are sent through ``weibo_es.bulk`` in batches of 1000 using
    the module-level ``topic_index_type`` as doc type; whatever is left
    after the last full batch is sent once the loop ends.

    Args:
        en_name: Name of the target index.
        result: Iterable of hit dicts, each providing ``'_id'`` and
            ``'_source'`` keys.

    Returns:
        Always ``1``.
    """
    bulk_action = []
    count = 0
    tb = time.time()
    for weibos in result:
        # A bulk body alternates an action entry with the document itself.
        action = {'index': {'_id': weibos['_id']}}
        bulk_action.extend([action, weibos['_source']])
        count += 1
        if count % 1000 == 0:
            weibo_es.bulk(bulk_action,
                          index=en_name,
                          doc_type=topic_index_type,
                          timeout=100)
            bulk_action = []
            print(count)
            if count % 10000 == 0:
                te = time.time()
                print("index 10000 per %s second" % (te - tb))
                # Restart the timer for the next 10000-document window.
                # The original assigned an undefined name here, which
                # raised NameError on the 10000th document.
                tb = te
    print("all done")
    # Send the final partial batch; skipped when nothing is pending.
    if bulk_action:
        weibo_es.bulk(bulk_action,
                      index=en_name,
                      doc_type=topic_index_type,
                      timeout=100)
    return 1
Пример #2
0
def gexf2es(indexname, value):
    """Store ``value`` as the document with ``_id`` 1 in ``indexname``.

    ``value`` is serialised with ``json.dumps`` and submitted through
    ``es.bulk`` with doc type ``'text'`` and a timeout of 600.
    """
    payload = [{"index": {"_id": 1}}, json.dumps(value)]
    es.bulk(payload, index=indexname, doc_type='text', timeout=600)
Пример #3
0
def gexf2es(indexname, value):
    """Index the JSON form of ``value`` under ``_id`` 1 in ``indexname``.

    The request is a single index action followed by the serialised
    document, sent via ``es.bulk`` (doc type ``'text'``, timeout 600).
    """
    header = {"index": {"_id": 1}}
    serialized = json.dumps(value)
    request_lines = [header, serialized]
    es.bulk(request_lines, index=indexname, doc_type='text', timeout=600)
Пример #4
0
def txt2es(filename, name):
    weibo = []
    f = open(filename, 'r')
    i = 0
    bulk_action = []
    for line0 in f:
        line0 = json.loads(line0)
        #print line0[-1]
        count = 0
        for line in line0:
            #weibo.append(line)
            #print line['_source']['mid'],type(line['_source']['mid'])
            action = {"index": {"_id": line['_source']['mid']}}
            source = line['_source']
            count += 1
            bulk_action.extend([action, source])
            if count % 1000 == 0:
                print es.bulk(bulk_action,
                              index=name,
                              doc_type='text',
                              timeout=600)
                bulk_action = []
                print count
                #print len(bulk_action)
            print len(bulk_action)
        #print bulk_action

    #print es
    print name, type(name), name.decode('utf-8')
    print es.bulk(bulk_action, index=name, doc_type='text', timeout=600)
Пример #5
0
def txt2es(filename, name):
    """Bulk-index the records stored in ``filename`` into index ``name``.

    Every line of the file is decoded with ``json.loads`` and is expected
    to yield a sequence of records. Each record's ``'_source'`` dict is
    indexed with doc type ``'text'``, using ``record['_source']['mid']``
    as the document ``_id``. Batches are sent through ``es.bulk`` and the
    responses are printed.

    Args:
        filename: Path of the input file.
        name: Name of the target index.
    """
    bulk_action = []
    # The context manager closes the file even if parsing or indexing
    # raises; the original never closed the handle.
    with open(filename, 'r') as f:
        for line0 in f:
            # The counter restarts for every line of the file, while the
            # pending buffer carries over from one line to the next.
            count = 0
            for line in json.loads(line0):
                action = {"index": {"_id": line['_source']['mid']}}
                bulk_action.extend([action, line['_source']])
                count += 1
                if count % 1000 == 0:
                    print(es.bulk(bulk_action, index=name, doc_type='text', timeout=600))
                    bulk_action = []
                    print(count)
    # Send the records left over after the last full batch. The original
    # dropped them because it only flushed on multiples of 1000.
    if bulk_action:
        print(es.bulk(bulk_action, index=name, doc_type='text', timeout=600))
def find_flow_texts_scan(start_ts, end_ts, topic, en_name, keywords):
    """Copy matching flow-text documents into the event-text index.

    Builds a wildcard query on the ``text`` field, scans the flow-text
    indices derived from ``get_day_zero(start_ts, end_ts)`` and re-indexes
    each hit through ``weibo_es.bulk`` into the module-level
    ``event_text`` index, tagging it with ``en_name``. At most 5001 hits
    are copied.

    Args:
        start_ts: Passed to ``get_day_zero`` to select the indices.
        end_ts: Passed to ``get_day_zero`` to select the indices.
        topic: Term used for the wildcard query when ``keywords`` is empty.
        en_name: Value written to the ``'en_name'`` field of every copied
            document.
        keywords: Terms combined as ``should`` wildcard clauses with
            ``minimum_should_match`` of ``'60%'``; when empty, ``topic``
            alone is used.

    Returns:
        Always ``1``.
    """
    index_names = get_day_zero(start_ts, end_ts)
    if not keywords:
        query_body = {'query': {'wildcard': {'text': '*' + topic + '*'}}}
    else:
        keywords_list = []
        for i in keywords:
            print(i)
            keywords_list.append({'wildcard': {'text': '*' + i + '*'}})
        query_body = {
            'query': {
                'bool': {
                    'should': keywords_list,
                    'minimum_should_match': '60%'
                }
            }
        }

    print(query_body)
    index_list = [flow_text_index_name_pre + index_name
                  for index_name in index_names]
    s_re = scan(es_flow_text,
                index=index_list,
                doc_type=flow_text_index_type,
                query=query_body)
    bulk_action = []
    count = 0
    tb = time.time()
    # Iterating with ``for`` ends cleanly when the scan is exhausted. The
    # original caught StopIteration inside ``while True`` without leaving
    # the loop, so it never terminated when the scan ran out before the cap.
    for scan_re in s_re:
        source = scan_re['_source']
        source['en_name'] = en_name
        action = {"index": {"_id": scan_re['_id']}}
        bulk_action.extend([action, source])
        count += 1
        if count % 1000 == 0:
            weibo_es.bulk(bulk_action,
                          index=event_text,
                          doc_type=event_text_type,
                          timeout=100)
            bulk_action = []
            print(count)
            # With the cap below, this branch is currently never reached.
            if count % 10000 == 0:
                te = time.time()
                print("index 10000 per %s second" % (te - tb))
                tb = te
        # Same cap as before: stop once more than 5000 hits were copied,
        # without fetching another hit from the scan.
        if count > 5000:
            break
    else:
        # Only reached when the scan ran out before hitting the cap.
        print("all done")
    # Send the final partial batch; skipped when nothing is pending.
    if bulk_action:
        weibo_es.bulk(bulk_action,
                      index=event_text,
                      doc_type=event_text_type,
                      timeout=100)

    return 1