Example #1
def input_hashtag(index_pos):
	# Scan every document of one flow_text index, pull the hashtags out of
	# its text and write them back through bulk update actions.
	query_body = {
		'query': {
			'match_all': {}
		}
	}
	index_name = flow_text_index_name_pre + index_pos

	# Iterate over the whole index with the scan/scroll helper, 1000
	# documents per scroll page, instead of a single search request.
	es_scan = scan(es_flow_text, index=index_name, doc_type=flow_text_index_type,
					query=query_body, size=1000)

	# Weibo hashtags are wrapped in a pair of '#' signs; the character class
	# covers ASCII word characters plus the common CJK ranges.
	RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)

	bulk_action = []
	count = 0

	while True:
		try:
			data = es_scan.next()
		except StopIteration:
			break

		_id = data['_id']
		item = data['_source']
		text = item['text']

		if isinstance(text, str):
			text = text.decode('utf-8', 'ignore')
		hashtag_list = RE.findall(text)

		if hashtag_list:
			hashtag = '&'.join(hashtag_list)
		else:
			hashtag = ''

		# Every update is a pair: the action metadata line, then the partial doc.
		bulk_action.extend([{'update': {'_id': _id}}, {'doc': {'hashtag': hashtag}}])
		count += 1

		# Flush every 1000 documents to keep the bulk request body small.
		if count % 1000 == 0:
			es_flow_text.bulk(bulk_action, index=index_name, doc_type=flow_text_index_type, timeout=600)
			bulk_action = []
			print count

	# Flush whatever is left over once the scan is exhausted.
	if bulk_action:
		print es_flow_text.bulk(bulk_action, index=index_name, doc_type=flow_text_index_type, timeout=600)
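
The pattern compiled above only captures text wrapped in a pair of '#' signs, which is how topics are written on Weibo; the character class allows ASCII word characters plus CJK ideographs. A minimal standalone check of that regex (the sample string is made up for illustration):

# -*- coding: utf-8 -*-
import re

RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)

text = u'#test# some text #微博# more text'  # made-up sample text
print '&'.join(RE.findall(text))             # prints: test&微博
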
def main():
    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
    else:
        date = "2013-09-05"
    index_name = flow_text_index_name_pre + date

    tb = time.time()
    count = 0
    while 1:
        user_set = cluster_redis.rpop("update_mid_list")
        if user_set:
            bulk_action = json.loads(user_set)
            es_text.bulk(bulk_action, index=index_name, doc_type=flow_text_index_type, timeout=600)
            count += 1000
            ts = time.time()
            print "%s : %s" % (count, ts - tb)
            tb = ts
        else:
            break
Example #3
def main():
    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre + date

    tb = time.time()
    count = 0
    while 1:
        user_set = cluster_redis.rpop('update_mid_list')
        if user_set:
            bulk_action = json.loads(user_set)
            es_text.bulk(bulk_action,
                         index=index_name,
                         doc_type=flow_text_index_type,
                         timeout=600)
            count += 1000
            ts = time.time()
            print "%s : %s" % (count, ts - tb)
            tb = ts
        else:
            break
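
The main() loops above and below only consume: they pop JSON-encoded bulk bodies from the Redis list update_mid_list and hand them straight to es_text.bulk() (the elasticsearch-py client used here accepts a list of action/document dicts and serializes it into the newline-delimited _bulk format). The producer that fills the queue is not part of these examples; a hypothetical sketch of that side, assuming each payload carries about 1000 action/document pairs (which is what the count += 1000 bookkeeping implies) and using made-up field names and connection settings:

import json
import redis

r_queue = redis.StrictRedis(host='127.0.0.1', port=6379)  # connection details are an assumption

def push_batch(weibo_items):
    # Build one bulk body for the whole batch: every document is preceded by
    # its action metadata line, then the list is queued as a single JSON string.
    bulk_action = []
    for item in weibo_items:
        bulk_action.append({'index': {'_id': item['mid']}})  # 'mid' is an assumed id field
        bulk_action.append(item)
    # lpush here paired with rpop in the consumers gives FIFO ordering
    r_queue.lpush('update_mid_list', json.dumps(bulk_action))
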
def main():
    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre+date

    print index_name
    tb = time.time()
    count = 0
    while 1:
        user_set = r_flow.rpop('update_mid_list')
        if user_set:
            bulk_action = json.loads(user_set)
            es_text.bulk(bulk_action, index=index_name, doc_type=flow_text_index_type, timeout=600)
            count += 1000
            ts = time.time()
            if count % 1000000 == 0:  # report progress once every 1000 popped payloads
                print "%s : %s" % (count, ts - tb)
                tb = ts
        else:
            # queue is empty: keep the worker alive and poll again later
            time.sleep(100)
                        if word in sensitive_count_dict:
                            sensitive_count_dict[word] += sensitive_words_dict[word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[word]
                    # store the merged per-user sensitive-word counts for this time slot
                    r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict))

            # decide whether this weibo belongs to a different daily index (new ES mapping needed)
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:
                if action != [] and xdata != []:
                    # flush what has been collected for the old day before switching
                    index_name = index_name_pre + now_index_name_date
                    if bulk_action:
                        es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
                    bulk_action = []
                    count = 0
                    now_index_name_date = should_index_name_date
                    index_name = index_name_pre + now_index_name_date
                    get_mappings(index_name)

            # queue the index action and document body for the next bulk request
            action, xdata = expand_index_action(item)
            bulk_action.extend([action, xdata])
            count += 1

        # flush a full batch of 1000 documents
        if count % 1000 == 0 and count != 0:
            index_name = index_name_pre + now_index_name_date
            if bulk_action:
                es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
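
The fragment above routes documents to daily indices: ts2datetime turns the weibo's unix timestamp into the date string appended to index_name_pre, and when that date no longer matches the day currently being written, the pending bulk body is flushed and get_mappings() prepares the next day's index. ts2datetime itself is defined elsewhere in the project; a plausible stand-in, shown only to make the date-rollover concrete:

import time

def ts2datetime(ts):
    # hypothetical stand-in for the project's helper: unix timestamp -> 'YYYY-MM-DD'
    return time.strftime('%Y-%m-%d', time.localtime(ts))

print ts2datetime(1378339200)  # '2013-09-05' on a UTC (or UTC+8) machine; output depends on the local timezone
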
Example #6
                                word]
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_words_dict))

            # decide whether this weibo belongs to a different daily index (new ES mapping needed)
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:
                if action != [] and xdata != []:
                    index_name = index_name_pre + now_index_name_date
                    if bulk_action:
                        es.bulk(bulk_action,
                                index=index_name,
                                doc_type=index_type,
                                timeout=60)
                    bulk_action = []
                    count = 0
                    now_index_name_date = should_index_name_date
                    index_name = index_name_pre + now_index_name_date
                    get_mappings(index_name)

            # save
            action, xdata = expand_index_action(item)
            bulk_action.extend([action, xdata])
            count += 1

        if count % 1000 == 0 and count != 0:
            index_name = index_name_pre + now_index_name_date
            if bulk_action: