def input_hashtag(index_pos):
    """Fill in the ``hashtag`` field of every document in one flow-text index.

    Every document of the index ``flow_text_index_name_pre + index_pos`` is
    scanned (``match_all``), the ``#...#`` delimited tags are extracted from
    its ``text`` field, and a partial update that sets ``hashtag`` to the tags
    joined with ``&`` (or to ``''`` when there are none) is queued.  Queued
    updates are sent through ``es_flow_text.bulk`` in batches.

    :param index_pos: suffix appended to ``flow_text_index_name_pre`` to form
        the index name (presumably a date string -- confirm against callers).
    """
    query_body = {'query': {'match_all': {}}}
    index_name = flow_text_index_name_pre + index_pos
    es_scan = scan(es_flow_text, index=index_name,
                   doc_type=flow_text_index_type,
                   query=query_body, size=1000)

    # Compiled once up front; the pattern does not depend on the document.
    hashtag_re = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)

    bulk_action = []
    # ``count`` is the zero-based position of the current document.
    for count, data in enumerate(es_scan):
        text = data['_source']['text']
        # Byte strings are decoded so the unicode pattern can match them;
        # undecodable bytes are dropped.  (``bytes`` is ``str`` on Python 2.)
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')

        # Joining an empty match list yields '' -- the "no hashtag" value.
        hashtag = '&'.join(hashtag_re.findall(text))
        bulk_action.extend([{'update': {'_id': data['_id']}},
                            {'doc': {'hashtag': hashtag}}])

        # Flush at positions 1000, 2000, ...; because the position is
        # zero-based the first batch carries 1001 documents.
        if count % 1000 == 0 and count != 0:
            es_flow_text.bulk(bulk_action, index=index_name,
                              doc_type=flow_text_index_type, timeout=600)
            bulk_action = []
            print(count)

    # Send whatever is left over after the last full batch.
    if bulk_action:
        print(es_flow_text.bulk(bulk_action, index=index_name,
                                doc_type=flow_text_index_type, timeout=600))
def main():
    """Replay queued bulk payloads from ``update_mid_list`` into the text index.

    The target index is ``flow_text_index_name_pre`` plus a date: the date of
    ``time.time() - DAY`` when ``RUN_TYPE`` is truthy, otherwise the fixed
    ``2013-09-05``.  Payloads are popped from the right end of the
    ``update_mid_list`` list on ``cluster_redis``, JSON-decoded and passed to
    ``es_text.bulk``.  The function returns once a pop yields nothing.
    """
    if RUN_TYPE:
        date = ts2datetime(time.time() - DAY)
    else:
        date = "2013-09-05"
    index_name = flow_text_index_name_pre + date

    processed = 0
    last_tick = time.time()
    while True:
        payload = cluster_redis.rpop("update_mid_list")
        if not payload:
            # Queue drained (or an empty payload was popped): stop.
            break

        actions = json.loads(payload)
        es_text.bulk(actions, index=index_name,
                     doc_type=flow_text_index_type, timeout=600)

        # The counter advances by a fixed 1000 per payload -- presumably the
        # producer's batch size; it is not measured here.
        processed += 1000
        now = time.time()
        print("%s : %s" % (processed, now - last_tick))
        last_tick = now
def main():
    """Drain the ``update_mid_list`` queue into the dated flow-text index.

    Each entry popped (from the right) off ``update_mid_list`` on
    ``cluster_redis`` is parsed as JSON and submitted via ``es_text.bulk`` to
    the index ``flow_text_index_name_pre + date``.  ``date`` is derived from
    ``time.time() - DAY`` when ``RUN_TYPE`` is truthy and is ``2013-09-05``
    otherwise.  After every submission a ``"<count> : <seconds>"`` line is
    printed; the loop ends when the pop returns a falsy value.
    """
    date = ts2datetime(time.time() - DAY) if RUN_TYPE else '2013-09-05'
    target_index = flow_text_index_name_pre + date

    started = time.time()
    total = 0
    while True:
        raw = cluster_redis.rpop('update_mid_list')
        if not raw:
            break

        es_text.bulk(json.loads(raw), index=target_index,
                     doc_type=flow_text_index_type, timeout=600)

        # Fixed step of 1000 per popped entry (assumed batch size).
        total += 1000
        finished = time.time()
        print("%s : %s" % (total, finished - started))
        # The next interval is measured from the end of this one.
        started = finished
def main():
    """Continuously replay queued bulk payloads from ``update_mid_list``.

    The target index is ``flow_text_index_name_pre`` plus a date: the date of
    ``time.time() - DAY`` when ``RUN_TYPE`` is truthy, otherwise the fixed
    ``2013-09-05``.  Payloads are popped from the right end of the
    ``update_mid_list`` list on ``r_flow``, JSON-decoded and passed to
    ``es_text.bulk``.

    Unlike a one-shot drain, this loop never exits: when the queue is empty it
    sleeps for 100 seconds and polls again.
    """
    if RUN_TYPE:
        date = ts2datetime(time.time() - DAY)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre + date
    print(index_name)

    tb = time.time()
    count = 0
    while True:
        user_set = r_flow.rpop('update_mid_list')
        if not user_set:
            # Nothing queued right now: wait and poll again instead of exiting.
            time.sleep(100)
            continue

        bulk_action = json.loads(user_set)
        es_text.bulk(bulk_action, index=index_name,
                     doc_type=flow_text_index_type, timeout=600)

        # The counter advances by a fixed 1000 per payload -- presumably the
        # producer's batch size; it is not measured here.
        count += 1000
        # Report once per 1,000,000 counted.  The previous test
        # (``if count % 1000000:``) was inverted: it fired on every batch
        # except the million marks.
        if count % 1000000 == 0:
            ts = time.time()
            print("%s : %s" % (count, ts - tb))
            # Restart the timer so each report covers one full window.
            tb = ts
if sensitive_count_dict.has_key(word): sensitive_count_dict[word] += sensitive_words_dict[word] else: sensitive_count_dict[word] = sensitive_words_dict[word] r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] should_index_name_date = ts2datetime(weibo_timestamp) if should_index_name_date != now_index_name_date: if action != [] and xdata != []: index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 now_index_name_date = should_index_name_date index_name = index_name_pre + now_index_name_date get_mappings(index_name) # save action, xdata = expand_index_action(item) bulk_action.extend([action, xdata]) count += 1 if count % 1000 == 0 and count != 0: index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
word] r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] should_index_name_date = ts2datetime(weibo_timestamp) if should_index_name_date != now_index_name_date: if action != [] and xdata != []: index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 now_index_name_date = should_index_name_date index_name = index_name_pre + now_index_name_date get_mappings(index_name) # save action, xdata = expand_index_action(item) bulk_action.extend([action, xdata]) count += 1 if count % 1000 == 0 and count != 0: index_name = index_name_pre + now_index_name_date if bulk_action: