def mapper_bci_history(todaydate=None):
    """Queue zeroed BCI records for history documents not updated on a day.

    Scans ``BCIHIS_INDEX_NAME`` for documents whose ``bci.update_time`` is
    not the target day and, for every hit, queues a record with
    ``total_num`` and ``today_bci`` set to 0.  Records are JSON-encoded and
    pushed onto the ``update_bci_list`` list of ``r_flow`` in batches of up
    to 1000.

    Args:
        todaydate: Day used both in the query and as ``update_time`` of the
            queued records.  When falsy, the module-level ``TODAY_TIME`` is
            used (NOTE(review): assumed to be defined at module level; it
            is not visible in this part of the file -- confirm).
    """
    # Do not assign to TODAY_TIME here: an assignment makes the name local
    # to the function, so the original raised UnboundLocalError whenever
    # todaydate was left at its default.
    update_time = todaydate if todaydate else TODAY_TIME
    es_query = {"query":{"bool":{"must":[],"must_not":[{"term":{"bci.update_time":update_time}}],"should":[{"match_all":{}}]}},"from":0,"size":MAX_ITEMS,"fields":[]}

    hits = scan(es_9200, query=es_query, index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE)
    batch = []
    for hit in hits:
        batch.append({
            'id': hit['_id'].encode("utf-8"),
            'total_num': 0,
            'today_bci': 0,
            'update_time': update_time,
        })
        if len(batch) == 1000:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []

    print("all done")
    # Only flush a non-empty remainder; an empty "[]" payload carries no
    # work for whoever reads the list.
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))
def mapper_bci_history(todaydate=None):
    """Queue zeroed BCI records (with fan counts) for stale history documents.

    Scans ``BCIHIS_INDEX_NAME`` for documents whose ``bci.update_time`` is
    not the target day, requesting the ``user_fansnum`` field.  For every
    hit a record with ``total_num`` and ``today_bci`` set to 0 is queued;
    records are JSON-encoded and pushed onto the ``update_bci_list`` list
    of ``r_flow`` in batches of 1000.  The number of processed hits is
    printed every 100000 hits and once more at the end.

    Args:
        todaydate: Day used both in the query and as ``update_time`` of the
            queued records.  When falsy, the module-level ``TODAY_TIME`` is
            used (NOTE(review): assumed to be defined at module level; it
            is not visible in this part of the file -- confirm).
    """
    # Do not assign to TODAY_TIME here: an assignment makes the name local
    # to the function, so the original raised UnboundLocalError whenever
    # todaydate was left at its default.
    update_time = todaydate if todaydate else TODAY_TIME
    es_query = {"query":{"bool":{"must_not":[{"term":{"bci.update_time":update_time}}]}},"fields":["user_fansnum"], "size":1000}

    hits = scan(es_user_profile, query=es_query, index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE)
    count = 0
    batch = []
    for hit in hits:
        record = {
            'id': hit['_id'].encode("utf-8"),
            'total_num': 0,
            'today_bci': 0,
            'update_time': update_time,
        }
        # A hit without the requested field falls back to 0 instead of
        # raising KeyError.
        fields = hit.get('fields') or {}
        record['user_fansnum'] = fields.get("user_fansnum", [0])[0]
        batch.append(record)
        count += 1
        if count % 1000 == 0:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []
            if count % 100000 == 0:
                print(count)

    print("all done")
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))

    print(count)
def mapper_bci_today(todaydate=None):
    """Queue the BCI values of one daily index for later processing.

    Scans every document of a daily BCI index, reads the ``TOTAL_NUM`` and
    ``TODAY_BCI`` fields of each hit and pushes the resulting records,
    JSON-encoded in batches of up to 1000, onto the ``update_bci_list``
    list of ``r_flow``.

    Args:
        todaydate: Day stamped into ``update_time``.  The index scanned is
            ``BCI_INDEX_NAME_PRE`` plus the date ``DAY`` earlier with the
            dashes removed.  When falsy, the hard-coded index suffix
            ``20130901`` and update time ``2013-09-02`` are used.
    """
    if todaydate:
        index_name = BCI_INDEX_NAME_PRE + ts2datetime((datetime2ts(todaydate) - DAY)).replace("-","")
        update_time = todaydate
    else:
        index_name = BCI_INDEX_NAME_PRE + '20130901'
        update_time = '2013-09-02'

    hits = scan(es_9200, query={"query":{"match_all":{}},"size":MAX_ITEMS ,"fields":[TOTAL_NUM,TODAY_BCI]}, index=index_name, doc_type=BCI_INDEX_TYPE)
    batch = []
    for hit in hits:
        batch.append({
            'id': hit['_id'].encode("utf-8"),
            'total_num': hit['fields'][TOTAL_NUM][0],
            'today_bci': hit['fields'][TODAY_BCI][0],
            'update_time': update_time,
        })
        if len(batch) == 1000:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []

    print("all done")
    # Only flush a non-empty remainder; the original pushed an empty "[]"
    # payload whenever the hit count was a multiple of 1000 (or zero).
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))
def mapper_bci_today(todaydate):
    """Queue BCI values and follower/friend counts of one daily index.

    The index scanned is ``"bci_"`` plus the date ``DAY`` before
    ``todaydate`` with the dashes removed; its name is printed first.  Each
    hit becomes a record carrying ``TOTAL_NUM``, ``TODAY_BCI``,
    ``user_fansnum`` and ``user_friendsnum``; records are JSON-encoded and
    pushed onto the ``update_bci_list`` list of ``r_flow`` in batches of
    1000.  The total number of hits is printed at the end.

    Args:
        todaydate: Day stamped into ``update_time`` of every record.
    """
    index_name = "bci_" + ts2datetime((datetime2ts(todaydate) - DAY)).replace("-", "")
    print(index_name)

    search_body = {
        "query": {"match_all": {}},
        "size": MAX_ITEMS,
        "fields": [TOTAL_NUM, TODAY_BCI, "user_fansnum", 'user_friendsnum'],
    }
    hits = scan(es_9200, query=search_body, index=index_name, doc_type=BCI_INDEX_TYPE)

    count = 0
    pending = []
    for hit in hits:
        record = {'id': hit['_id'].encode("utf-8")}
        values = hit['fields']
        record['user_fansnum'] = values["user_fansnum"][0]
        record['user_friendsnum'] = values['user_friendsnum'][0]
        record['total_num'] = values[TOTAL_NUM][0]
        record['today_bci'] = values[TODAY_BCI][0]
        record['update_time'] = todaydate
        pending.append(record)
        count += 1
        # Flush on every full thousand hits.
        if not count % 1000:
            r_flow.lpush('update_bci_list', json.dumps(pending))
            pending = []

    print("all done")
    if pending:
        r_flow.lpush('update_bci_list', json.dumps(pending))

    print(count)
def main():
    """Turn per-user counter hashes into bulk update actions and queue them.

    Iterates the members of ``user_set`` through ``r.sscan`` and reads the
    hash stored under each member with ``r.hgetall``.  A hash entry whose
    key contains one of the four weibo counter markers and whose value is
    truthy yields one ``update`` action / ``doc`` pair: the document id is
    the key's prefix before the first underscore and the doc sets
    ``retweeted`` or ``comment`` to the integer value.  Pairs are
    JSON-encoded and pushed onto the ``update_mid_list`` list of ``r_flow``
    every 400 updates, plus once for the remainder.  Prints the set size,
    the start banner (when ``RUN_TYPE`` is truthy), the index name and
    finally the number of updates.
    """
    # Checked in this order; the first marker found in a key wins.
    markers = (
        ("_origin_weibo_retweeted", "retweeted"),
        ("_origin_weibo_comment", "comment"),
        ('_retweeted_weibo_comment', "comment"),
        ('_retweeted_weibo_retweeted', "retweeted"),
    )

    cursor = 0
    count = 0
    pending = []
    print(r.scard('user_set'))

    if RUN_TYPE:
        date = ts2datetime(time.time() - DAY)
        start_time = str(ts2datetime(time.time()))
        print("/cron/push_mid2redis.py&start&%s" % start_time)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre + date
    print(index_name)

    last_flush = time.time()
    while True:
        reply = r.sscan("user_set", cursor, count=3000)
        cursor = reply[0]
        # reply[1] holds the actual members returned by this scan step
        for uid in reply[1]:
            for key, value in r.hgetall(uid).iteritems():
                if not value:
                    continue
                field = next((name for marker, name in markers if marker in key), None)
                if field is None:
                    continue
                mid = key.split('_')[0]
                doc = {field: int(value)}
                pending.extend([{"update": {"_id": mid}}, {"doc": doc}])
                count += 1
                if count % 400 == 0:
                    r_flow.lpush('update_mid_list', json.dumps(pending))
                    pending = []
                    last_flush = time.time()
        if int(cursor) == 0:
            break

    if pending:
        r_flow.lpush('update_mid_list', json.dumps(pending))

    print(count)
Exemplo n.º 6
0
def main():
    """Turn per-user counter hashes into bulk update actions and queue them.

    Iterates the members of ``user_set`` through ``r.sscan`` and reads the
    hash stored under each member with ``r.hgetall``.  A hash entry whose
    key contains ``_origin_weibo_retweeted`` or ``_origin_weibo_comment``
    and whose value is a non-zero integer yields one ``update`` action /
    ``doc`` pair: the document id is the key's prefix before the first
    underscore and the doc sets ``retweeted`` or ``comment`` to that
    integer.  Pairs are JSON-encoded and pushed onto the ``update_mid_list``
    list of ``r_flow`` every 1000 updates (printing the elapsed time since
    the previous flush), plus once for the remainder.  Prints the set size
    first and the number of updates last.
    """
    # Checked in this order; the first marker that matches wins.
    markers = (
        ("_origin_weibo_retweeted", "retweeted"),
        ("_origin_weibo_comment", "comment"),
    )

    cursor = 0
    count = 0
    pending = []
    print(r.scard('user_set'))

    if RUN_TYPE:
        date = ts2datetime(time.time() - DAY)
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre + date

    lap_start = time.time()
    while True:
        reply = r.sscan("user_set", cursor, count=3000)
        cursor = reply[0]
        # reply[1] holds the actual members returned by this scan step
        for uid in reply[1]:
            for key, value in r.hgetall(uid).iteritems():
                field = None
                for marker, name in markers:
                    # int() is only applied to values of matching keys.
                    if marker in key and int(value):
                        field = name
                        break
                if field is None:
                    continue
                mid = key.split('_')[0]
                doc = {field: int(value)}
                pending.extend([{"update": {"_id": mid}}, {"doc": doc}])
                count += 1
                if count % 1000 == 0:
                    r_flow.lpush('update_mid_list', json.dumps(pending))
                    pending = []
                    now = time.time()
                    print("%s cost %s" % (count, now - lap_start))
                    lap_start = now
        if int(cursor) == 0:
            break

    if pending:
        r_flow.lpush('update_mid_list', json.dumps(pending))

    print(count)
def mapper_bci_history(todaydate=None):
    """Queue zeroed BCI records for history documents not updated on a day.

    Scans ``BCIHIS_INDEX_NAME`` for documents whose ``bci.update_time`` is
    not the target day and, for every hit, queues a record with
    ``total_num`` and ``today_bci`` set to 0.  Records are JSON-encoded and
    pushed onto the ``update_bci_list`` list of ``r_flow`` in batches of
    1000.  The number of processed hits is printed every 100000 hits and
    once more at the end.

    Args:
        todaydate: Day used both in the query and as ``update_time`` of the
            queued records.  When falsy, the module-level ``TODAY_TIME`` is
            used (NOTE(review): assumed to be defined at module level; it
            is not visible in this part of the file -- confirm).
    """
    # Do not assign to TODAY_TIME here: an assignment makes the name local
    # to the function, so the original raised UnboundLocalError whenever
    # todaydate was left at its default.
    update_time = todaydate if todaydate else TODAY_TIME
    es_query = {
        "query": {
            "bool": {
                "must": [],
                "must_not": [{
                    "term": {
                        "bci.update_time": update_time
                    }
                }],
                "should": [{
                    "match_all": {}
                }]
            }
        },
        "from": 0,
        "size": MAX_ITEMS,
        "fields": []
    }

    hits = scan(es_user_profile,
                query=es_query,
                index=BCIHIS_INDEX_NAME,
                doc_type=BCIHIS_INDEX_TYPE)
    count = 0
    batch = []
    for hit in hits:
        batch.append({
            'id': hit['_id'].encode("utf-8"),
            'total_num': 0,
            'today_bci': 0,
            'update_time': update_time,
        })
        count += 1
        if count % 1000 == 0:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []
            if count % 100000 == 0:
                print(count)

    print("all done")
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))

    print(count)
def mapper_bci_today(todaydate):
    """Queue BCI values and follower/friend counts of one daily index.

    Scans the index named ``"bci_"`` plus the date ``DAY`` before
    ``todaydate`` (dashes removed) and prints that name.  For each hit the
    ``TOTAL_NUM``, ``TODAY_BCI``, ``user_fansnum`` and ``user_friendsnum``
    fields are copied into a record; records are JSON-encoded and pushed
    onto the ``update_bci_list`` list of ``r_flow`` 1000 at a time.  The
    total number of hits is printed at the end.

    Args:
        todaydate: Day stamped into ``update_time`` of every record.
    """
    day_before = ts2datetime((datetime2ts(todaydate) - DAY))
    index_name = "bci_" + day_before.replace("-", "")
    print(index_name)

    requested = [TOTAL_NUM, TODAY_BCI, "user_fansnum", 'user_friendsnum']
    hits = scan(
        es_9200,
        query={"query": {"match_all": {}}, "size": MAX_ITEMS, "fields": requested},
        index=index_name,
        doc_type=BCI_INDEX_TYPE,
    )

    count = 0
    chunk = []
    for hit in hits:
        item = dict(id=hit['_id'].encode("utf-8"))
        item['user_fansnum'] = hit['fields']["user_fansnum"][0]
        item['user_friendsnum'] = hit['fields']['user_friendsnum'][0]
        item['total_num'] = hit['fields'][TOTAL_NUM][0]
        item['today_bci'] = hit['fields'][TODAY_BCI][0]
        item['update_time'] = todaydate
        chunk.append(item)
        count += 1
        if count % 1000:
            continue
        # A full thousand has accumulated: hand it over and start afresh.
        r_flow.lpush('update_bci_list', json.dumps(chunk))
        chunk = []

    print("all done")
    if chunk:
        r_flow.lpush('update_bci_list', json.dumps(chunk))

    print(count)
def mapper_bci_history(todaydate=None):
    """Queue zeroed BCI records (with fan/friend counts) for stale documents.

    Scans ``BCIHIS_INDEX_NAME`` for documents whose ``bci.update_time`` is
    not the target day, requesting ``user_fansnum`` and
    ``user_friendsnum``.  For every hit a record with ``total_num`` and
    ``today_bci`` set to 0 is queued; a count missing from the hit is
    recorded as 0.  Records are JSON-encoded and pushed onto the
    ``update_bci_list`` list of ``r_flow`` in batches of 1000, and the
    total number of hits is printed at the end.

    Args:
        todaydate: Day used both in the query and as ``update_time`` of the
            queued records.  When falsy, the module-level ``TODAY_TIME`` is
            used (NOTE(review): assumed to be defined at module level; it
            is not visible in this part of the file -- confirm).
    """
    # Do not assign to TODAY_TIME here: an assignment makes the name local
    # to the function, so the original raised UnboundLocalError whenever
    # todaydate was left at its default.
    update_time = todaydate if todaydate else TODAY_TIME
    es_query = {"query":{"bool":{"must_not":[{"term":{"bci.update_time":update_time}}]}},"fields":["user_fansnum", "user_friendsnum"], "size":1000}

    hits = scan(es_user_profile, query=es_query, index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE)
    count = 0
    batch = []
    for hit in hits:
        fields = hit.get('fields', {})
        # dict.get replaces the deprecated has_key checks; each field is
        # returned as a list, of which the first element is used.
        batch.append({
            'id': hit['_id'].encode("utf-8"),
            'total_num': 0,
            'today_bci': 0,
            'update_time': update_time,
            'user_friendsnum': fields.get("user_friendsnum", [0])[0],
            'user_fansnum': fields.get('user_fansnum', [0])[0],
        })
        count += 1
        if count % 1000 == 0:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []

    print("all done")
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))

    print(count)
Exemplo n.º 10
0
def mapper_bci_today(todaydate=None):
    """Queue the BCI values of one daily index for later processing.

    Scans every document of a daily BCI index, reads the ``TOTAL_NUM`` and
    ``TODAY_BCI`` fields of each hit and pushes the resulting records,
    JSON-encoded in batches of up to 1000, onto the ``update_bci_list``
    list of ``r_flow``.

    Args:
        todaydate: Day stamped into ``update_time``.  The index scanned is
            ``BCI_INDEX_NAME_PRE`` plus the date ``DAY`` earlier with the
            dashes removed.  When falsy, the hard-coded index suffix
            ``20130901`` and update time ``2013-09-02`` are used.
    """
    if todaydate:
        index_name = BCI_INDEX_NAME_PRE + ts2datetime(
            (datetime2ts(todaydate) - DAY)).replace("-", "")
        update_time = todaydate
    else:
        index_name = BCI_INDEX_NAME_PRE + '20130901'
        update_time = '2013-09-02'

    hits = scan(es_9200,
                query={
                    "query": {
                        "match_all": {}
                    },
                    "size": MAX_ITEMS,
                    "fields": [TOTAL_NUM, TODAY_BCI]
                },
                index=index_name,
                doc_type=BCI_INDEX_TYPE)
    batch = []
    for hit in hits:
        batch.append({
            'id': hit['_id'].encode("utf-8"),
            'total_num': hit['fields'][TOTAL_NUM][0],
            'today_bci': hit['fields'][TODAY_BCI][0],
            'update_time': update_time,
        })
        if len(batch) == 1000:
            r_flow.lpush('update_bci_list', json.dumps(batch))
            batch = []

    print("all done")
    # Only flush a non-empty remainder; the original pushed an empty "[]"
    # payload whenever the hit count was a multiple of 1000 (or zero).
    if batch:
        r_flow.lpush('update_bci_list', json.dumps(batch))