Example #1
def choose_perf_collection(begin_datetime, end_datetime, prefix):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    ## 1. Dynamically determine which sub-collections (shard tables) to query
    # Note: begin_time and end_time are deliberately swapped in the condition,
    # since a sub-collection overlaps [begin_time, end_time) exactly when its
    # endTime >= begin_time and its beginTime < end_time
    col_condition = {
        'type': prefix,
        'endTime': {
            '$gte': begin_time
        },
        'beginTime': {
            '$lt': end_time
        }
    }

    perf_mgo = database('PERF_MGO').get_db()
    sub_collections = list(
        perf_mgo.get_collection('t_db_sub_collections').find(col_condition))
    # No matching sub-collection found (sharding may have failed); fall back to
    # the most recent collection
    if not sub_collections:
        sub_collections = list(
            perf_mgo.get_collection('t_db_sub_collections').find(
                {
                    'type': prefix
                }, limit=1).sort('endTime', pymongo.DESCENDING))
    collections = []
    for item in sub_collections:
        collections.append(item['collectionName'])

    return collections
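A minimal usage sketch, assuming the project's mkdatetime helper (see Example #3); the prefix mirrors the call in Example #8, and the returned collection name is purely hypothetical.

import datetime

# Illustrative only: the returned collection name is hypothetical.
day = mkdatetime('2017-06-01 00:00:00')
cols = choose_perf_collection(day,
                              day + datetime.timedelta(days=1),
                              prefix='t_term_vsim_estsucc')
print(cols)  # e.g. ['t_term_vsim_estsucc_201706']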
Example #2
def _query_flow_day_group(imsi_list, begin_datetime, end_datetime, con=None):
    flowdata = {}
    if not con:
        con = database('OSS_MGO').get_db()
    # Handle time ranges that span multiple months: query each monthly collection in turn
    while begin_datetime < end_datetime:
        begin_time = datetime_timestamp(begin_datetime)
        end_time = datetime_timestamp(end_datetime)
        pipeline = [{
            '$match': {
                'countTime': {
                    '$gte': begin_time,
                    '$lt': end_time
                },
                'imsi': {
                    '$in': imsi_list
                }
            }
        }, {
            '$group': {
                '_id': '$imsi',
                'total': {
                    '$sum': '$total'
                },
                'userTotal': {
                    '$sum': '$userTotal'
                },
                'cardTotal': {
                    '$sum': '$cardTotal'
                }
            }
        }, {
            '$project': {
                '_id': 1,
                'flow': {
                    '$add': ['$userTotal', '$total', '$cardTotal']
                }
            }
        }]
        col = 't_terminal_flow_count_day_{}'.format(
            format_datetime(begin_datetime, '%Y%m01'))
        for doc in con[col].aggregate(pipeline, allowDiskUse=True):
            if doc['_id'] in flowdata.keys():
                flowdata[doc['_id']] += doc['flow']
            else:
                flowdata.update({doc['_id']: doc['flow']})
        if begin_datetime.month + 1 > 12:
            begin_datetime = datetime.datetime(year=begin_datetime.year + 1,
                                               month=1,
                                               day=1)
        else:
            begin_datetime = datetime.datetime(year=begin_datetime.year,
                                               month=begin_datetime.month + 1,
                                               day=1)
    return flowdata
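A hedged usage sketch for a range spanning two months; the IMSI values and dates are made up, and mkdatetime is the project helper shown in Example #3.

# Illustrative only: IMSIs and dates are made up.
imsis = ['460011234567890', '460019876543210']
begin = mkdatetime('2017-06-15 00:00:00')
end = mkdatetime('2017-07-15 00:00:00')
# Hits t_terminal_flow_count_day_20170601 and then ..._20170701,
# accumulating each IMSI's flow across both monthly collections.
flow_by_imsi = _query_flow_day_group(imsis, begin, end)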
Example #3
def generate_timestamp_datetime():
    begin_datetime = mkdatetime('2017-01-01 08:00:00')
    end_datetime = mkdatetime('2018-01-01 08:00:00')
    tmp = [['timestamp', 'date']]
    while begin_datetime < end_datetime:
        ts = datetime_timestamp(begin_datetime)
        tmp.append([ts, begin_datetime])
        begin_datetime += datetime.timedelta(1)
    pyexcel.save_as(array=tmp, dest_file_name='2017_datetime_timestamp.csv')
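None of the examples show the project's own mkdatetime / datetime_timestamp / format_datetime helpers. A minimal sketch of plausible implementations, assuming naive local datetimes and second-precision timestamps (the real datetime_timestamp may return milliseconds instead, as the 13-digit length check in Example #8 suggests):

import datetime
import time

def mkdatetime(text):
    # Parse 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' into a datetime (assumed behaviour).
    fmt = '%Y-%m-%d %H:%M:%S' if ' ' in text else '%Y-%m-%d'
    return datetime.datetime.strptime(text, fmt)

def datetime_timestamp(dt):
    # Convert a naive local datetime to a Unix timestamp in seconds (assumed behaviour).
    return int(time.mktime(dt.timetuple()))

def format_datetime(dt, fmt):
    # Thin strftime wrapper (assumed behaviour).
    return dt.strftime(fmt)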
Example #4
def _query_flow_hour_group(imsi_list, begin_datetime, end_datetime, con=None):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)
    pipeline = [{
        '$match': {
            'countTime': {
                '$gte': begin_time,
                '$lt': end_time
            },
            'imsi': {
                '$in': imsi_list
            }
        }
    }, {
        '$group': {
            '_id': '$imsi',
            'total': {
                '$sum': '$total'
            },
            'userTotal': {
                '$sum': '$userTotal'
            },
            'cardTotal': {
                '$sum': '$cardTotal'
            }
        }
    }, {
        '$project': {
            '_id': 1,
            'flow': {
                '$add': ['$userTotal', '$total', '$cardTotal']
            }
        }
    }]
    if not con:
        con = database('OSS_MGO').get_db()
    col = 't_terminal_flow_count_hour_{}'.format(
        format_datetime(begin_datetime, '%Y%m01'))
    flowdata = {}
    for doc in con[col].aggregate(pipeline, allowDiskUse=True):
        flowdata.update({doc['_id']: doc['flow']})
    return flowdata
Example #5
def convertToUnixtime(df, datetime):
    datetime = df[datetime]
    unixtime = utils.datetime_timestamp(datetime)
    return unixtime
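Since the helper indexes a single field out of whatever it is given, it fits pandas' row-wise apply. A usage sketch, assuming a pandas DataFrame whose hypothetical 'date' column holds datetime objects:

import datetime
import pandas as pd

# Illustrative only: the 'date' column name is hypothetical.
df = pd.DataFrame({'date': [datetime.datetime(2017, 1, 1, 8, 0, 0),
                            datetime.datetime(2017, 1, 2, 8, 0, 0)]})
# With axis=1 each row is passed as a Series, so row['date'] yields one datetime.
df['unixtime'] = df.apply(convertToUnixtime, axis=1, args=('date',))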
Example #6
def fetch_day_flow(begin_datetime, end_datetime):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)

    pipeline = [{
        '$match': {
            'createtime': {
                '$gte': begin_time,
                '$lt': end_time
            }
        }
    }, {
        '$project': {
            '_id': 0,
            'createtime': 1,
            'mcc': 1,
            'lac': 1,
            'plmn': 1,
            'imei': 1,
            'imsi': 1,
            'userFlow': '$userFlower',
            'cardFlow': 1,
            'sysFlow': '$sysFlower'
        }
    }]
    mgo = database('GSVC_MGO').get_db()
    rdata = list(
        mgo.get_collection('dayChargeFlower').aggregate(pipeline,
                                                        allowDiskUse=True))
    keys = [
        'createtime', 'lac', 'mcc', 'plmn', 'imei', 'imsi', 'userFlow',
        'cardFlow', 'sysFlow'
    ]

    imei_org = load_imei_org()
    mcc_country = load_mcc_country()
    # Format the data and fill in any missing fields
    for i in range(len(rdata)):
        # Fill in missing keys
        missing_key = list(set(keys).difference(set(rdata[i].keys())))
        for k in missing_key:
            if k in ['lac', 'imsi', 'plmn', 'mcc', 'imei']:
                rdata[i][k] = 'NaN'
            else:
                rdata[i][k] = 0
        # add imei info
        if rdata[i]['imei'] != 'NaN' and (rdata[i]['imei'] in imei_org.keys()):
            rdata[i]['t_orgid'] = imei_org[rdata[i]['imei']]['t_orgid']
            rdata[i]['t_type'] = imei_org[rdata[i]['imei']]['t_type']
        else:
            rdata[i]['t_orgid'] = 'NaN'
            rdata[i]['t_type'] = 'NaN'

        # add country info
        if rdata[i]['mcc'] != 'NaN' and (rdata[i]['mcc']
                                         in mcc_country.keys()):
            # Special-case Guam (GU) and Saipan (SP), which share MCC 310 with the US
            if rdata[i]['mcc'] == '310':
                if rdata[i]['plmn'] != 'NaN':
                    mnc = rdata[i]['plmn'][-3:]
                    if (mnc in ['470', '140']) and \
                        (rdata[i]['lac'] in ['208','171','1', '10', '23', '24', '60', '66']):
                        rdata[i]['country'] = 'SPGU'
                    else:
                        rdata[i]['country'] = 'US'
                else:
                    rdata[i]['country'] = 'US'
            else:
                rdata[i]['country'] = mcc_country[rdata[i]['mcc']]
        else:
            rdata[i]['country'] = 'NaN'
    return rdata
Example #7
def fetch_day_flow_cursor(begin_datetime, end_datetime):
    begin_time = datetime_timestamp(begin_datetime)
    end_time = datetime_timestamp(end_datetime)

    match = {'createtime': {'$gte': begin_time, '$lt': end_time}}
    project = {'_id': 0}
    mgo = database('GSVC_MGO').get_db()
    cur = mgo.get_collection('dayChargeFlower').find(match, project)
    keys = [
        'createtime', 'lac', 'mcc', 'plmn', 'imei', 'imsi', 'userFlower',
        'cardFlow', 'sysFlower'
    ]
    count = 0
    tmp = []  # buffered documents waiting to be inserted
    imei_org = load_imei_org()
    mcc_country = load_mcc_country()
    while True:
        try:
            doc = cur.next()
        except StopIteration:
            # Final flush of the remaining buffer before exiting
            if tmp:
                insertTable(tmp, 't_terminal_flow_count_day_201701')
            break
        # Fill in missing keys
        missing_key = list(set(keys).difference(set(doc.keys())))
        for k in missing_key:
            if k in ['lac', 'imsi', 'plmn', 'mcc', 'imei']:
                doc[k] = 'NaN'
            else:
                doc[k] = 0
        # Discard records with abnormally large traffic values.
        # The filter lives here rather than in the database query because adding
        # the condition there makes query and extraction performance poor.
        if doc['userFlower'] > 2147483648:
            continue
        # Add IMEI-derived info (organization and device type)
        if doc['imei'] != 'NaN' and (doc['imei'] in imei_org.keys()):
            doc['t_orgid'] = imei_org[doc['imei']]['t_orgid']
            doc['t_type'] = imei_org[doc['imei']]['t_type']
        else:
            doc['t_orgid'] = 'NaN'
            doc['t_type'] = 'NaN'
        # Add country info
        if doc['mcc'] != 'NaN' and (doc['mcc'] in mcc_country.keys()):
            # Special-case Guam (GU) and Saipan (SP), which share MCC 310 with the US
            if doc['mcc'] == '310':
                if doc['plmn'] != 'NaN':
                    mnc = doc['plmn'][-3:]
                    if (mnc in ['470', '140']) and \
                        (doc['lac'] in ['208','171','1', '10', '23', '24', '60', '66']):
                        doc['country'] = 'SPGU'
                    else:
                        doc['country'] = 'US'
                else:
                    doc['country'] = 'US'
            else:
                doc['country'] = mcc_country[doc['mcc']]
        else:
            doc['country'] = 'NaN'

        tmp.append(doc)
        count += 1
        # Flush the buffer to the database every 2000 documents
        if count % 2000 == 0:
            insertTable(tmp, 't_terminal_flow_count_day_201701')
            print('insert {}\n'.format(count))
            tmp = []
Example #8
def update_vsim_estsucc_last_succ():
    begin_datetime = generic_query(
        database('GSVC_SQL').get_db(),
        "SELECT MAX(update_time) FROM `t_term_vsim_estsucc_last_succ`")[0][0]
    begin_datetime = mkdatetime(str(begin_datetime))
    now = datetime.datetime.now()
    today = datetime.datetime(year=now.year, month=now.month, day=now.day)
    mgo = database('PERF_MGO').get_db()
    succ_info = {}
    # 1. Update succ info in a loop, one day at a time to keep each query range
    #    small; the result is the data to be inserted at the end
    while begin_datetime < today:
        begin_time = datetime_timestamp(begin_datetime)
        end_time = datetime_timestamp(begin_datetime +
                                      datetime.timedelta(days=1))
        # choose_perf_collection returns a list; in this scenario it holds exactly one collection
        col = choose_perf_collection(begin_datetime,
                                     begin_datetime +
                                     datetime.timedelta(days=1),
                                     prefix='t_term_vsim_estsucc')[0]

        match = {'createTime': {'$gte': begin_time, '$lt': end_time}}
        # Gripe: using Mongo's built-in $project just to rename fields makes the pipeline painfully verbose
        pipeline = [{
            '$match': match
        }, {
            '$sort': {
                'createTime': -1
            }
        }, {
            '$group': {
                '_id': '$vsimImsi',
                'succ_time': {
                    '$first': '$succTime'
                },
                'succ_mcc': {
                    '$first': '$mcc'
                },
                'succ_mnc': {
                    '$first': '$mnc'
                },
                'succ_lac': {
                    '$first': '$lac'
                },
            }
        }, {
            '$project': {
                '_id': 0,
                'imsi': '$_id',
                'succ_time': 1,
                'succ_mcc': 1,
                'succ_mnc': 1,
                'succ_lac': 1
            }
        }]

        for info in mgo[col].aggregate(pipeline, allowDiskUse=True):
            if info['imsi'] == "":
                continue

            info['update_time'] = today  # record the update date
            info['create_date'] = today  # takes effect on insert only, not on update
            if len(str(info['succ_time'])) != 13:
                info['succ_time'] = mkdatetime('1900-01-01')
            else:
                info['succ_time'] = timestamp_datetime(info['succ_time'])
            info['succ_country'] = mcc_country(info['succ_mcc'],
                                               info['succ_mnc'],
                                               info['succ_lac'])

            if info['imsi'] in succ_info.keys():
                succ_info[info['imsi']] = info
            else:
                succ_info.update({info['imsi']: info})

        begin_datetime += datetime.timedelta(days=1)

    # 2. Format the query results for insert and update
    succ_list = list(succ_info.values())
    succ_tuple_key = succ_list[0].keys()  # every record shares the same keys
    succ_tuple_val = [tuple(v.values()) for v in succ_list]

    update_stmt = (
        "INSERT INTO `{target_table}` (`{colname}`) VALUES ({placeholder}) ON DUPLICATE KEY "
        "UPDATE `succ_time` = values(`succ_time`), "
        "`update_time` = values(`update_time`),"
        "`succ_lac` = values(`succ_lac`),"
        "`succ_mcc` = values(`succ_mcc`),"
        "`succ_country` = values(`succ_country`),"
        "`succ_mnc` = values(`succ_mnc`)").format(
            target_table='t_term_vsim_estsucc_last_succ',
            colname="`,`".join(succ_tuple_key),
            placeholder=','.join(['%s' for x in range(len(succ_tuple_key))]))
    con = database('GSVC_SQL_ADMIN').get_db()
    effect_row = 0  # defined up front so the except branch can report progress
    try:
        with con.cursor() as cur:
            effect_row = cur.executemany(update_stmt, succ_tuple_val)
        con.commit()
    except Exception as e:
        con.close()
        print('INSERT ROWS:{}'.format(effect_row))
        raise e
    print('INSERT ROWS:{}'.format(effect_row))
    con.close()