Exemplo n.º 1
0
class IPLoaderFromMG(IPLoader):
    '''
    Load IP addresses that still need to be located into
    ``self.IPContainer`` (a ``StackSet``).

    ``load()`` starts a background thread that keeps polling MongoDB;
    ``iter()`` is the consumer side and yields IPs popped from the
    container.
    '''
    def __init__(self):
        self.client = PyMongoClient()
        self.IPContainer = StackSet()

    def load(self):
        '''Start the polling thread and return immediately.'''
        t = threading.Thread(target=self.load_thread,
                             name="Thread_LoadIPFromMG")
        # Attribute form replaces the deprecated Thread.setDaemon().
        t.daemon = True
        t.start()

    def load_thread(self, once_sleep=30, max_size=10000, lookback=5 * 60):
        '''
        Poll "jh"/"UserIP" forever and push the matching ``_id`` values
        into the container.

        once_sleep -- seconds to sleep between polls (and while the
                      container is full).
        max_size   -- skip polling while the container holds at least
                      this many entries.
        lookback   -- only documents whose ``timestamp`` is within this
                      many seconds of now are read.
        '''
        logger.info("IP Loader Starting.......")
        total_read = 0
        while True:
            # Back off while the consumer has not drained the container.
            if self.IPContainer.size() >= max_size:
                time.sleep(once_sleep)
                continue
            size_before = self.IPContainer.size()
            # Recent documents that do not have a "city" field yet.
            cur = self.client.find(
                "jh", "UserIP", {
                    "timestamp": {
                        "$gte": time.time() - lookback
                    },
                    "city": {
                        "$exists": False
                    }
                })
            for item in cur:
                self.IPContainer.push(item["_id"])
                total_read += 1
            size_after = self.IPContainer.size()
            # size_after - size_before can be smaller than the number of
            # documents read in this round (presumably StackSet drops
            # duplicates, and the consumer may pop concurrently).
            logger.info(
                "Total Read IP: %s, IPContainer has ip: %d, load ip: %d" %
                (total_read, size_after, size_after - size_before))
            time.sleep(once_sleep)

    def iter(self):
        '''
        Endless generator over the container.

        When ``pop()`` raises IndexError a warning is logged and the
        generator sleeps 10 seconds before trying again.
        '''
        while True:
            try:
                item = self.IPContainer.pop()
            except IndexError:
                logger.warning("IPContainer is empty!")
                time.sleep(10)
                continue
            yield item
Exemplo n.º 2
0
def usetimeDistribute(num, appkey="BIQU_ANDROID", delta=120):
    """Print, per user, the reported "end" total next to a usage time
    reconstructed from operation timestamps.

    For every "uvfile" document of the day ``num`` days ago, the "opatm"
    lists of the operations action/page/in/end are merged, sorted and
    turned into gaps between consecutive timestamps. Gaps of at most
    ``delta`` seconds are summed up as the reconstructed usage time.

    num    -- how many days back from now the inspected day lies.
    appkey -- first argument passed to ``client.find`` (presumably the
              database name; confirm against PyMongoClient).
    delta  -- largest gap, in seconds, that still counts as usage.

    Returns a dict mapping user key -> reconstructed usage time for all
    users whose reconstructed time is non-zero. (Previously this dict
    was built and then discarded.)
    """
    curday = datetime.datetime.today().strftime("%Y%m%d")
    dayStr = time.strftime("%Y-%m-%d",
                           time.localtime(time.time() - 86400 * num))
    client = PyMongoClient()
    opas = ("action", "page", "in", "end")
    result = {}
    # m: users with end_sum >= 600, n: all other users.
    m, n = 0, 0
    for item in client.find(appkey, "uvfile", {"tm": dayStr}):
        uid = item["jhd_userkey"]
        end_sum = item["item_add"].get("end", 0)

        # Distinct operation times across all operation kinds, sorted.
        seen = set()
        for opa in opas:
            seen.update(item["item_count"].get(opa, {}).get("opatm", []))
        opatms = sorted(seen)

        # Only differences are used below, so anchoring the HH:MM:SS
        # values on today's date (curday) rather than dayStr is harmless.
        stamps = [
            int(time.mktime(
                time.strptime(curday + opatm, "%Y%m%d%H:%M:%S")))
            for opatm in opatms
        ]
        # Leading 0 keeps the list non-empty, as in the original.
        tmp = [0] + [b - a for a, b in zip(stamps, stamps[1:])]
        total_opatm = sum(gap for gap in tmp if gap <= delta)

        print(uid, end_sum, total_opatm, tmp)
        print(uid, end_sum, total_opatm, opatms)
        if end_sum >= 600:
            m += 1
            # The original printed the leaked comprehension variable `i`,
            # which on Python 2 is the last element of tmp and on
            # Python 3 is a NameError.
            print(tmp[-1], end_sum)
        else:
            n += 1

        if total_opatm != 0:
            result.setdefault(uid, total_opatm)
    print(m, n)
    return result
Exemplo n.º 3
0
class UserActiveWriter(ModeWriter):
    """Writes one "UserActive" document per user and partition date.

    The collection name is always ``self.modename``; the ``modename``
    argument of ``remove`` and ``write`` is ignored.
    """

    def __init__(self):
        self.client = PyMongoClient()
        self.modename = "UserActive"

    def setClient(self, client):
        """Replace the Mongo client used by this writer."""
        self.client = client

    def remove(self, appkey, modename, tm):
        """Remove all "UserActive" documents whose partition_date is ``tm``.

        ``tm`` may be given as YYYY-MM-DD or YYYYMMDD.
        """
        modename = self.modename
        tm = tm.replace("-", "")
        # Round trip through strptime/strftime to normalise the format.
        tm = time.strftime(
            "%Y%m%d", time.localtime(time.mktime(time.strptime(tm, "%Y%m%d"))))
        self.client.remove(appkey, modename, {"partition_date": tm})

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Build and upsert the "UserActive" record of every user profile.

        For each document in "UserProfile" the previous day's
        "UserActive" record is looked up, its ``active`` list is extended
        by 1 or 0 for the current day, and the result is upserted under
        (jh_uid, partition_date).

        kwargs["today"] -- day to process (YYYY-MM-DD or YYYYMMDD);
                           defaults to yesterday in local time.

        ``data``, ``modename`` and ``modetools`` are not used here.
        Per-document failures are printed and skipped.
        """
        import traceback

        started = time.time()
        modename = self.modename
        if "today" in kwargs:
            curDay = kwargs["today"].replace("-", "")
        else:
            curDay = time.strftime("%Y%m%d",
                                   time.localtime(time.time() - 86400))
        conn = self.client.getConn()
        userActiveCollection = conn[appkey][modename]
        docs = self.client.find(appkey, "UserProfile", {})
        yesterday = getDay(curDay, "%Y%m%d", -1)
        op = []
        # Separate timer for the scan, so that `started` still measures
        # the whole call (the original reused one variable for both).
        scan_started = time.time()
        for doc in docs:
            try:
                key = doc["_id"]
                activelife = doc.get("activelife", [0])
                firstLoginTime = doc["firstLoginTime"][:8]
                # Active today if today's offset from the first login
                # day is listed in activelife.
                login = getDayDelta(curDay, firstLoginTime) in activelife
                # Look up the previous day's record. One query per user:
                # needs an index to be fast, e.g.
                # db.UserActive.ensureIndex({partition_date: -1, jh_uid: 1})
                userActive = userActiveCollection.find_one({
                    "jh_uid": key,
                    "partition_date": yesterday
                })
                # Build today's record.
                newUserActive = UserActiveBuilder()
                newUserActive.setJhdUid(key)
                newUserActive.setPartitionDate(curDay)
                if userActive is None:
                    newUserActive.setActive([1] if login else [0])
                else:
                    userActive["active"].append(1 if login else 0)
                    newUserActive.setActive(userActive["active"])
                # Carry the login timestamps over from the profile.
                newUserActive.setFirstLoginTime(doc["firstLoginTime"])
                newUserActive.setLastLoginTime(doc["lastLoginTime"])
                op.append(
                    ReplaceOne({
                        "jh_uid": key,
                        "partition_date": curDay
                    },
                               newUserActive.builder(),
                               upsert=True))
            except Exception:
                # print_exc() writes the traceback itself and returns
                # None, so it must not be wrapped in print().
                traceback.print_exc()
                print(doc)
        print("find cost time: %d" % int(time.time() - scan_started))
        try:
            if op:
                userActiveCollection.bulk_write(op)
        except Exception:
            traceback.print_exc()
            print(
                "Warn: bulkStore 'UserActive' Rise a error; Switch to Single Mode"
            )
            # Retry every operation on its own so that one bad document
            # does not block the others.
            for op_item in op:
                try:
                    userActiveCollection.bulk_write([op_item])
                except Exception:
                    traceback.print_exc()
        finallyMask(appkey, modename, self.client)
        print("UserActiveWriter cost seconds %.10f" %
              ((time.time() - started), ))
Exemplo n.º 4
0
class UserProfileWriter(ModeWriter):
    """Merges freshly computed user profiles into the "UserProfile"
    collection (documents are keyed by user key in ``_id``).
    """

    def __init__(self, mongo_id=1):
        self.client = PyMongoClient(mongo_id=mongo_id)
        self.conn = self.client.getConn()
        self.modename = "UserProfile"

    def setClient(self, client):
        """Replace the Mongo client and refresh the cached connection."""
        self.client = client
        self.conn = self.client.getConn()

    def remove(self, appkey, modename, tm):
        """No-op for this writer."""
        pass

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Merge ``data`` with the stored profiles and upsert the result.

        data            -- dict user key -> new profile dict; it is
                           updated in place with the merged profiles.
        modetools       -- must provide ``mergeUserProfile(new, old)``.
        kwargs["today"] -- required (KeyError when missing).

        For users that already have a stored profile the ``activelife``
        list (day offsets relative to the first login day) is shifted if
        the new data has an earlier first login, and extended with the
        offset of the new last login. ``activelifeabs`` holds the same
        offsets relative to 2016-01-01. Per-user failures are printed
        and skipped.
        """
        import traceback

        a = time.time()
        # NOTE(review): fix_deltaday is not used below; both lines are
        # kept so that a missing "today" still raises as before.
        curDay = kwargs["today"].replace("-", "")
        fix_deltaday = getDayDelta(curDay, "20160101")
        modename = self.modename
        # A real list: a Python 3 dict view is not valid inside "$in".
        uids = list(data)

        docs = self.client.find(appkey, modename, {"_id": {"$in": uids}})

        oldusers = set()
        for doc in docs:
            try:
                # UserProfile _id is the user key.
                key = doc["_id"]
                oldusers.add(key)
                incoming = data[key]
                # Reset for every user. The original kept the value of
                # the previous user when this record had no
                # "lastLoginTime" (or hit a NameError on the first one).
                lastLoginTime_new = None
                if "lastLoginTime" in incoming:
                    lastLoginTime_new = incoming["lastLoginTime"][:8]
                # If the new data contains a first login earlier than the
                # stored one, shift the stored offsets accordingly.
                if "firstLoginTime" in doc and "firstLoginTime" in incoming:
                    activelife = doc.get("activelife", [0])
                    firstLoginTime_new = incoming["firstLoginTime"][:8]
                    firstLoginTime_old = doc["firstLoginTime"][:8]
                    if firstLoginTime_new < firstLoginTime_old:
                        firstLoginDelta = getDayDelta(firstLoginTime_old,
                                                      firstLoginTime_new)
                        # List, not map(): it is appended to and sorted
                        # further down.
                        doc["activelife"] = [
                            i + firstLoginDelta for i in activelife
                        ]

                data[key] = modetools.mergeUserProfile(data[key], doc)
                # Build the user life-cycle data.
                firstLoginDay = data[key]["firstLoginTime"][:8]
                lastLoginDay = data[key]["lastLoginTime"][:8]
                if lastLoginTime_new is None:
                    # Nothing new for this user: fall back to the merged
                    # last login day.
                    lastLoginTime_new = lastLoginDay
                dayDelta = getDayDelta(lastLoginTime_new, firstLoginDay)
                # Compatibility with historical data without activelife.
                data[key].setdefault("activelife", [0])
                if dayDelta not in data[key]["activelife"]:
                    data[key]["activelife"].append(dayDelta)
                data[key]["activelife"].sort()
                # Absolute activity offsets, counted from 2016-01-01.
                try:
                    firstlogin_deltaday = getDayDelta(firstLoginDay,
                                                      "20160101")
                    data[key]["activelifeabs"] = [
                        firstlogin_deltaday + remain_day
                        for remain_day in data[key]["activelife"]
                    ]
                except Exception:
                    traceback.print_exc()
            except Exception:
                traceback.print_exc()

        # Users without a stored profile: record the version they
        # first appeared with.
        for key in set(uids) - oldusers:
            data[key]["comever"] = data[key]["ver"]
            # Absolute activity offsets, counted from 2016-01-01.
            try:
                # NOTE(review): the base day is read from "lastLoginTime"
                # although the variable is named firstLoginDay; confirm
                # this is intended for new users.
                firstLoginDay = data[key]["lastLoginTime"][:8]
                firstlogin_deltaday = getDayDelta(firstLoginDay, "20160101")
                data[key]["activelifeabs"] = [
                    firstlogin_deltaday + remain_day
                    for remain_day in data[key]["activelife"]
                ]
            except Exception:
                traceback.print_exc()

        op = [ReplaceOne({"_id": key}, data[key], True) for key in data]
        try:
            if op:
                self.client.bulkWrite(appkey, modename, op)
        except Exception:
            print(
                "Warn: bulkStore 'UserProfile' Rise a error; Switch to Single Mode"
            )
            try:
                replace_onebyone(data, appkey, modename, self.client)
            except Exception:
                traceback.print_exc()
        finallyMask(appkey, modename, self.client)
        print("UserProfileWriter cost seconds %.3f" % ((time.time() - a), ))
Exemplo n.º 5
0
class UserCrumbsWriter(ModeWriter):
    """Merges per-user daily records into the "uvfile" collection and
    enriches them with IP locations and recent-activity counters.

    The collection name is always ``self.modename``; the ``modename``
    argument of ``remove`` and ``write`` is ignored.
    """

    def __init__(self, mongo_id=1):
        import traceback

        self.client = PyMongoClient(mongo_id=mongo_id)
        self.conn = self.client.getConn()
        self.modename = "uvfile"
        self.attachmode_storers = []
        try:
            self.attachmode_storers = [UserIP()]
        except Exception:
            # Best effort: the writer still works without the storers.
            traceback.print_exc()

    def setClient(self, client):
        """Replace the Mongo client and refresh the cached connection."""
        self.client = client
        self.conn = self.client.getConn()

    def remove(self, appkey, modename, tm):
        """Remove all "uvfile" documents whose ``tm`` equals the given day.

        ``tm`` may be given as YYYY-MM-DD or YYYYMMDD; the query uses the
        YYYY-MM-DD form.
        """
        modename = self.modename
        tm = tm.replace("-", "")
        tm = time.strftime(
            "%Y-%m-%d", time.localtime(time.mktime(time.strptime(tm,
                                                                 "%Y%m%d"))))
        self.client.remove(appkey, modename, {"tm": tm})

    def getMeasure(self, activelifeabs, fix_deltaday):
        """Count active days in the trailing 7/14/28/30-day windows.

        activelifeabs -- iterable of absolute day offsets on which the
                         user was active.
        fix_deltaday  -- absolute day offset of the reference day;
                         larger offsets are ignored.

        Returns a dict with the keys last7ActiveNum, last14ActiveNum,
        last28ActiveNum and last30ActiveNum. The reference day itself
        counts, so the 7-day window covers distances 0..6.
        """
        windows = (
            ("last7ActiveNum", 6),
            ("last14ActiveNum", 13),
            ("last28ActiveNum", 27),
            ("last30ActiveNum", 29),
        )
        measure = dict((name, 0) for name, _ in windows)
        for active_day in activelifeabs:
            if active_day > fix_deltaday:
                continue
            delta = fix_deltaday - active_day
            for name, limit in windows:
                if delta <= limit:
                    measure[name] += 1
        return measure

    def write(self, data, appkey, modename, modetools, *args, **kwargs):
        """Merge ``data`` into "uvfile" for the given day and upsert it.

        data            -- dict user key -> record dict; it is updated
                           in place.
        modetools       -- must provide ``mergeUserCrumbs`` and
                           ``formatList``.
        kwargs["today"] -- required; YYYY-MM-DD or YYYYMMDD.

        Steps: look up known locations for all IPs in ``data``, merge
        the stored "uvfile" documents of that day, attach first-login
        and activity counters from "UserProfile", bulk-upsert (falling
        back to one-by-one on failure), then call
        ``self.store_attachmode`` with the IP locations in
        ``kwargs["ip_loc"]``.
        """
        import traceback

        modename = self.modename
        today = kwargs["today"]
        today = today.replace("-", "")
        # A real list: a Python 3 dict view is not valid inside "$in".
        uids = list(data)
        # "tm" is stored as YYYY-MM-DD. The original derived this value
        # via "yesterday + 86400 seconds", which only reformats `today`
        # but can land on the wrong date around DST changes.
        yyyy_mm_dd = time.strftime("%Y-%m-%d",
                                   time.strptime(today, "%Y%m%d"))

        uvfile = self.client.find(
            appkey, "uvfile",
            OrderedDict([("tm", yyyy_mm_dd), ("jhd_userkey", {
                "$in": uids
            })]))

        user_profile = self.client.find(appkey, "UserProfile",
                                        {"_id": {
                                            "$in": uids
                                        }})

        ips = set()
        ip_loc = {}
        try:
            for uid in data:
                # A record without "jhd_ip" must not abort the lookup
                # for everybody else.
                ips.update(data[uid].get("jhd_ip") or [])
            ip_loc_cur = self.conn["jh"]["UserIP"].find(
                OrderedDict([("_id", {
                    "$in": list(ips)
                }), ("province", {
                    "$exists": True
                }), ("city", {
                    "$exists": True
                })]), {
                    "province": True,
                    "city": True
                })
            for item in ip_loc_cur:
                ip = item["_id"]
                province = item["province"]
                city = item["city"]
                if not province:
                    continue
                if not city:
                    # Fall back to the province when the city is empty.
                    city = province
                ip_loc.setdefault(ip, {"prov": province, "city": city})
        except Exception:
            traceback.print_exc()

        # Merge the stored documents into the new data.
        for doc in uvfile:
            uid = doc["jhd_userkey"]
            data[uid] = modetools.mergeUserCrumbs(doc, data[uid])
            try:
                ip_lis = data[uid]["jhd_ip"]
                data[uid].setdefault("jhd_loc", [])

                for ip in ip_lis:
                    loc = ip_loc.get(ip, None)
                    if loc and loc not in data[uid]["jhd_loc"]:
                        data[uid]["jhd_loc"].append(loc)
            except Exception:
                traceback.print_exc()

        fix_deltaday = getDayDelta(today, "20160101")

        for doc in user_profile:
            key = doc["_id"]
            tmp = {}
            # First login time of the user.
            tmp["firstLoginTime"] = doc.get("firstLoginTime", "unknown")
            activelifeabs = doc.get("activelifeabs", [])
            # Recent activity counters.
            tmp["measure"] = self.getMeasure(activelifeabs, fix_deltaday)
            tmp["measure"]["firstLoginTime"] = tmp["firstLoginTime"]
            # Update the record.
            data[key] = dict(data[key], **tmp)

        op = []
        for key in data:
            # Records that were not merged with a stored document get a
            # fresh id.
            if "_id" not in data[key]:
                data[key]["_id"] = ObjectId()
            _id = data[key]["_id"]
            op.append(
                ReplaceOne({"_id": _id}, modetools.formatList(data[key]),
                           True))

        try:
            if op:
                self.client.bulkWrite(appkey, modename, op)
        except Exception:
            print(
                "Warn: bulkStore 'uvfile' Rise a error; Switch to Single Mode")
            try:
                replace_onebyone(data, appkey, modename, self.client)
            except Exception:
                traceback.print_exc()
        finallyMask(appkey, modename, self.client)
        try:
            kwargs["ip_loc"] = ip_loc
            # NOTE(review): store_attachmode is not defined in this class
            # (presumably inherited from ModeWriter and driven by
            # self.attachmode_storers); if it is missing, the resulting
            # AttributeError is only printed here.
            self.store_attachmode(data, appkey, modename, modetools, *args,
                                  **kwargs)
        except Exception:
            traceback.print_exc()