Пример #1
0
def stream_x_fund_info_010004(fund_ids=None):
    def clean_fund_status(limit_date, limit_time):
        if limit_date is not None:
            if limit_date <= dt.date.today():
                return "终止"
            else:
                return "运行中"
        if limit_time is not None:
            if limit_time == "无期限":
                return "运行中"
        return None

    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, fund_time_limit_amac \
    FROM x_fund_info_securities TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010004' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    inp = MysqlNativeInput(engine_c, sql)

    vm = transform.ValueMap(
        OrderedDict([("fund_name_amac", lambda x: x.strip()),
                     ("reg_code_amac", lambda x: x.strip()),
                     ("limit_date",
                      (lambda x: dt.datetime.strptime(x, "%Y-%m-%d").date(),
                       "fund_time_limit_amac")),
                     ("fund_time_limit_amac", lambda x: x
                      if x == "无期限" else None),
                     ("fund_status", (lambda x, y: clean_fund_status(x, y),
                                      "limit_date", "fund_time_limit_amac"))]))

    ac = transform.AddConst({"source_id": "010004"})

    cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None)

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    km = transform.MapSelectKeys({
        # "version": None,
        "matched_id": "fund_id",
        "source_id": None,
        "fund_name_amac": "fund_full_name",
        "reg_code_amac": "reg_code",
        "foundation_date_amac": "foundation_date",
        "fund_time_limit_amac": "limit_time",
        "limit_date": "limit_date",
        "fund_status": "fund_status"
    })

    stream = Stream(inp, (vm, cln, ac, dd, km), name="x_fund_info_security")
    return stream
Пример #2
0
def stream_d_fund_info_020001(fund_ids=None):
    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, reg_code, fund_status, locked_time_limit, open_date, foundation_date \
    FROM d_fund_info TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020001' and is_used = 1 GROUP BY fund_id, source_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020001' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    # JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010004' AND im.is_used = 1"

    inp1 = MysqlNativeInput(engine_c, sql)

    # inp2 = MysqlNativeInput(engine_b, "SELECT matched_id, source_id FROM id_match WHERE id_type=1 AND source='020001' AND is_used = 1")

    vm = transform.ValueMap({
        "fund_name": lambda x: x.strip(),
        "fund_full_name": lambda x: x.strip(),
        "fund_status": {
            "正常": "运行中",
            "终止": "终止"
        },
    })

    cln1 = transform.CleanWrongToNone({
        "locked_time_limit": "-|--",
        "open_date": "-|--"
    })

    cln2 = transform.CleanRight({"reg_code": "\w\w\d{4}"})
    # jn = transform.Join(inp2, left_on="fund_id", right_on="source_id")

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    km = transform.MapSelectKeys({
        # "version": None,
        "matched_id": "fund_id",
        "source_id_x": "source_id",
        "fund_name": None,
        "fund_full_name": None,
        "reg_code": None,
        "fund_status": None,
        "open_date": None,
        "foundation_date": None
    })

    stream = Stream(inp1, (vm, cln1, cln2, dd, km), name="d_fund_info")
    return stream
Пример #3
0
def stream_d_fund_info_020002(fund_ids=None):
    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, TB_MAIN.source_id, fund_name, fund_full_name, fund_status, locked_time_limit, \
    open_date, foundation_date \
    FROM d_fund_info TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver, source_id FROM d_fund_info WHERE source_id = '020002' and is_used = 1 GROUP BY fund_id, source_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id AND TB_MAIN.source_id = TB_LATEST.source_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '020002' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    inp = MysqlNativeInput(engine_c, sql)

    cln1 = transform.CleanWrongToNone({"open_date": "-|--|---"})

    cln2 = transform.CleanRight({
        "locked_time_limit": "(\d*)个月|无封闭期",
    })
    vm = transform.ValueMap({
        "fund_status": {
            "运行中": "运行中",
            "已清算": "终止",
            "封闭运行": "运行中",
            "开放运行": "运行中",
            "募集中": "募集发行"
        }
    })

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    km = transform.MapSelectKeys({
        # "version": None,
        "matched_id": "fund_id",
        "fund_name": None,
        "fund_full_name": None,
        "source_id_x": "source_id",
        "fund_status": None,
        "locked_time_limit": None,
        "open_date": None,
        "foundation_date": None
    })

    stream = Stream(inp, (cln1, cln2, vm, dd, km))
    return stream
Пример #4
0
def stream_x_fund_info_010005(fund_ids=None):
    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, foundation_date_amac, issuing_scale_amac, number_clients_amac \
    FROM x_fund_info_futures TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010005' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    inp = MysqlNativeInput(engine_c, sql)

    vm = transform.ValueMap(
        OrderedDict([
            ("fund_name_amac", lambda x: x.strip()),
            ("reg_code_amac", lambda x: x.strip()),
            ("number_clients_amac", lambda x: int(x)),
        ]))

    cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None)

    ac = transform.AddConst({"source_id": "010005"})

    # jn = transform.Join(inp2, left_on="fund_id", right_on="source_id")

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    km = transform.MapSelectKeys({
        "matched_id": "fund_id",
        "source_id": None,
        "fund_name_amac": "fund_full_name",
        "reg_code_amac": "reg_code",
        "foundation_date_amac": "foundation_date",
        "issuing_scale_amac": "issuing_scale",
        "number_clients_amac": "number_clients",
    })

    stream = Stream(inp, (vm, cln, ac, dd, km), name="x_fund_info_future")
    return stream
Пример #5
0
    def stream_010001(cls):
        sql = "SELECT xoi.version, im.matched_id, org_name_en, org_full_name, org_category, found_date, org_code, " \
              "reg_code, reg_time, manage_type, fund_scale, property, reg_capital, real_capital, " \
              "real_capital_proportion, legal_person, is_qualified, qualifying_way, address, reg_address, org_web,  " \
              "employee_scale, final_report_time, currency, is_member, member_type, initiation_time, " \
              "law_firm_name, lawyer_name " \
              "FROM crawl_private.x_org_info xoi " \
              "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info GROUP BY org_id) tb_latest " \
              "ON xoi.org_id = tb_latest.org_id AND xoi.version = tb_latest.latest_ver " \
              "JOIN (SELECT matched_id, source_id FROM base.id_match " \
              "WHERE id_type = 2 AND source = '010001' AND is_used = 1) im ON xoi.org_id = im.source_id "

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap(OrderedDict([
            ("org_full_name", lambda x: x.strip()),
            (OrgInfo.org_category.name, "私募基金管理公司"),
            ("sourec_id", "010001"),
            ("org_code", lambda x: x.strip()),
            ("reg_code", lambda x: x.strip()),
            ("real_capital_proportion", lambda x: x.strip()),
            ("legal_person", lambda x: x.strip()),
            ("qualifying_way", lambda x: x.strip()),
            ("is_qualified", (lambda x: "是" if x in ("通过考试", "资格认定") else "否", "qualifying_way")),
            ("address", lambda x: x.strip()),
            ("reg_address", lambda x: x.strip()),
            ("org_web", lambda x: re.sub("http://|https://|\s", "", x)),
            ("employee_scale", lambda x: int(x))
        ]))

        vm2 = transform.ValueMap(OrderedDict([
            ("org_web", lambda x: x[:-1] if x[-1] == "/" else x),
        ]))

        cw = transform.CleanWrongToNone({
            "org_full_name": "\s",
            "org_name_en": "\s",
            "manage_type": "\s",
            "org_web": "\s"
        })

        # 如果一个ID出现多次, 保留最新版本数据
        sdd = transform.SortDropduplicate(sort_by=["version"], ascending=[False], subset=["matched_id"])

        sk = transform.MapSelectKeys({
            "matched_id": OrgInfo.org_id.name,
            "source_id": None,
            "org_name_en": OrgInfo.org_name_en.name,
            "org_full_name": OrgInfo.org_full_name.name,
            "org_category": OrgInfo.org_category.name,
            "found_date": OrgInfo.found_date.name,
            "org_code": OrgInfo.org_code.name,
            "reg_code": OrgInfo.reg_code.name,
            "reg_time": OrgInfo.reg_time.name,
            "manage_type": OrgInfo.manage_type.name,
            "fund_scale": OrgInfo.asset_mgt_scale.name,
            "property": OrgInfo.property.name,
            "reg_capital": OrgInfo.reg_capital.name,
            "real_capital": OrgInfo.real_capital.name,
            "real_capital_proportion": OrgInfo.real_capital_proportion.name,
            "legal_person": OrgInfo.legal_person.name,
            "is_qualified": OrgInfo.is_qualified.name,
            "qualifying_way": OrgInfo.qualifying_way.name,
            "address": OrgInfo.address.name,
            "reg_address": OrgInfo.reg_address.name,
            "org_web": OrgInfo.org_web.name,
            "employee_scale": OrgInfo.employee_scale.name,
            "final_report_time": OrgInfo.final_report_time.name,
            "currency": OrgInfo.currency.name,
            "is_member": OrgInfo.is_member.name,
            "member_type": OrgInfo.member_type.name,
            "initiation_time": OrgInfo.initiation_time.name,
            "law_firm_name": OrgInfo.law_firm_name.name,
            "lawyer_name": OrgInfo.lawyer_name.name,
        })

        s = Stream(inp, transform=[vm, vm2, cw, sdd, sk, ])
        return s
Пример #6
0
def stream_x_fund_info_010002(fund_ids=None):
    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, reg_time_amac, fund_time_limit_amac, issuing_scale_amac, number_clients_amac \
    FROM x_fund_info_fundaccount TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_fundaccount WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010002' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    inp = MysqlNativeInput(engine_c, sql)

    def clean_fund_status(reg_time, limit_time):
        from dateutil.relativedelta import relativedelta
        now = dt.date.today()
        if reg_time is not None and limit_time is not None and limit_time > 0:
            if limit_time > 1200:  # 有些无限期的limit_time会被标记为9999月
                return "运行中"
            limit_date = reg_time + relativedelta(months=int(limit_time))
            if limit_date <= now:
                return "终止"
            else:
                return "运行中"
        return None

    vm = transform.ValueMap({
        "fund_name_amac":
        lambda x: x.strip(),
        "reg_code_amac":
        lambda x: x.strip(),
        "fund_time_limit_amac":
        lambda x: int(x),
        "issuing_scale_amac":
        lambda x: x * 1e4 if x is not None else None,
        "number_clients_amac":
        lambda x: int(x),
    })

    vm2 = transform.ValueMap({
        "fund_status": (lambda x, y: clean_fund_status(x, y), "reg_time_amac",
                        "fund_time_limit_amac")
    })

    cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None)

    # jn = transform.Join(
    #     inp2, left_on="fund_id", right_on="source_id"
    # )

    ac = transform.AddConst({"source_id": "010002"})

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    km = transform.MapSelectKeys({
        # "version": None,
        "matched_id": "fund_id",
        "source_id": None,
        "reg_code_amac": "reg_code",
        "reg_time_amac": "reg_time",
        "fund_status": "fund_status",
        "fund_time_limit_amac": "limit_time",
        "issuing_scale_amac": "issuing_scale",
        "number_clients_amac": "number_clients",
        "fund_name_amac": "fund_full_name",
    })

    stream = Stream(inp, (vm, vm2, cln, dd, ac, km),
                    name="x_fund_info_account")

    return stream
Пример #7
0
def stream_x_fund_info_010003(fund_ids=None):
    def clean_currency(x):
        patt = "(?P<currency>人民币|美元|多币种|欧元|港元|澳元|其他|其它|日元|英镑)?(?P<type>现钞|现汇)?"
        cpatt = re.compile(patt)
        sre = cpatt.search(x)
        if sre is not None:
            return sre.groupdict()
        return {}

    sql = "\
    SELECT TB_MAIN.version, im.matched_id, TB_MAIN.fund_id, fund_name_amac, reg_code_amac, reg_time_amac, foundation_date_amac, currency_name_amac, \
    fund_status_amac, final_report_time_amac \
    FROM x_fund_info_private TB_MAIN \
    JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_private WHERE is_used = 1 GROUP BY fund_id) as TB_LATEST \
    ON TB_MAIN.version = TB_LATEST.latest_ver AND TB_MAIN.fund_id = TB_LATEST.fund_id \
    JOIN base.id_match im ON im.source_id = TB_MAIN.fund_id AND id_type = 1 AND source = '010003' AND im.is_used = 1 "

    if fund_ids is not None:
        fids = SQL.values4sql(fund_ids)
        sql += "WHERE im.matched_id IN {fids}".format(fids=fids)

    inp = MysqlNativeInput(engine_c, sql)

    vm = transform.ValueMap(
        OrderedDict([
            ("fund_name_amac", lambda x: x.strip()),
            ("reg_code_amac", lambda x: x.strip()),
            ("is_abnormal_liquidation", (lambda x: {
                "延期清算": 1,
                "提前清算": 1,
                "投顾协议已终止": 1,
                "正常清算": 0
            }.get(x), "fund_status_amac")),
            ("liquidation_cause", (lambda x: {
                "延期清算": "延期清算",
                "提前清算": "提前清算",
                "投顾协议已终止": "投顾协议已终止"
            }.get(x), "fund_status_amac")),
            ("fund_status_amac", {
                "正常清算": "终止",
                "正在运作": "运行中",
                "延期清算": "终止",
                "提前清算": "终止",
                "投顾协议已终止": "终止"
            }),
            ("currency_type", (lambda x: clean_currency(x).get("type"),
                               "currency_name_amac")),
            ("currency_name_amac",
             lambda x: clean_currency(x).get("currency")),
        ]))

    cln = transform.CleanWrongToNone({"reg_code_amac": ""}, repls=None)

    # jn = transform.Join(
    #     inp2, left_on="fund_id", right_on="source_id"
    # )

    dd = transform.SortDropduplicate(sort_by=["version", "matched_id"],
                                     ascending=[False, True],
                                     subset="matched_id",
                                     keep="first")

    ac = transform.AddConst({"source_id": "010003"})

    km = transform.MapSelectKeys({
        # "version": None,
        "matched_id": "fund_id",
        "source_id": None,
        "fund_name_amac": "fund_full_name",
        "reg_code_amac": "reg_code",
        "reg_time_amac": "reg_time",
        "foundation_date_amac": "foundation_date",
        "currency_name_amac": "currency",
        "currency_type": None,
        "fund_status_amac": "fund_status",
        "is_abnormal_liquidation": None,
        "liquidation_cause": None
    })

    stream = Stream(inp, (vm, cln, dd, ac, km), name="x_fund_info_private")
    return stream