Пример #1
0
    def stream_020002_investment_years(cls):
        """
            清洗 d_person_info;

        """

        sql = "select idm.matched_id, dd.investment_years FROM \
                (SELECT matched_id,source,source_id FROM base.id_match \
                where id_type = 3 and is_used = 1 AND source = '020002') as idm \
                JOIN \
                (SELECT * FROM \
                (select MAX(version) as mm,person_id as pi,source_id as si  \
                FROM crawl_private.d_person_info GROUP BY person_id) as ma \
                JOIN crawl_private.d_person_info as dpd \
                ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \
                ON dd.pi = idm.source_id \
                AND dd.si = idm.source"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            "investment_years": lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x,
        })

        sk = transform.MapSelectKeys({
            "matched_id": "person_id",
            'investment_years': 'investment_years'
        })
        dp = transform.Dropna(how="any", axis=0)

        s = Stream(inp, transform=[vm, sk, dp])
        return s
Пример #2
0
    def stream_020003_resume(cls):


        sql = "select idm.matched_id, dd.resume FROM \
                (SELECT matched_id,source,source_id FROM base.id_match \
                where id_type = 3 and is_used = 1 AND source = '020003') as idm \
                JOIN \
                (SELECT * FROM \
                (select MAX(version) as mm,person_id as pi,source_id as si \
                from crawl_private.d_person_description GROUP BY person_id) ma \
                JOIN crawl_private.d_person_description as dpd \
                ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \
                ON dd.pi = idm.source_id \
                AND dd.si = idm.source"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            "investment_years": (lambda x: cls.clean_01(x) if type(x) is str else x, 'resume')
        })

        sk = transform.MapSelectKeys({
            "matched_id": "person_id",
            'investment_years': 'investment_years'
        })
        dp = transform.Dropna(how='any', axis=0)
        s = Stream(inp, transform=[vm, sk, dp])
        return s
Пример #3
0
    def stream_020003_graduate_school(cls):
        """
            清洗 d_person_info;

        """

        sql = "select idm.matched_id, dd.graduate_school FROM \
                (SELECT matched_id,source,source_id FROM base.id_match \
                where id_type = 3 and is_used = 1 AND source = '020003') as idm \
                JOIN \
                (SELECT * FROM \
                (select MAX(version) as mm,person_id as pi,source_id as si  \
                FROM crawl_private.d_person_info GROUP BY person_id) as ma \
                JOIN crawl_private.d_person_info as dpd \
                ON dpd.person_id = ma.pi and ma.mm = dpd.version) dd \
                ON dd.pi = idm.source_id \
                AND dd.si = idm.source"

        inp = MysqlInput(ENGINE_RD, sql)

        sk = transform.MapSelectKeys({
            "matched_id": "person_id",
            'graduate_school': 'graduate_school'
        })
        dp = transform.Dropna(how='any', axis=0)
        s = Stream(inp, transform=[sk, dp])
        return s
Пример #4
0
    def stream_602_4(cls, c_cleaned):
        sql = "SELECT fi.fund_id, fi.fund_name " \
              "FROM base.fund_info fi " \
              "WHERE fi.fund_id NOT IN (SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602) "

        inp = MysqlInput(ENGINE, sql)

        # 非惰性求值
        fids = set(c_cleaned.dataframe["fund_id"])

        vm0 = transform.ValueMap({
            "__tmp": (lambda x: (60201, "非结构化", None, None) if x not in fids else (None, None, None, None), "fund_id")
        })

        vm1 = transform.ValueMap({
            FundTypeMappingImport.typestandard_code.name: 602,
            FundTypeMappingImport.typestandard_name.name: "按结构类型分类",
            FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"),
            FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"),
            FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"),
            FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"),
            FundTypeMappingImport.classified_by.name: CLASSIFIED_BY
        })

        sk = transform.MapSelectKeys(cls.FIXED_FIELDS)

        dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all")

        s = base.Stream(inp, [vm0, vm1, sk, dn])

        return s
Пример #5
0
def stream_000001():
    session = dbsession(bind=engine_r)
    stmt = session.query(YBondInfo).join(
        BondInfo, YBondInfo.bond_id == BondInfo.bond_id).filter(
            and_(YBondInfo.is_used == 1,
                 YBondInfo.is_used == 1)).with_entities(
                     YBondInfo.bond_id, YBondInfo.interest_type)
    inp = MysqlInput(session.bind, stmt)

    dn = transform.Dropna(subset=[YBondInfo.interest_type.name])

    km = transform.MapSelectKeys({
        YBondInfo.bond_id.name:
        BondInfo.bond_id.name,
        YBondInfo.source_id.name:
        None,
        YBondInfo.interest_type.name:
        BondInfo.interest_type.name,
    })

    s = Stream(inp, (
        dn,
        km,
    ))

    return s
Пример #6
0
    def stream_010005_ts103(cls):
        source_id = "010005"
        tb = "crawl_private.x_fund_info_futures"
        tmpsql = cls.SQL_103.format(sid=source_id, tb=tb)

        inp = MysqlInput(ENGINE, tmpsql)

        vm = transform.ValueMap(OrderedDict(
            [
                [FundTypeSource.type_code.name, (lambda x: cls.TS_MAPPER_103.get(x, [None, None])[0], "fund_type_target_amac")],
                [FundTypeSource.type_name.name, (lambda x: cls.TS_MAPPER_103.get(x, [None, None])[1], "fund_type_target_amac")],
                [FundTypeSource.typestandard_code.name, "103"],
                [FundTypeSource.typestandard_name.name, "按投资标的分类"],
                [FundTypeSource.source_id.name, source_id]
            ]
        ))

        sk = transform.MapSelectKeys(
            {
                "matched_id": FundTypeSource.fund_id.name,
                "fund_name": FundTypeSource.fund_name.name,
                FundTypeSource.source_id.name: None,
                FundTypeSource.typestandard_name.name: None,
                FundTypeSource.typestandard_code.name: None,
                FundTypeSource.type_code.name: None,
                FundTypeSource.type_name.name: None,
            }
        )

        dn = transform.Dropna(subset=[FundTypeSource.type_code.name])

        s = base.Stream(inp, transform=(vm, sk, dn))
        return s
Пример #7
0
    def stream_020001_ts304(cls):
        source_id = "020001"
        tb = "crawl_private.d_fund_info"
        tmpsql = cls.SQL_X04.format(tb=tb, sid=source_id)

        inp = MysqlInput(ENGINE, tmpsql)

        vm = transform.ValueMap(
            OrderedDict([
                [FundTypeSource.source_id.name, source_id],
                [FundTypeSource.typestandard_code.name, "104"],
                [FundTypeSource.typestandard_name.name, "按发行主体分类"],
                [
                    "__cached",
                    (lambda x: cls.TS_MAPPER_304.get(x, [None, None]),
                     "fund_type_issuance")
                ],
                [FundTypeSource.type_code.name, (lambda x: x[0], "__cached")],
                [FundTypeSource.type_name.name, (lambda x: x[1], "__cached")],
            ]))

        sk = transform.MapSelectKeys({
            "matched_id": FundTypeSource.fund_id.name,
            "fund_name": FundTypeSource.fund_name.name,
            FundTypeSource.source_id.name: None,
            FundTypeSource.typestandard_code.name: None,
            FundTypeSource.typestandard_name.name: None,
            FundTypeSource.type_code.name: None,
            FundTypeSource.type_name.name: None,
        })
        dn = transform.Dropna(subset=[FundTypeSource.type_code.name])

        s = base.Stream(inp, [vm, sk, dn])

        return s
Пример #8
0
    def stream_000001(cls):
        """
            清洗stock_info_010001 CSRC分类;

        """

        sql = "SELECT stock_id,type_csrc FROM crawl_finance.stock_info_010001"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            "type_csrc":
            lambda x: re.sub("\s", "",
                             x.split(",")[1]),
            "category_code": (lambda x: cls.TYPE_CODE.get(x), "type_csrc"),
            "category_name": (lambda x: cls.TYPE_NAME.get(x), "type_csrc"),
            "type_code": (lambda x: cls.STYPE_CODE.get(x), "type_csrc"),
            "stock_id":
            lambda x: str(cls.add_stock_suffix(x)) + ".SH"
        })

        sk = transform.MapSelectKeys({
            "stock_id": "stock_id",
            "category_name": "category_name",
            "category_code": "category_code",
            "type_code": "type_code",
            "type_csrc": "type_name"
        })

        dn = transform.Dropna(subset=None, axis=0, how='any')

        s = Stream(inp, transform=[vm, sk, dn])
        return s
Пример #9
0
    def stream_020003(cls, fund_ids):
        sql = "SELECT im.matched_id, dfp.statistic_date, dfp.subject_id, fpb.data_source, dfp.quantity " \
              "FROM base_public.id_match im " \
              "JOIN crawl_public.d_fund_position dfp " \
              "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source " \
              "JOIN base_test.fund_position_bond_test_20180515 fpb " \
              "ON im.matched_id = fpb.fund_id AND fpb.statistic_date = dfp.statistic_date AND fpb.subject_id = dfp.subject_id " \
              "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 " \
              "AND im.data_source = '020003'".format(fids=sf.SQL.values4sql(fund_ids))

        inp = MysqlInput(cls.engine, sql)

        vm = transform.ValueMap({
            "quantity": lambda x: cls._clean_amount(x),
        })

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "statistic_date": None,
            "subject_id": None,
            "quantity": None
        })

        dn = transform.Dropna(subset=["quantity"])

        return base.Stream(inp, transform=[vm, sk, dn])
Пример #10
0
def stream_010005():
    sql_fi = "SELECT im.matched_id as fund_id, fi.fund_name, fi.foundation_date, issuing_scale_amac " \
             "FROM crawl_private.{tb} xfi " \
             "JOIN (SELECT fund_id, MAX(version) latest_ver FROM crawl_private.{tb} GROUP BY fund_id) tb_latest " \
             "ON tb_latest.latest_ver = xfi.version AND tb_latest.fund_id = xfi.fund_id " \
             "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.source = '010005' AND im.id_type = 1 AND im.is_used = 1 " \
             "LEFT JOIN base.fund_info fi ON fi.fund_id = im.matched_id".format(tb="x_fund_info_futures")

    inp = MysqlInput(ENGINE_C, sql_fi)

    vm = transform.ValueMap({
        "issuing_scale_amac": lambda x: float(x),
    })

    vm2 = transform.ValueMap({
        "issuing_scale_amac":
        lambda x: None if x == 0 else x,
    })

    dn = transform.Dropna(
        subset=["fund_id", "foundation_date", "issuing_scale_amac"])

    sk = transform.MapSelectKeys({
        "fund_id":
        FundAssetScale.fund_id.name,
        "fund_name":
        FundAssetScale.fund_name.name,
        "issuing_scale_amac":
        FundAssetScale.asset_scale.name,
        "foundation_date":
        FundAssetScale.statistic_date.name,
    })

    s = base.Stream(inp, transform=(vm, vm2, dn, sk))
    return s
Пример #11
0
def stream_x_org_info():
    sql = "SELECT xoi.org_id, xoi.final_report_time, fund_num, fund_scale  FROM crawl_private.x_org_info xoi " \
          "JOIN (SELECT org_id, MAX(version) latest_ver FROM crawl_private.x_org_info WHERE is_used = 1 GROUP BY org_id) tb " \
          "ON xoi.org_id = tb.org_id AND xoi.version = tb.latest_ver"

    inp = MysqlInput(ENGINE, sql)

    vm = transform.ValueMap({OrgAssetScale.data_time.name: dt.date.today()})

    sk = transform.MapSelectKeys({
        "org_id":
        OrgAssetScale.org_id.name,
        "final_report_time":
        OrgAssetScale.statistic_date.name,
        "fund_num":
        OrgAssetScale.funds_num.name,
        "fund_scale":
        OrgAssetScale.asset_scale.name,
        OrgAssetScale.data_time.name:
        None
    })

    dn = transform.Dropna(subset=[
        OrgAssetScale.asset_scale.name, OrgAssetScale.funds_num.name,
        OrgAssetScale.statistic_date.name
    ],
                          how="all")

    s = base.Stream(inp, transform=(vm, sk, dn))
    return s
Пример #12
0
    def stream_010003_ts104(cls):
        source_id = "010003"
        tb = "crawl_private.x_fund_info_private"

        sql = "SELECT im.matched_id, xfi.type_name_amac, fi.fund_name " \
              "FROM {tb} xfi " \
              "JOIN (SELECT fund_id, MAX(version) latest_ver FROM {tb} GROUP BY fund_id) t " \
              "ON xfi.fund_id = t.fund_id AND xfi.version = t.latest_ver " \
              "JOIN (SELECT matched_id, source_id FROM base.id_match WHERE source = '{sid}' AND id_type = 1 AND is_used = 1) im " \
              "ON xfi.fund_id = im.source_id " \
              "LEFT JOIN fund_info fi ON im.matched_id = fi.fund_id".format(
            tb=tb, sid=source_id
        )

        inp = MysqlInput(ENGINE, sql)

        vm = transform.ValueMap(
            {
                FundTypeSource.source_id.name: source_id,
                FundTypeSource.typestandard_code.name: "104",
                FundTypeSource.typestandard_name.name: "按发行主体分类",
                FundTypeSource.type_code.name: (lambda x: cls.TS_MAPPER_104.get(x, [None, None])[0], "type_name_amac"),
                FundTypeSource.type_name.name: (lambda x: cls.TS_MAPPER_104.get(x, [None, None])[1], "type_name_amac")
            }
        )

        dn = transform.Dropna(subset=[FundTypeSource.fund_id.name, FundTypeSource.type_code.name])

        s = base.Stream(inp, [vm, cls.SK, dn])
        return s
Пример #13
0
    def stream_consultant_000001(cls):
        sql = "SELECT fund_id, fund_name, oi.org_id, oi.org_name, org_type_code, start_date, end_date, is_current " \
              "FROM crawl_private.y_fund_org_mapping yfom " \
              "JOIN base.org_info oi ON yfom.org_id = oi.org_id " \
              "WHERE org_type_code = 1 AND is_used = 1"

        inp = MysqlInput(ENGINE_C, sql)

        vm = transform.ValueMap({
            "org_type": "投资顾问"
        })

        sk = transform.MapSelectKeys({
            "fund_id": None,
            "fund_name": None,
            "org_id": None,
            "org_name": None,
            "org_type": None,
            "org_type_code": None,
            "start_date": None,
            "end_date": None,
            "is_current": None,
        })
        dn = transform.Dropna(
            subset=[FundOrgMapping.fund_id.name, FundOrgMapping.org_id.name, FundOrgMapping.org_type_code.name]
        )
        s = Stream(inp, transform=[vm, sk, dn])
        return s
Пример #14
0
    def stream_securities(cls):
        """
            清洗 x_info_securities(010004);

        """

        sql = " \
               SELECT id.matched_id, ac.orientation_amac FROM ( \
                SELECT A.matched_id, A.source_id FROM  \
                (SELECT matched_id, source_id, entry_time FROM base.id_match \
                WHERE source = '010004' and is_used = 1 and id_type = 1 ORDER BY entry_time DESC) \
                 AS A  GROUP BY A.matched_id) AS id  \
                JOIN \
                (SELECT * FROM (SELECT MAX(version) mm ,fund_id as maid FROM crawl_private.x_fund_info_securities \
                GROUP BY fund_id) AS p \
                JOIN crawl_private.x_fund_info_securities as ff \
                on ff.fund_id = p.maid and ff.version = p.mm) AS ac \
                ON ac.maid = id.source_id"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            'orientation_amac':
            lambda x: cls.sub_wrong_to_none(x) if type(x) is str else x
        })
        dr = transform.Dropna(axis=0, how="any")

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "orientation_amac": "investment_range"
        })

        s = Stream(inp, transform=[vm, dr, sk])
        return s
Пример #15
0
    def stream_manager_010003(cls, d_xoi=None):
        sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, type_name_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \
                     "FROM x_fund_info_private xfi " \
                     "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_private GROUP BY fund_id) tb_latest " \
                     "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \
                     "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010003' AND im.is_used = 1 " \
                     "LEFT JOIN base.fund_info fi " \
                     "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010003' " \
                     "WHERE fund_name_amac NOT LIKE '%%信托计划'"

        inp = MysqlInput(ENGINE_C, sql_maxver)

        inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info")

        def clean_org_type(mng_tp, tp):
            # 私募证券投资基金 = 证券投资基金; 其他私募投资基金 = 其他投资基金
            if mng_tp == "顾问管理":
                if tp in {"私募证券投资基金", "证券投资基金"}:
                    return "基金管理人"

            elif mng_tp == "受托管理":
                if tp in {"私募证券投资基金", "证券投资基金", "创业投资基金", "其他私募投资基金", "其他投资基金", "股权投资基金"}:
                    return "基金管理人"

            elif mng_tp == "自我管理":
                if tp in {"私募证券投资基金", "证券投资基金", "股权投资基金", "创业投资基金", "其他私募投资基金", "其他投资基金"}:
                    return "基金管理人"

        if d_xoi is None:
            d_xoi = cls._d_oi_manager()

        vm = transform.ValueMap(
            OrderedDict([
                ("fund_issue_org_amac", lambda x: x.strip()),
                ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")),
                (FundOrgMapping.org_type.name,
                 (lambda mng_tp, tp: clean_org_type(mng_tp, tp), "manage_type_amac", "type_name_amac")),
                (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status"))
            ])
        )

        jn = transform.Join(inp2, how="left", on="org_id")

        dn = transform.Dropna(subset=["matched_id", "org_id", FundOrgMapping.org_type.name])

        vm2 = transform.ValueMap({
            FundOrgMapping.org_type_code.name: (lambda x: 2 if x == "基金管理人" else None, FundOrgMapping.org_type.name),
        })

        sk = transform.MapSelectKeys(cls.FIELDS)

        s = Stream(inp, transform=(vm, jn, dn, vm2, sk))
        return s
Пример #16
0
    def stream_010101(cls):
        sql = "SELECT im.matched_id as person_id, pi.person_name, xpc.status, xpc.org_name as org_name_ " \
              "FROM crawl_private.x_person_certificate_010101 xpc " \
              "JOIN base.id_match im ON xpc.person_id = im.source_id " \
              "LEFT JOIN base.person_info pi ON im.matched_id = pi.person_id " \
              "WHERE im.source = '010101' AND im.id_type = 3 AND im.is_used = 1"

        def org_name_dict():
            import pandas as pd
            sql_oi = "SELECT org_full_name, org_id, org_name FROM base.org_info"
            res = pd.read_sql(sql_oi, ENGINE)
            v = tuple([(oid, oname)
                       for oid, oname in zip(res["org_id"], res["org_name"])])
            res = dict(zip(res["org_full_name"], v))
            return res

        inp = MysqlInput(ENGINE, sql)

        d = org_name_dict()

        vm = transform.ValueMap({
            OrgPersonMapping.org_id.name: (lambda x: d.get(x)[0], "org_name_"),
            OrgPersonMapping.org_name.name:
            (lambda x: d.get(x)[1], "org_name_"),
            OrgPersonMapping.is_current.name: (lambda x: {
                "正常": 1,
                "离职": 0
            }.get(x), "status"),
            OrgPersonMapping.duty.name:
            None
        })

        sk = transform.MapSelectKeys({
            "person_id":
            OrgPersonMapping.person_id.name,
            "person_name":
            OrgPersonMapping.person_name.name,
            OrgPersonMapping.org_id.name:
            None,
            "org_name":
            OrgPersonMapping.org_name.name,
            OrgPersonMapping.is_current.name:
            None,
            OrgPersonMapping.duty.name:
            None
        })

        dn = transform.Dropna(subset=[
            OrgPersonMapping.person_id.name, OrgPersonMapping.org_id.name
        ])

        s = base.Stream(inp, [vm, sk, dn])
        return s
Пример #17
0
def stream_020001():
    session = dbsession(bind=engine_r)
    stmt = session.query(DBondInfo).join(
        BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter(
            DBondInfo.source_id == "020001").with_entities(
                DBondInfo.bond_id, DBondInfo.source_id,
                DBondInfo.bond_full_name, DBondInfo.interest_type,
                DBondInfo.interest_freq)
    inp = MysqlInput(session.bind, stmt)

    def clean_interest_freq(type_, freq, fname):
        if fname.find("贴现") >= 0:
            return "贴现"

        if type_ in {"贴现", "零息"}:
            return "贴现"
        else:
            d_type_freq = {
                "固定": {
                    "年付": "年付息",
                    "半年付": "半年付息",
                    "季付": "季度付息",
                    "月付": "月付息",
                    "贴现": "贴现"
                },
                "浮动": {
                    "年付": "年付息",
                    "半年付": "半年付息",
                    "季付": "季度付息",
                    "月付": "月付息",
                }
            }
            return d_type_freq.get(type_).get(freq)

    vm = transform.ValueMap({
        DBondInfo.interest_freq.name:
        (lambda type_, freq, fname: clean_interest_freq(type_, freq, fname),
         DBondInfo.interest_type.name, DBondInfo.interest_freq.name,
         DBondInfo.bond_full_name.name)
    })

    dn = transform.Dropna(subset=[BondInfo.interest_freq.name])

    km = transform.MapSelectKeys({
        DBondInfo.bond_id.name: BondInfo.bond_id.name,
        BondInfo.interest_freq.name: None,
    })

    s = Stream(inp, (vm, dn, km))

    return s
Пример #18
0
    def stream_020002_ts401(cls):
        source_id = "020002"
        tb = "crawl_private.d_fund_info"
        tmpsql = cls.SQL_X01.format(tb=tb, sid=source_id)

        inp = MysqlInput(ENGINE, tmpsql)

        vm0 = transform.ValueMap({
            "type_name":
            lambda x: "" if x.replace("-", "") == "" else x,
            "stype_name":
            lambda x: "" if x.replace("-", "") == "" else x,
        })

        vm = transform.ValueMap(
            OrderedDict([
                [FundTypeSource.source_id.name, source_id],
                [FundTypeSource.typestandard_code.name, "101"],
                [FundTypeSource.typestandard_name.name, "按投资策略分类"],
                [
                    "__cached",
                    (lambda tn, stn: cls.TS_MAPPER_401.get(
                        (tn, stn), [None] * 4), "type_name", "stype_name")
                ],
                [FundTypeSource.type_code.name, (lambda x: x[0], "__cached")],
                [FundTypeSource.type_name.name, (lambda x: x[2], "__cached")],
                [FundTypeSource.stype_code.name, (lambda x: x[1], "__cached")],
                [FundTypeSource.stype_name.name, (lambda x: x[3], "__cached")],
            ]))

        sk = transform.MapSelectKeys({
            "matched_id": FundTypeSource.fund_id.name,
            "fund_name": FundTypeSource.fund_name.name,
            FundTypeSource.source_id.name: None,
            FundTypeSource.typestandard_code.name: None,
            FundTypeSource.typestandard_name.name: None,
            FundTypeSource.type_code.name: None,
            FundTypeSource.type_name.name: None,
            FundTypeSource.stype_code.name: None,
            FundTypeSource.stype_name.name: None,
        })
        dn = transform.Dropna(subset=[
            FundTypeSource.type_code.name, FundTypeSource.stype_code.name
        ],
                              how="all")

        s = base.Stream(inp, [vm0, vm, sk, dn])

        return s
Пример #19
0
    def stream_020001(cls):
        """
            清洗 fund_custodian;

        """
        dict_01 = {'中国银行(香港)有限公司': '02000001'}  #往这里可以添加需要的匹配关系

        dict_020001 = dict(cls.org_full_name, **dict_01)

        sql = "SELECT idh.matched_id, ff.fund_custodian, fi.fund_name FROM ( \
                SELECT matched_id,source_id FROM base_public.id_match \
                where id_type = 1 AND is_used = 1 AND data_source = '020001') as idh \
                JOIN \
                (SELECT * FROM ( \
                SELECT MAX(version) as mm, fund_id as id FROM crawl_public.d_fund_info GROUP BY fund_id) as idm \
                JOIN crawl_public.d_fund_info as df \
                ON idm.id = df.fund_id AND idm.mm = df.version \
                WHERE df.data_source = '020001') AS ff \
                ON ff.id = idh.matched_id \
                JOIN base_public.fund_info as fi \
                ON fi.fund_id = idh.matched_id \
                where idh.matched_id <> '777777'"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            "org_id": (lambda x: dict_020001.get(x), "fund_custodian"),
        })

        vm2 = transform.ValueMap({
            "type_code": (lambda x: cls.type_code.get(x), "org_id"),
        })

        vm3 = transform.ValueMap({
            "type_name": (lambda x: cls.type_name.get(x), "type_code"),
        })

        dr = transform.Dropna(axis=0, how="any")

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "org_id": "org_id",
            "fund_custodian": "org_name",
            "fund_name": "fund_name",
            "type_code": "type_code",
            "type_name": "type_name"
        })
        s = Stream(inp, transform=[vm, vm2, vm3, dr, sk])
        return s
Пример #20
0
    def stream_consultant_010004(cls, d_xoi=None):
        """
            清洗010004源(证券公司)发行产品的投顾公司配对信息;

        Args:
            d_xoi:

        Returns:

        """

        sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, manage_type_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \
                     "FROM x_fund_info_securities xfi " \
                     "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_securities GROUP BY fund_id) tb_latest " \
                     "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \
                     "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010004' AND im.is_used = 1 " \
                     "LEFT JOIN base.fund_info fi ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010004'" \
                     "WHERE fund_name_amac NOT LIKE '%%信托计划'"

        inp = MysqlInput(ENGINE_C, sql_maxver)

        inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info")

        if d_xoi is None:
            d_xoi = cls._d_oi_consultant()

        vm = transform.ValueMap(
            OrderedDict([
                ("fund_issue_org_amac", lambda x: x.strip()),
                ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")),
                (FundOrgMapping.org_type.name, (lambda x: {"主动管理": "投资顾问"}.get(x), "manage_type_amac")),
                (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status"))
            ])
        )

        jn = transform.Join(inp2, how="left", on="org_id")

        dn = transform.Dropna(subset=["matched_id", "org_id", FundOrgMapping.org_type.name])

        vm2 = transform.ValueMap({
            FundOrgMapping.org_type_code.name: (lambda x: 1 if x == "投资顾问" else None, FundOrgMapping.org_type.name),
        })

        dd = transform.DropDuplicate(subset=["matched_id", "org_id", FundOrgMapping.org_type.name])

        sk = transform.MapSelectKeys(cls.FIELDS)
        s = Stream(inp, transform=[vm, jn, dn, vm2, dd, sk])  # 先去空, 避免空值产生, 导致整形列变成浮点型
        return s
Пример #21
0
def stream_010001():
    session = dbsession(bind=engine_r)
    stmt = session.query(DBondInfo).join(
        BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter(
            DBondInfo.source_id == "010001").with_entities(
                DBondInfo.bond_id, DBondInfo.source_id,
                DBondInfo.bond_full_name, DBondInfo.interest_type,
                DBondInfo.interest_freq)
    inp = MysqlInput(session.bind, stmt)

    def clean_interest_freq(type_, freq, fname):
        if fname.find("贴现") >= 0:
            return "贴现"

        if type_ in {"零息", "贴现"}:
            return "贴现"
        elif type_ == "一次还本息":
            return "到期一次还本付息"
        else:

            d_freq = {
                "到期一次还本付息":
                "到期一次还本付息",  # 除去type_为"零息"的情况, 其余freq为"到期一次还本付息"映射为"到期一次还本付息"
                "按年付息": "年付息",
                "按半年付息": "半年付息",
                "按季度付息": "季度付息",
                "按月付息": "月付息",
            }
            return d_freq.get(freq)

    vm = transform.ValueMap({
        BondInfo.interest_freq.name:
        (lambda type_, freq, fname: clean_interest_freq(type_, freq, fname),
         DBondInfo.interest_type.name, DBondInfo.interest_freq.name,
         DBondInfo.bond_full_name.name)
    })

    dn = transform.Dropna(subset=[BondInfo.interest_freq.name])

    km = transform.MapSelectKeys({
        DBondInfo.bond_id.name: BondInfo.bond_id.name,
        BondInfo.interest_freq.name: None,
    })

    s = Stream(inp, (vm, dn, km))

    return s
Пример #22
0
    def stream_602_2(cls):
        sql = "SELECT fi.fund_id, fi.fund_name, fi.fund_full_name " \
              "FROM base.fund_info fi " \
              "WHERE fi.fund_id NOT IN (SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602) "

        inp = MysqlInput(ENGINE, sql)

        def clean_tc(fund_full_name):
            patt = "优先级|(?<!\w)A级|劣后级|进取级|B级|普通级|次级|风险级|中间级|管理级|结构化|分级"  # A级前面不为数字和字母
            matched = set(re.findall(patt, fund_full_name))

            if ("优先级" in matched) or ("A级" in matched):
                return 60202, "结构化", 6020202, "优先级"
            elif ("劣后级" in matched) or ("进取级" in matched)\
                    or ("B级" in matched) or ("普通级" in matched)\
                    or ("次级" in matched) or ("风险级" in matched):
                return 60202, "结构化", 6020203, "进取级/劣后级"
            elif "中间级" in matched:
                return 60202, "结构化", 6020204, "中间级(夹层)"
            elif "管理级" in matched:
                return 60202, "结构化", 6020205, "管理型份额"
            elif ("结构化" in matched) or ("分级" in matched):
                return 60202, "结构化", None, None

            return None, None, None, None

        vm0 = transform.ValueMap({
            "__tmp": (lambda x: clean_tc(x), "fund_full_name")
        })

        vm = transform.ValueMap({
            FundTypeMappingImport.typestandard_code.name: 602,
            FundTypeMappingImport.typestandard_name.name: "按结构类型分类",
            FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"),
            FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"),
            FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"),
            FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"),
            FundTypeMappingImport.classified_by.name: CLASSIFIED_BY
        })

        sk = transform.MapSelectKeys(cls.FIXED_FIELDS)

        dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all")

        s = base.Stream(inp, [vm0, vm, sk, dn])
        return s
Пример #23
0
def stream_010001():
    session = dbsession(bind=engine_r)
    stmt = session.query(DBondInfo).join(
        BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter(
            and_(DBondInfo.source_id == "010001",
                 DBondInfo.is_used == 1)).with_entities(
                     DBondInfo.bond_id, DBondInfo.source_id,
                     DBondInfo.bond_full_name, DBondInfo.interest_type)
    inp = MysqlInput(session.bind, stmt)

    def clean_interest_type(type_, fname):
        if fname.find("贴现") >= 0:
            return "零息"

        else:
            d_type = {
                "零息": "零息",
                "固定利息": "固定利率",
                "浮动利息": "浮动利率",
            }

            res = d_type.get(type_)
            if res is None:
                if fname.find("短期融资券") >= 0:
                    return "固定利率"
            return res

    vm = transform.ValueMap({
        BondInfo.interest_type.name:
        (lambda type_, fname: clean_interest_type(type_, fname),
         DBondInfo.interest_type.name, DBondInfo.bond_full_name.name)
    })

    dn = transform.Dropna(subset=[BondInfo.interest_type.name])

    km = transform.MapSelectKeys({
        DBondInfo.bond_id.name: BondInfo.bond_id.name,
        BondInfo.interest_type.name: None
    })

    s = Stream(inp, (vm, dn, km))

    return s
Пример #24
0
    def stream_manager_010005(cls, d_xoi=None):
        """
            清洗010005源(期货公司)发行产品的基金管理人配对信息;

        Args:
            d_xoi:

        Returns:

        """

        sql_maxver = "SELECT im.matched_id, xfi.fund_issue_org_amac, fi.fund_name, fi.foundation_date, fi.end_date, fi.fund_status " \
                     "FROM x_fund_info_futures xfi " \
                     "JOIN (SELECT fund_id, MAX(version) latest_ver FROM x_fund_info_futures GROUP BY fund_id) tb_latest " \
                     "ON xfi.version = tb_latest.latest_ver AND xfi.fund_id = tb_latest.fund_id " \
                     "JOIN base.id_match im ON im.source_id = xfi.fund_id AND im.id_type = 1 AND im.source = '010005' AND im.is_used = 1 " \
                     "LEFT JOIN base.fund_info fi " \
                     "ON im.matched_id = fi.fund_id AND im.id_type = 1 AND im.source = '010005' " \
                     "WHERE fund_name_amac NOT LIKE '%%信托计划'"

        inp = MysqlInput(ENGINE_C, sql_maxver)

        inp2 = MysqlInput(ENGINE_B, "SELECT org_id, org_name FROM base.org_info")

        if d_xoi is None:
            d_xoi = cls._d_oi_manager()

        vm = transform.ValueMap(OrderedDict([
            ("fund_issue_org_amac", lambda x: x.strip()),
            ("org_id", (lambda x: d_xoi.get(x), "fund_issue_org_amac")),
            (FundOrgMapping.org_type.name, "基金管理人"),  # 该来源的fund_issue_org_amac都是基金管理人;
            (FundOrgMapping.org_type_code.name, 2),
            (FundOrgMapping.is_current.name, (lambda x: cls._clean_iscurrent(x), "fund_status"))
        ]))

        jn = transform.Join(inp2, how="left", on="org_id")

        sk = transform.MapSelectKeys(cls.FIELDS)
        dn = transform.Dropna(
            subset=[FundOrgMapping.fund_id.name, FundOrgMapping.org_id.name, FundOrgMapping.org_type_code.name])

        s = Stream(inp, transform=(vm, jn, sk, dn,))
        return s
Пример #25
0
def stream_xof_manager(s):
    vm = transform.ValueMap(
        OrderedDict([
            (FundOrgMapping.org_type_code.name, (lambda x: {"受托管理": 2, "自我管理": 2}.get(x), "org_type")),
            (FundOrgMapping.org_type.name, (lambda x: {"受托管理": "基金管理人", "自我管理": "基金管理人"}.get(x), "org_type"))
        ])
    )

    dn = transform.Dropna()
    sk = transform.MapSelectKeys(
        {
            "fund_id": FundOrgMapping.fund_id.name,
            "org_id": FundOrgMapping.org_id.name,
            FundOrgMapping.org_type.name: None,
            FundOrgMapping.org_type_code.name: None,
            "fund_name": FundOrgMapping.fund_name.name,
            "org_name": FundOrgMapping.org_name.name
        }
    )
    s = Stream(s, transform=(vm, dn, sk))
    return s
Пример #26
0
    def stream_010002_ts102(cls):
        source_id = "010002"
        tb = "crawl_private.x_fund_info_fundaccount"
        tmpsql = cls.SQL_102.format(tb=tb, sid=source_id)

        inp = MysqlInput(ENGINE, tmpsql)

        vm = transform.ValueMap(
            {
                FundTypeSource.source_id.name: source_id,
                FundTypeSource.typestandard_code.name: "102",
                FundTypeSource.typestandard_name.name: "按结构类型分类",
                FundTypeSource.type_code.name: (lambda x: cls.TS_MAPPER_102.get(x, [None, None])[0], "fund_type_allocation_amac"),
                FundTypeSource.type_name.name: (lambda x: cls.TS_MAPPER_102.get(x, [None, None])[1], "fund_type_allocation_amac"),
            }
        )

        dn = transform.Dropna(subset=[FundTypeSource.fund_id.name, FundTypeSource.type_code.name])

        s = base.Stream(inp, [vm, cls.SK, dn])
        return s
Пример #27
0
def stream_xof_consultant(s):
    vm = transform.ValueMap(
        OrderedDict([
            (FundOrgMapping.org_type_code.name, (lambda x: {"顾问管理": 1, "自我管理": 1}.get(x), "org_type")),
            (FundOrgMapping.org_type.name, (lambda x: {"顾问管理": "投资顾问", "自我管理": "投资顾问",}.get(x), "org_type"))
        ])
    )

    dn = transform.Dropna()
    sk = transform.MapSelectKeys(
        {
            "fund_id": FundOrgMapping.fund_id.name,
            "org_id": FundOrgMapping.org_id.name,
            FundOrgMapping.org_type.name: None,
            FundOrgMapping.org_type_code.name: None,
            "fund_name": FundOrgMapping.fund_name.name,
            "org_name": FundOrgMapping.org_name.name
        }
    )
    s = Stream(s, transform=(vm, dn, sk))
    return s
Пример #28
0
def stream_020003():
    session = dbsession(bind=ENGINE)

    query_oh = session.query(IdMatch).join(
        DFundInfo, and_(IdMatch.id_type == 1, IdMatch.source_id == DFundInfo.fund_id, IdMatch.data_source == DFundInfo.data_source, IdMatch.data_source == "020003")
    ).outerjoin(
        FundInfo, and_(IdMatch.id_type == 1, IdMatch.matched_id == FundInfo.fund_id)
    ).filter(
        IdMatch.is_used == 1
    ).with_entities(
        FundInfo.fund_id, FundInfo.fund_name, IdMatch.data_source, DFundInfo.fund_type
    )
    inp = MysqlInput(session.bind, query_oh)

    vm = transform.ValueMap(
        OrderedDict([
            [FundTypeMappingSource.type_code.name, (lambda ds, tp: etl.EnumMap.DFundInfo.type_code4.get((ds, tp)), DFundInfo.data_source.name, DFundInfo.fund_type.name)],
            [FundTypeMappingSource.type_name.name, (lambda tc: etl.Enum.TypeMapping.type_name.get(tc), FundTypeMappingSource.type_code.name)],
            [FundTypeMappingSource.typestandard_code.name, (lambda tc: tc[:-2], FundTypeMappingSource.type_code.name)],
            [FundTypeMappingSource.typestandard_name.name, (lambda tsc: etl.Enum.TypeMapping.typestandard_name.get(tsc), FundTypeMappingSource.typestandard_code.name)]
        ])
    )

    dn = transform.Dropna(subset=[FundInfo.fund_id.name, FundTypeMappingSource.typestandard_code.name])

    sk = transform.MapSelectKeys(
        {
            FundInfo.fund_id.name: None,
            FundInfo.fund_name.name: None,
            IdMatch.data_source.name: FundTypeMappingSource.data_source.name,
            FundTypeMappingSource.typestandard_code.name: None,
            FundTypeMappingSource.typestandard_name.name: None,
            FundTypeMappingSource.type_code.name: None,
            FundTypeMappingSource.type_name.name: None,
        }
    )

    s = base.Stream(inp, transform=(vm, dn, sk))
    return s
Пример #29
0
def stream_020003():
    session = dbsession(bind=engine_r)
    stmt = session.query(DBondInfo).join(
        BondInfo, DBondInfo.bond_id == BondInfo.bond_id).filter(
            DBondInfo.source_id == "020003").with_entities(
                DBondInfo.bond_id, DBondInfo.source_id,
                DBondInfo.bond_full_name, DBondInfo.interest_type,
                DBondInfo.interest_freq)
    inp = MysqlInput(session.bind, stmt)

    def clean_interest_freq(freq, fname):
        if fname.find("贴现") >= 0:
            return "贴现"

        d_freq = {
            "到期一次还本付息": "到期一次还本付息",
            "贴现": "贴现",
            "按年付息": "年付息",
            "半年付息": "半年付息",
            "按季付息": "季度付息",
        }
        return d_freq.get(freq)

    vm = transform.ValueMap({
        DBondInfo.interest_freq.name:
        (lambda freq, fname: clean_interest_freq(freq, fname),
         DBondInfo.interest_freq.name, DBondInfo.bond_full_name.name)
    })

    dn = transform.Dropna(subset=[BondInfo.interest_freq.name])

    km = transform.MapSelectKeys({
        DBondInfo.bond_id.name: BondInfo.bond_id.name,
        BondInfo.interest_freq.name: None,
    })

    s = Stream(inp, (vm, dn, km))

    return s
Пример #30
0
    def stream_602_3(cls):
        sql = "SELECT fi.fund_id, fi.fund_name, fts.type_code FROM fund_info fi " \
              "JOIN fund_type_source fts ON fi.fund_id = fts.fund_id " \
              "WHERE fi.fund_id NOT IN (" \
              "SELECT DISTINCT fund_id FROM base.fund_type_mapping_import WHERE typestandard_code = 602)" \
              "AND fts.type_code IN (10201, 10202, 30201, 30202)"

        inp = MysqlInput(ENGINE, sql)

        structured, unstructured = (60202, "结构化", None, None), (60201, "非结构化", None, None)

        d = {
            10201: structured,
            10202: unstructured,
            30201: structured,
            30202: unstructured
        }

        vm0 = transform.ValueMap({
            "__tmp": (lambda x:  d.get(x, (None, None, None, None)), "type_code")
        })

        vm = transform.ValueMap({
            FundTypeMappingImport.typestandard_code.name: 602,
            FundTypeMappingImport.typestandard_name.name: "按结构类型分类",
            FundTypeMappingImport.type_code.name: (lambda x: x[0], "__tmp"),
            FundTypeMappingImport.type_name.name: (lambda x: x[1], "__tmp"),
            FundTypeMappingImport.stype_code.name: (lambda x: x[2], "__tmp"),
            FundTypeMappingImport.stype_name.name: (lambda x: x[3], "__tmp"),
            FundTypeMappingImport.classified_by.name: CLASSIFIED_BY
        })

        sk = transform.MapSelectKeys(cls.FIXED_FIELDS)

        dn = transform.Dropna(subset=[FundTypeMappingImport.type_code.name, FundTypeMappingImport.stype_code.name], how="all")

        s = base.Stream(inp, [vm0, vm, sk, dn])
        return s