예제 #1
0
def stream_020001_op():
    """Build the stream that cleans person names from ``d_org_person``.

    The query keeps, for every ``person_id``, the row whose ``version`` equals
    the maximum version of that person, and joins it to ``base.id_match``
    (``id_type = 3``, ``source = '020001'``, ``is_used = 1``) to obtain
    ``matched_id``.

    Returns:
        A ``base.Stream`` reading from ``MysqlInput(ENGINE, ...)`` with a
        ``ValueMap`` transform followed by a ``MapSelectKeys`` transform.
    """
    query = (
        "SELECT im.matched_id, person_name FROM crawl_private.d_org_person tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_org_person GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020001' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE, query)

    def initials_of(name):
        # Concatenate the first element of every item py() returns, upper-cased.
        pieces = py(name, style=py_style.FIRST_LETTER)
        return "".join(piece[0] for piece in pieces).upper()

    # Presumably each entry reads "target key -> (function, source key)";
    # confirm against transform.ValueMap.
    derive_py = transform.ValueMap(
        OrderedDict([
            (PersonInfo.person_name_py.name, (initials_of, "person_name")),
        ]))

    # Presumably maps source keys to output names, with None keeping the key
    # unchanged; confirm against transform.MapSelectKeys.
    select_keys = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        "person_name": PersonInfo.person_name.name,
        PersonInfo.person_name_py.name: None,
    })

    return base.Stream(source, transform=(derive_py, select_keys))
예제 #2
0
파일: person_info.py 프로젝트: dxcv/fund
    def stream_y_person_info(cls):
        """Build the stream that cleans ``crawl_private.y_person_info``.

        Adds a ``person_name_py`` value computed from ``person_name`` (the
        upper-cased first letters returned by ``py``) and then selects the
        output keys.

        Returns:
            A ``Stream`` reading from ``MysqlInput(ENGINE_RD, ...)`` with a
            ``ValueMap`` transform followed by a ``MapSelectKeys`` transform.
        """
        query = (
            "SELECT person_id, person_name, gender, background, education,"
            "graduate_school, investment_years FROM crawl_private.y_person_info"
        )
        source = MysqlInput(ENGINE_RD, query)

        def to_initials(name):
            # First element of every item py() yields, joined and upper-cased.
            return "".join(
                part[0] for part in py(name, style=py_style.FIRST_LETTER)
            ).upper()

        add_pinyin = transform.ValueMap({
            "person_name_py": (to_initials, "person_name"),
        })

        # Every selected key keeps its own name.
        # NOTE(review): ``education`` is selected by the query but is not
        # listed here - confirm whether leaving it out is intended.
        kept_columns = (
            "person_id",
            "person_name",
            "person_name_py",
            "gender",
            "background",
            "graduate_school",
            "investment_years",
        )
        select_keys = transform.MapSelectKeys(
            {column: column for column in kept_columns})

        return Stream(source, transform=[add_pinyin, select_keys])
예제 #3
0
파일: person_info.py 프로젝트: dxcv/fund
    def stream_020003(cls):
        """Build the stream that cleans ``crawl_private.d_person_info``.

        The query keeps, for every ``person_id``, the row whose ``version``
        equals that person's maximum version, and joins it to
        ``base.id_match`` (``id_type = 3``, ``source = '020003'``,
        ``is_used = 1``) to obtain ``matched_id``.

        Returns:
            A ``Stream`` reading from ``MysqlInput(ENGINE_RD, ...)`` with a
            ``ValueMap`` transform followed by a ``MapSelectKeys`` transform.
        """
        query = (
            "SELECT im.matched_id, person_name, background FROM crawl_private.d_person_info tb_main "
            "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_person_info GROUP BY person_id) tb_latest "
            "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
            "JOIN base.id_match im ON im.source_id = tb_main.person_id "
            "AND im.id_type = 3 AND im.source = '020003' AND im.is_used = 1 "
        )
        source = MysqlInput(ENGINE_RD, query)

        def map_background(value):
            # Only exact ``str`` values are looked up in cls.BACKGROUND;
            # anything else is passed through untouched.
            if type(value) is str:
                return cls.BACKGROUND.get(value)
            return value

        def to_initials(name):
            # First element of every item py() yields, joined and upper-cased.
            return "".join(
                part[0] for part in py(name, style=py_style.FIRST_LETTER)
            ).upper()

        cleaners = transform.ValueMap({
            "background": map_background,
            "person_name_py": (to_initials, "person_name"),
        })

        select_keys = transform.MapSelectKeys({
            "matched_id": "person_id",
            "background": "background",
            "person_name": "person_name",
            "person_name_py": "person_name_py",
        })

        return Stream(source, transform=[cleaners, select_keys])
예제 #4
0
def stream_010001():
    """Build the stream that cleans ``crawl_private.x_org_executive_info``.

    The query keeps, for every ``person_id``, the row whose ``version`` equals
    that person's maximum version, and joins it to ``base.id_match``
    (``id_type = 3``, ``source = '010001'``, ``is_used = 1``) to obtain
    ``matched_id``.

    Values produced by the ``ValueMap``:

    * person name: ``name`` with any parenthesised annotation removed;
    * person name pinyin: upper-cased first letters returned by ``py``;
    * fund qualification way: ``qualifying_way`` restricted to the two known
      values (anything else becomes ``None``);
    * is-fund-qualification flag: ``int(bool(...))`` of the qualification way.

    Returns:
        A ``base.Stream`` reading from ``MysqlInput(ENGINE, ...)`` with a
        ``ValueMap`` transform followed by a ``MapSelectKeys`` transform.
    """
    import re

    # The parentheses must be literal. An unescaped "(.*)" is a capture group
    # that matches the whole string, which would turn every name into "".
    # Both full-width and ASCII parentheses are handled.
    parenthesised = re.compile(r"（.*）|\(.*\)")

    sql = "SELECT im.matched_id, `name`, qualifying_way FROM crawl_private.x_org_executive_info tb_main " \
          "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.x_org_executive_info GROUP BY person_id) tb_latest " \
          "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id " \
          "JOIN base.id_match im ON im.source_id = tb_main.person_id " \
          "AND im.id_type = 3 AND im.source = '010001' AND im.is_used = 1 "

    inp = MysqlInput(ENGINE, sql)

    # Presumably each entry reads "target key -> (function, source key)" and
    # entries are applied in order; confirm against transform.ValueMap.
    vm = transform.ValueMap(
        OrderedDict([
            [
                PersonInfo.person_name.name,
                (lambda x: parenthesised.sub("", x), "name")
            ],
            [
                PersonInfo.person_name_py.name,
                (lambda x: "".join(
                    [part[0]
                     for part in py(x, style=py_style.FIRST_LETTER)]).upper(),
                 PersonInfo.person_name.name)
            ],
            [
                PersonInfo.fund_qualification_way.name,
                (lambda x: {
                    "通过考试": "通过考试",
                    "资格认定": "资格认定"
                }.get(x), "qualifying_way")
            ],
            [
                PersonInfo.is_fund_qualification.name,
                (lambda x: int(bool(x)),
                 PersonInfo.fund_qualification_way.name)
            ],
        ]))

    sk = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        PersonInfo.person_name.name: None,
        PersonInfo.person_name_py.name: None,
        PersonInfo.fund_qualification_way.name: None,
        PersonInfo.is_fund_qualification.name: None
    })

    s = base.Stream(inp, transform=(vm, sk))

    return s
예제 #5
0
def stream_020002():
    """Build the stream that cleans ``crawl_private.d_person_info`` (source 020002).

    The query keeps, for every ``person_id``, the row whose ``version`` equals
    that person's maximum version, and joins it to ``base.id_match``
    (``id_type = 3``, ``source = '020002'``, ``is_used = 1``) to obtain
    ``matched_id``.

    Returns:
        A ``base.Stream`` reading from ``MysqlInput(ENGINE, ...)`` with a
        ``ValueMap`` transform followed by a ``MapSelectKeys`` transform.
    """
    query = (
        "SELECT im.matched_id, person_name, background FROM crawl_private.d_person_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_person_info GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020002' AND im.is_used = 1 "
    )
    inp = MysqlInput(ENGINE, query)

    # NOTE(review): both expressions below discard their result and look like
    # leftover exploration of the distinct ``background`` values. They are
    # kept because accessing ``dataframe`` may have side effects - confirm
    # before removing.
    inp.dataframe
    inp.dataframe["background"].drop_duplicates().tolist()

    # Known background labels; any value not listed falls back to "其他".
    background_labels = {
        '公募': '公募',
        '其它': "其他",
        '券商': "券商",
        '海外': "海外",
        "民间": "民间",
        "学者": "学者",
        '实业': "实业",
        '保险': "保险",
        '媒体': "媒体",
        '期货': '期货'
    }

    def trimmed(name):
        return name.strip()

    def to_initials(name):
        # First element of every item py() yields, joined and upper-cased.
        return "".join(
            part[0] for part in py(name, style=py_style.FIRST_LETTER)
        ).upper()

    def normalised_background(label):
        return background_labels.get(label, "其他")

    cleaners = transform.ValueMap(
        OrderedDict([
            (PersonInfo.person_name.name, trimmed),
            (PersonInfo.person_name_py.name, (to_initials, "person_name")),
            (PersonInfo.background.name, (normalised_background, "background")),
        ]))

    select_keys = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        "person_name": PersonInfo.person_name.name,
        PersonInfo.person_name_py.name: None,
        PersonInfo.background.name: None,
    })

    return base.Stream(inp, transform=(cleaners, select_keys))
예제 #6
0
파일: person_info.py 프로젝트: dxcv/fund
    def stream_010001(cls):
        """
        Build the stream that cleans ``crawl_private.x_org_executive_info``.

        The query keeps, for every ``person_id``, the row whose ``version``
        equals that person's maximum version, and joins it to
        ``base.id_match`` (``id_type = 3``, ``source = '010001'``,
        ``is_used = 1``) to obtain ``matched_id``.

        The first ``ValueMap`` strips parenthesised annotations from ``name``,
        restricts string ``qualifying_way`` values to the two known labels
        (other strings become ``None``; non-strings pass through) and derives
        ``person_name_py``. The second ``ValueMap`` derives
        ``is_fund_qualification`` as ``int(bool(qualifying_way))``.

        Returns:
            A ``Stream`` reading from ``MysqlInput(ENGINE_RD, ...)`` with the
            two ``ValueMap`` transforms followed by ``MapSelectKeys``.
        """

        sql = "SELECT im.matched_id, `name`, qualifying_way FROM crawl_private.x_org_executive_info tb_main " \
              "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.x_org_executive_info GROUP BY " \
              "person_id) tb_latest " \
              "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id " \
              "JOIN base.id_match im ON im.source_id = tb_main.person_id " \
              "AND im.id_type = 3 AND im.source = '010001' AND im.is_used = 1 "

        inp = MysqlInput(ENGINE_RD, sql)

        # The parentheses must be literal. An unescaped "(.*)" is a capture
        # group that matches the whole string, which would turn every name
        # into "". Both full-width and ASCII parentheses are handled.
        parenthesised = re.compile(r"（.*）|\(.*\)")

        vm = transform.ValueMap({
            "name": (lambda x: parenthesised.sub("", x), "name"),
            "qualifying_way":
            lambda x: {
                "通过考试": "通过考试",
                "资格认定": "资格认定"
            }.get(x) if type(x) is str else x,
            "person_name_py": (lambda x: "".join(
                [part[0] for part in py(x, style=py_style.FIRST_LETTER)]).upper(),
                               "name"),
        })

        # Kept as a separate step so that it runs after ``qualifying_way`` has
        # been normalised by the first map (presumably transforms run in list
        # order; confirm against Stream).
        vm2 = transform.ValueMap({
            "is_fund_qualification": (lambda x: int(bool(x)), "qualifying_way")
        })

        sk = transform.MapSelectKeys({
            "matched_id":
            "person_id",
            'qualifying_way':
            'fund_qualification_way',
            'name':
            'person_name',
            "person_name_py":
            "person_name_py",
            "is_fund_qualification":
            "is_fund_qualification"
        })
        s = Stream(inp, transform=[vm, vm2, sk])
        return s
예제 #7
0
def stream():
    """Build the stream that derives ``org_name_py`` for ``OrgInfo`` rows.

    Reads ``org_id``, ``org_name`` and ``org_name_py`` through a session
    bound to ``ENGINE_RD``, strips parenthesised annotations from the
    organisation name, derives the upper-cased first letters returned by
    ``py`` from the cleaned name, and selects ``org_id`` and ``org_name_py``.

    Returns:
        A ``Stream`` with two ``ValueMap`` transforms followed by a
        ``MapSelectKeys`` transform.
    """
    # NOTE(review): the session is not closed anywhere in this function.
    session = dbsession(bind=ENGINE_RD)
    query = session.query(OrgInfo).with_entities(OrgInfo.org_id,
                                                 OrgInfo.org_name,
                                                 OrgInfo.org_name_py)
    inp = MysqlInput(session.bind, query)

    # The parentheses must be literal. An unescaped "(.*)" alternative is a
    # capture group that matches the whole string and would blank every name.
    # A raw string also avoids the invalid "\(" escape of a normal literal.
    parenthesised = re.compile(r"（.*）|\(.*\)")

    vm1 = transform.ValueMap(
        {OrgInfo.org_name.name: lambda x: parenthesised.sub("", x)})

    vm2 = transform.ValueMap({
        OrgInfo.org_name_py.name:
        (lambda x: "".join(
            [part[0] for part in py(x, style=py_style.FIRST_LETTER)]).upper(),
         OrgInfo.org_name.name)
    })

    sk = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.org_name_py.name: None
    })
    s = Stream(inp, transform=(vm1, vm2, sk))
    return s
예제 #8
0
    def conflu_pinyin(cls):
        """Build the ``Confluence`` that derives ``org_name_py``.

        Reads ``org_id`` and ``org_name`` from ``TEST_TABLE``, strips
        parenthesised annotations from the organisation name, derives the
        upper-cased first letters returned by ``py`` from the cleaned name,
        and selects ``org_id`` and ``org_name_py``.

        Returns:
            A ``Confluence`` wrapping the single resulting ``Stream``.
        """
        sql = "SELECT org_id, org_name FROM {tb_test}".format(tb_test=TEST_TABLE)
        inp = MysqlInput(ENGINE_RD, sql)

        # The parentheses must be literal. An unescaped "(.*)" alternative is
        # a capture group that matches the whole string and would blank every
        # name. A raw string also avoids the invalid "\(" escape of a normal
        # literal.
        parenthesised = re.compile(r"（.*）|\(.*\)")

        vm1 = transform.ValueMap({
            OrgInfo.org_name.name: lambda x: parenthesised.sub("", x)
        })

        vm2 = transform.ValueMap({
            OrgInfo.org_name_py.name: (
                lambda x: "".join(
                    [part[0] for part in py(x, style=py_style.FIRST_LETTER)]
                ).upper(),
                OrgInfo.org_name.name
            )
        })

        sk = transform.MapSelectKeys({
            OrgInfo.org_id.name: None,
            OrgInfo.org_name_py.name: None
        })
        s = Stream(inp, transform=(vm1, vm2, sk))

        return Confluence(s)
예제 #9
0
def articleToObject(article, keymap):
    """Split *article* into per-item entries carrying pinyin information.

    ``py`` is called three times on *article* (styles ``ST.INITIALS``,
    ``ST.FINALS_TONE`` and ``ST.FINALS``), each time with ``parseOtherStr``
    as the ``errors`` handler.

    Args:
        article: Text to analyse; it is indexed position by position.
        keymap: Container tested with ``in``. An initial or final is only
            added to an entry's ``spell`` list when it is present here.

    Returns:
        ``{'article': [...]}``. Each list item is either
        ``{'word', 'tone', 'spell', 'canInput': True}`` or
        ``{'word', 'canInput': False}``. Items whose word matches the
        whitespace pattern are skipped.
    """
    lhTone = py(article, style=ST.INITIALS, errors=parseOtherStr)
    rhTone = py(article, style=ST.FINALS_TONE, errors=parseOtherStr)
    rhCh = py(article, style=ST.FINALS, errors=parseOtherStr)

    # Per position: the initial items followed by the toned-final items,
    # each group included only when its first element is non-empty.
    tone = []
    for pos, initial_items in enumerate(lhTone):
        combined = []
        if initial_items[0] != '':
            combined.extend(initial_items)
        if rhTone[pos][0] != '':
            combined.extend(rhTone[pos])
        tone.append(combined)

    # Per position: the initial and the (toneless) final themselves, kept
    # only when they appear in keymap. Earlier variants that stored
    # keymap[...] values or handled zero-initial schemes are not in use.
    keys = []
    for pos, initial_items in enumerate(lhTone):
        spelling = []
        if initial_items[0] in keymap:
            spelling.append(initial_items[0])
        if rhCh[pos][0] in keymap:
            spelling.append(rhCh[pos][0])
        keys.append(spelling)

    # Entries without pinyin are emitted without tone/spell, and whitespace
    # entries are dropped altogether.
    # NOTE(review): the first and third alternatives of this pattern look
    # identical - possibly a different whitespace character was intended.
    blank = re.compile('^ $|^\n$|^ $|^\t$')

    ret = {'article': []}
    entries = ret['article']
    cursor = 0  # index of the next unconsumed position in ``article``
    for syllable, spelling in zip(tone, keys):
        current = article[cursor]
        if syllable[0] != current[0]:
            # The first tone item differs from the text at this position.
            entries.append({
                'word': current,
                'tone': syllable,
                'spell': spelling,
                'canInput': True,
            })
            cursor += 1
            continue

        # Otherwise only the first half of the list is walked (it was filled
        # from two lists above), one text position per item.
        for raw in syllable[:len(syllable) // 2]:
            cursor += 1
            if blank.match(raw) is None:
                entries.append({'word': raw, 'canInput': False})
    return ret