def stream_020001_op():
    """Build the stream for person records of source ``020001``.

    The query joins ``crawl_private.d_org_person`` with its own
    ``MAX(version)`` per ``person_id`` (so only the latest version of each
    person is read) and with the active ``base.id_match`` rows for
    ``id_type = 3`` / ``source = '020001'``.

    Two transforms are attached: a ``ValueMap`` that fills the
    ``PersonInfo.person_name_py`` column from ``person_name``, and a
    ``MapSelectKeys`` that maps the result columns onto ``PersonInfo``
    field names.

    Returns:
        The ``base.Stream`` built from the MySQL input and both transforms.
    """
    query = (
        "SELECT im.matched_id, person_name FROM crawl_private.d_org_person tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_org_person GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020001' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE, query)

    def name_initials(name):
        # First element of every item returned by py(..., FIRST_LETTER),
        # concatenated and upper-cased.
        return "".join(
            part[0] for part in py(name, style=py_style.FIRST_LETTER)
        ).upper()

    # Tuple form is (function, column the function reads) -- presumably;
    # confirm against transform.ValueMap.
    add_initials = transform.ValueMap(
        OrderedDict([
            (PersonInfo.person_name_py.name, (name_initials, "person_name")),
        ])
    )

    # A value of None presumably keeps the column under its own name.
    select_columns = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        "person_name": PersonInfo.person_name.name,
        PersonInfo.person_name_py.name: None,
    })

    return base.Stream(source, transform=(add_initials, select_columns))
def stream_y_person_info(cls):
    """
    Clean y_person_info.

    Reads the person columns from ``crawl_private.y_person_info``, adds a
    ``person_name_py`` column computed from ``person_name`` (upper-cased
    first letters returned by ``py`` with ``FIRST_LETTER`` style) and hands
    an identity column mapping to ``MapSelectKeys``.

    Returns:
        The ``Stream`` built from the MySQL input and both transforms.
    """
    query = (
        "SELECT person_id, person_name, gender, background, education,"
        "graduate_school, investment_years FROM crawl_private.y_person_info"
    )
    source = MysqlInput(ENGINE_RD, query)

    def name_initials(name):
        return "".join(
            part[0] for part in py(name, style=py_style.FIRST_LETTER)
        ).upper()

    add_initials = transform.ValueMap({
        "person_name_py": (name_initials, "person_name"),
    })

    # NOTE(review): "education" is selected by the query but is not part of
    # this mapping -- confirm whether that is intentional.
    kept_columns = (
        "person_id",
        "person_name",
        "person_name_py",
        "gender",
        "background",
        "graduate_school",
        "investment_years",
    )
    select_columns = transform.MapSelectKeys(
        {column: column for column in kept_columns}
    )

    return Stream(source, transform=[add_initials, select_columns])
def stream_020003(cls):
    """
    Clean d_person_info.

    Reads the latest version of each person in
    ``crawl_private.d_person_info`` that has an active ``base.id_match`` row
    for ``id_type = 3`` / ``source = '020003'``.  String ``background``
    values are translated through ``cls.BACKGROUND`` (unknown strings become
    ``None``; non-string values pass through unchanged), and
    ``person_name_py`` is computed from ``person_name``.

    Returns:
        The ``Stream`` built from the MySQL input and both transforms.
    """
    query = (
        "SELECT im.matched_id, person_name, background FROM crawl_private.d_person_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_person_info GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020003' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE_RD, query)

    def normalize_background(value):
        # Only exact ``str`` instances are looked up; everything else
        # (e.g. None) is returned untouched.
        if type(value) is not str:
            return value
        return cls.BACKGROUND.get(value)

    def name_initials(name):
        return "".join(
            part[0] for part in py(name, style=py_style.FIRST_LETTER)
        ).upper()

    value_map = transform.ValueMap({
        "background": normalize_background,
        "person_name_py": (name_initials, "person_name"),
    })

    select_columns = transform.MapSelectKeys({
        "matched_id": "person_id",
        'background': 'background',
        'person_name': 'person_name',
        "person_name_py": "person_name_py",
    })

    return Stream(source, transform=[value_map, select_columns])
def stream_010001():
    """Build the stream for executive records of source ``010001``.

    Reads the latest version of each person in
    ``crawl_private.x_org_executive_info`` that has an active
    ``base.id_match`` row for ``id_type = 3`` / ``source = '010001'`` and
    derives, in this order:

    * the person name with any parenthesised annotation removed,
    * the upper-cased first-letter abbreviation of that cleaned name,
    * the qualification way, restricted to the two known labels (any other
      value becomes ``None``),
    * a 0/1 flag that is 1 when a known qualification way is present.

    Returns:
        The ``base.Stream`` built from the MySQL input and both transforms.
    """
    import re

    # Matches an annotation in full-width (U+FF08 / U+FF09) or ASCII
    # parentheses.  The former pattern "(.*)" was a bare capture group that
    # matched the whole value, so every name was replaced by "".
    annotation = re.compile(r"\uFF08.*\uFF09|\(.*\)")

    # Identity table used as a whitelist of accepted labels.
    known_ways = {"通过考试": "通过考试", "资格认定": "资格认定"}

    sql = (
        "SELECT im.matched_id, `name`, qualifying_way FROM crawl_private.x_org_executive_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.x_org_executive_info GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '010001' AND im.is_used = 1 "
    )
    inp = MysqlInput(ENGINE, sql)

    # Entries are ordered: the pinyin column names the cleaned person-name
    # column as its second tuple element, and the flag names the
    # qualification-way column (presumably the column each function reads --
    # confirm against transform.ValueMap).
    vm = transform.ValueMap(
        OrderedDict([
            (
                PersonInfo.person_name.name,
                (lambda x: annotation.sub("", x), "name"),
            ),
            (
                PersonInfo.person_name_py.name,
                (
                    lambda x: "".join(
                        part[0] for part in py(x, style=py_style.FIRST_LETTER)
                    ).upper(),
                    PersonInfo.person_name.name,
                ),
            ),
            (
                PersonInfo.fund_qualification_way.name,
                (lambda x: known_ways.get(x), "qualifying_way"),
            ),
            (
                PersonInfo.is_fund_qualification.name,
                (lambda x: int(bool(x)), PersonInfo.fund_qualification_way.name),
            ),
        ])
    )

    sk = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        PersonInfo.person_name.name: None,
        PersonInfo.person_name_py.name: None,
        PersonInfo.fund_qualification_way.name: None,
        PersonInfo.is_fund_qualification.name: None,
    })

    return base.Stream(inp, transform=(vm, sk))
def stream_020002():
    """Build the stream for person records of source ``020002``.

    Reads the latest version of each person in
    ``crawl_private.d_person_info`` that has an active ``base.id_match`` row
    for ``id_type = 3`` / ``source = '020002'``.  The person name is
    stripped of surrounding whitespace, its first-letter abbreviation is
    computed, and ``background`` is mapped onto a fixed label set where
    anything unknown becomes "其他".

    Returns:
        The ``base.Stream`` built from the MySQL input and both transforms.
    """
    query = (
        "SELECT im.matched_id, person_name, background FROM crawl_private.d_person_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.d_person_info GROUP BY person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '020002' AND im.is_used = 1 "
    )
    source = MysqlInput(ENGINE, query)

    # NOTE(review): both results below are discarded; this looks like
    # leftover data exploration.  Kept because touching ``dataframe`` may
    # trigger loading -- confirm before removing.
    source.dataframe
    source.dataframe["background"].drop_duplicates().tolist()

    # Accepted background labels; "其它" is normalised to "其他".
    background_labels = {
        '公募': '公募',
        '其它': "其他",
        '券商': "券商",
        '海外': "海外",
        "民间": "民间",
        "学者": "学者",
        '实业': "实业",
        '保险': "保险",
        '媒体': "媒体",
        '期货': '期货',
    }

    def trimmed(name):
        return name.strip()

    def name_initials(name):
        return "".join(
            part[0] for part in py(name, style=py_style.FIRST_LETTER)
        ).upper()

    def background_label(raw):
        return background_labels.get(raw, "其他")

    value_map = transform.ValueMap(
        OrderedDict([
            (PersonInfo.person_name.name, trimmed),
            (PersonInfo.person_name_py.name, (name_initials, "person_name")),
            (PersonInfo.background.name, (background_label, "background")),
        ])
    )

    select_columns = transform.MapSelectKeys({
        "matched_id": PersonInfo.person_id.name,
        "person_name": PersonInfo.person_name.name,
        PersonInfo.person_name_py.name: None,
        PersonInfo.background.name: None,
    })

    return base.Stream(source, transform=(value_map, select_columns))
def stream_010001(cls):
    """
    Clean x_org_executive_info.

    Reads the latest version of each person in
    ``crawl_private.x_org_executive_info`` that has an active
    ``base.id_match`` row for ``id_type = 3`` / ``source = '010001'``.

    The first ``ValueMap`` removes parenthesised annotations from ``name``,
    restricts string ``qualifying_way`` values to the two known labels (other
    strings become ``None``; non-strings pass through) and computes
    ``person_name_py`` from ``name``.  The second ``ValueMap`` derives the
    0/1 ``is_fund_qualification`` flag from ``qualifying_way``.  Finally the
    columns are renamed to the target field names.

    Returns:
        The ``Stream`` built from the MySQL input and the three transforms.
    """
    # Matches an annotation in full-width (U+FF08 / U+FF09) or ASCII
    # parentheses.  The former pattern "(.*)" was a bare capture group that
    # matched the whole value, so every name was replaced by "".
    annotation = re.compile(r"\uFF08.*\uFF09|\(.*\)")

    # Identity table used as a whitelist of accepted labels.
    known_ways = {"通过考试": "通过考试", "资格认定": "资格认定"}

    sql = (
        "SELECT im.matched_id, `name`, qualifying_way FROM crawl_private.x_org_executive_info tb_main "
        "JOIN (SELECT person_id, MAX(version) latest_ver FROM crawl_private.x_org_executive_info GROUP BY "
        "person_id) tb_latest "
        "ON tb_main.version = tb_latest.latest_ver AND tb_main.person_id = tb_latest.person_id "
        "JOIN base.id_match im ON im.source_id = tb_main.person_id "
        "AND im.id_type = 3 AND im.source = '010001' AND im.is_used = 1 "
    )
    inp = MysqlInput(ENGINE_RD, sql)

    # Key order is kept as in the original mapping: "name" is listed before
    # "person_name_py", which names "name" as its second tuple element.
    vm = transform.ValueMap({
        "name": (lambda x: annotation.sub("", x), "name"),
        "qualifying_way": (
            lambda x: known_ways.get(x) if isinstance(x, str) else x
        ),
        "person_name_py": (
            lambda x: "".join(
                part[0] for part in py(x, style=py_style.FIRST_LETTER)
            ).upper(),
            "name",
        ),
    })

    # Separate step so the flag is computed after vm in the transform list.
    vm2 = transform.ValueMap({
        "is_fund_qualification": (lambda x: int(bool(x)), "qualifying_way"),
    })

    sk = transform.MapSelectKeys({
        "matched_id": "person_id",
        'qualifying_way': 'fund_qualification_way',
        'name': 'person_name',
        "person_name_py": "person_name_py",
        "is_fund_qualification": "is_fund_qualification",
    })

    return Stream(inp, transform=[vm, vm2, sk])
def stream():
    """Build the stream that recomputes ``org_name_py`` for ``OrgInfo`` rows.

    Queries ``org_id``, ``org_name`` and ``org_name_py`` of ``OrgInfo``
    through a session bound to ``ENGINE_RD``, removes parenthesised parts
    from ``org_name``, computes the upper-cased first-letter abbreviation of
    the cleaned name and passes the ``org_id`` and ``org_name_py`` columns
    to ``MapSelectKeys``.

    Returns:
        The ``Stream`` built from the MySQL input and the three transforms.
    """
    # Matches a segment in full-width (U+FF08 / U+FF09) or ASCII parentheses.
    # The former pattern "(.*)|\(.*\)" started with a bare capture group that
    # matched the whole value (blanking every name) and used "\(" inside a
    # non-raw string, which is an invalid escape sequence.
    parenthesised = re.compile(r"\uFF08.*\uFF09|\(.*\)")

    session = dbsession(bind=ENGINE_RD)
    query = session.query(OrgInfo).with_entities(
        OrgInfo.org_id, OrgInfo.org_name, OrgInfo.org_name_py
    )
    inp = MysqlInput(session.bind, query)

    vm1 = transform.ValueMap({
        OrgInfo.org_name.name: lambda x: parenthesised.sub("", x),
    })
    # Placed after vm1 in the transform tuple; names org_name as the second
    # tuple element (presumably the column the function reads).
    vm2 = transform.ValueMap({
        OrgInfo.org_name_py.name: (
            lambda x: "".join(
                part[0] for part in py(x, style=py_style.FIRST_LETTER)
            ).upper(),
            OrgInfo.org_name.name,
        ),
    })
    sk = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.org_name_py.name: None,
    })

    return Stream(inp, transform=(vm1, vm2, sk))
def conflu_pinyin(cls):
    """Build the confluence that computes ``org_name_py``.

    Reads ``org_id`` and ``org_name`` from ``TEST_TABLE``, removes
    parenthesised parts from ``org_name``, computes the upper-cased
    first-letter abbreviation of the cleaned name and passes the ``org_id``
    and ``org_name_py`` columns to ``MapSelectKeys``.

    Returns:
        A ``Confluence`` wrapping the single resulting ``Stream``.
    """
    # Matches a segment in full-width (U+FF08 / U+FF09) or ASCII parentheses.
    # The former pattern "(.*)|\(.*\)" started with a bare capture group that
    # matched the whole value (blanking every name) and used "\(" inside a
    # non-raw string, which is an invalid escape sequence.
    parenthesised = re.compile(r"\uFF08.*\uFF09|\(.*\)")

    # org_name_py
    sql = "SELECT org_id, org_name FROM {tb_test}".format(tb_test=TEST_TABLE)
    inp = MysqlInput(ENGINE_RD, sql)

    vm1 = transform.ValueMap({
        OrgInfo.org_name.name: lambda x: parenthesised.sub("", x),
    })
    # Placed after vm1 in the transform tuple; names org_name as the second
    # tuple element (presumably the column the function reads).
    vm2 = transform.ValueMap({
        OrgInfo.org_name_py.name: (
            lambda x: "".join(
                part[0] for part in py(x, style=py_style.FIRST_LETTER)
            ).upper(),
            OrgInfo.org_name.name,
        ),
    })
    sk = transform.MapSelectKeys({
        OrgInfo.org_id.name: None,
        OrgInfo.org_name_py.name: None,
    })

    s = Stream(inp, transform=(vm1, vm2, sk))
    return Confluence(s)
def articleToObject(article, keymap):
    """Convert ``article`` into a list of per-character typing entries.

    ``py`` is called three times on ``article`` (initials, finals with tone,
    plain finals), each time with ``parseOtherStr`` as the ``errors``
    handler.  The i-th groups of the three results are combined into:

    * a "tone" list: the initial group (if its first item is non-empty)
      followed by the toned-final group (if its first item is non-empty);
    * a "spell" list: the first initial and the first plain final, each
      only when it is a key of ``keymap``.

    The groups are then walked in parallel with the characters of
    ``article``.  When the first tone item differs from the current
    character, one entry with ``word`` / ``tone`` / ``spell`` /
    ``canInput=True`` is emitted.  Otherwise the group is treated as a run
    of characters that need no input: the first half of the tone list
    (presumably the run appears twice because both the initial and the
    final pass returned it -- confirm against ``parseOtherStr``) yields one
    ``canInput=False`` entry per character, skipping characters matched by
    the whitespace pattern.

    Args:
        article: The text to convert; indexed character by character.
        keymap: Container tested with ``in`` to decide which initials and
            finals are recorded in ``spell``.

    Returns:
        ``{'article': [...]}`` with the entries described above.
    """
    initial_groups = py(article, style=ST.INITIALS, errors=parseOtherStr)
    toned_final_groups = py(article, style=ST.FINALS_TONE, errors=parseOtherStr)
    plain_final_groups = py(article, style=ST.FINALS, errors=parseOtherStr)

    # Tone description per group: initial part first, then the toned final.
    tone_lists = []
    for index, initial_group in enumerate(initial_groups):
        parts = []
        if initial_group[0] != '':
            parts.extend(initial_group)
        toned_group = toned_final_groups[index]
        if toned_group[0] != '':
            parts.extend(toned_group)
        tone_lists.append(parts)

    # Spell description per group.  The initial/final strings themselves are
    # stored (initial/final scheme); the alternative key scheme would store
    # keymap[...] of them instead.
    spell_lists = []
    for index, initial_group in enumerate(initial_groups):
        parts = []
        if initial_group[0] in keymap:
            parts.append(initial_group[0])
        if plain_final_groups[index][0] in keymap:
            parts.append(plain_final_groups[index][0])
        spell_lists.append(parts)

    # Entries for characters that need no input are dropped when they are a
    # single whitespace character (instead of being kept with empty fields).
    blank = re.compile('^ $|^\n$|^ $|^\t$')

    entries = []
    position = 0  # index of the current character within ``article``
    for tone_parts, spell_parts in zip(tone_lists, spell_lists):
        current = article[position]
        if tone_parts[0] != current[0]:
            entries.append({
                'word': current,
                'tone': tone_parts,
                'spell': spell_parts,
                'canInput': True,
            })
            position += 1
            continue

        # Only the first half of the list is walked, one character each.
        for char in tone_parts[:len(tone_parts) // 2]:
            position += 1
            if blank.match(char) is None:
                entries.append({'word': char, 'canInput': False})

    return {'article': entries}