def parseMember_save(source_company_id, item, download_crawler): logger.info("parseMember_save") companyKey = item["key"] d = pq(html.fromstring(item['content'].decode("utf-8"))) members = d('.startups-member') for m in members: name = d(m)('.media-heading').text() logger.info(name) desc = d(m)('.desc').text() position = d(m)('.title').text() logo = 'http:' + d(m)(".media-object").attr('src').replace( '@!logom', '') if logo.find('deafult') >= 0 or logo.find('default') >= 0: logo = None if logo: logo = logo.replace("https://", "http://") sourceId = d(m)('.media-body a').attr('href') if sourceId is not None: sourceId = str(companyKey) + '_' + sourceId.split( 'person/')[-1].strip() else: sourceId = str( companyKey) + '_' + kr36_company_parser_2.get_company_code( name) source_member = { "source": SOURCE, "sourceId": sourceId, "name": name, "photo_url": logo, "weibo": None, "location": 0, "role": position[:50], "description": desc, "education": None, "work": None } ptype = name_helper.position_check(position) source_company_member_rel = { "sourceCompanyId": source_company_id, "position": position[:50], "joinDate": None, "leaveDate": None, "type": ptype } try: parser_db_util.save_member_standard(source_member, download_crawler, source_company_member_rel) # logger.info(source_member) # logger.info(source_company_member_rel) except Exception, ex: logger.info("%s:%s", Exception, ex) exit()
# Variant: HTML team list ("ul.team-list"); source site unspecified.
def parseMember_save(source_company_id, item, download_crawler):
    if item is None:
        return None
    company_key = item["key"]
    html = item["content"]
    # logger.info(html)
    d = pq(html)
    members = []

    # members
    logger.info("*** member ****")
    lis = d('ul.team-list > li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('div.per-name > a').text().strip()
            member_key = l('div.per-name > a').attr("href").split("/")[-1]
            position = l('div.per-position').text().strip()
            logo = l('a.avatar > img').attr("src")
            desc = l('div.per-des').text().strip()
            logger.info("member_key: %s, member_name: %s, position: %s, desc: %s" %
                        (member_key, member_name, position, desc))

            source_member = {
                "source": SOURCE,
                "sourceId": str(member_key),
                "name": member_name,
                "photo_url": logo,
                "weibo": None,
                "location": 0,
                "role": position,
                "description": desc,
                "education": None,
                "work": None
            }
            # member = {
            #     "key": member_key,
            #     "name": member_name,
            #     "position": position
            # }
            ptype = name_helper.position_check(position)
            source_company_member_rel = {
                "sourceCompanyId": source_company_id,
                "position": position,
                "joinDate": None,
                "leaveDate": None,
                "type": ptype
            }
            parser_db_util.save_member_standard(
                source_member, download_crawler, source_company_member_rel)
            # members.append(member)
        except Exception, ex:
            logger.exception(ex)
def parseMember_save(source_company_id, item, download_crawler): company_key = item["sourceId"] logger.info("parseMember_save") if item.has_key("jqkaBrief") is False: return m = {"name": item["jqkaBrief"]["chairman"], "job": "董事长"} try: if m["name"] is None or m["name"].strip() == "": return position = m.get("job", "") if position.find("董事长") == -1: return logger.info("%s-%s", m["name"], position) source_member = { "source": SOURCE, "sourceId": str(company_key) + '_' + get_company_code(m["name"]), "name": m["name"], "photo_url": None, "weibo": None, "location": 0, "role": position, "description": None, "education": None, "work": None } # ptype = name_helper.position_check(position) ptype = 5010 source_company_member_rel = { "sourceCompanyId": source_company_id, "position": position, "joinDate": None, "leaveDate": None, "type": ptype } # try: logger.info( json.dumps(source_member, ensure_ascii=False, cls=util.CJsonEncoder)) logger.info( json.dumps(source_company_member_rel, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_member_standard(source_member, download_crawler, source_company_member_rel) pass except: pass
# Variant: Crunchbase JSON (current_employees); avatars served via Cloudinary.
def parseMember_save(source_company_id, item, download_crawler):
    name = item['name']
    logger.info('parseMember_save:%s' % name)
    members = item['content']['member']['current_employees']
    for m in members:
        # logger.info('*******%s' % m)
        person = m.get('person_identifier', '')
        name = person.get('value', '')
        # logger.info('name:%s', name)
        uuid = person.get('uuid', '')
        desc = None
        position = m.get('title', '')
        # logger.info('position:%s', position)
        logo = person.get('image_id', '')
        if logo:
            logo = 'https://crunchbase-production-res.cloudinary.com/image/upload/c_thumb,h_200,w_200,f_auto,g_faces,z_0.7,b_white,q_auto:eco/%s' % logo
        # logger.info('logo:%s', logo)

        source_member = {
            "source": SOURCE,
            "sourceId": uuid,
            "name": name,
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": desc,
            "education": None,
            "work": None
        }

        ptype = name_helper.crunchbase_position_check(position)
        # logger.info('ptype:%s', ptype)
        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        logger.info(json.dumps(source_member, ensure_ascii=False, indent=2))
        logger.info(json.dumps(source_company_member_rel, ensure_ascii=False,
                               indent=2))
        try:
            parser_db_util.save_member_standard(
                source_member, download_crawler, source_company_member_rel)
        except:
            pass
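# A hypothetical sketch of the item shape the Crunchbase variant above
# consumes. Field names are taken from the code; all values are invented:
_sample_crunchbase_item = {
    "name": "ExampleCo",
    "content": {
        "member": {
            "current_employees": [
                {
                    "person_identifier": {
                        "value": "Alice Example",
                        "uuid": "00000000-0000-0000-0000-000000000000",
                        "image_id": "v1/abcdef",
                    },
                    "title": "CEO & Co-Founder",
                },
            ]
        }
    },
}
# parseMember_save(source_company_id, _sample_crunchbase_item, download_crawler)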
# Variant: pre-extracted member list; the caller supplies the member type.
# Note: the `type` parameter shadows the built-in; kept for API compatibility.
def parseMember_save(source_company_id, type, members, download_crawler):
    logger.info("parseMember_save")
    for m in members:
        if not m.has_key("name"):
            continue
        logger.info(m["name"])
        desc = m.get("intro")
        member_type = type_map.get(m.get("type"), "")
        position = m.get("position", "")
        # Titles longer than 20 characters are folded into the description
        # rather than kept as positions (see the sketch after this function).
        if len(position) > 20:
            if desc is None:
                desc = position
            else:
                desc += '\n' + position
            position = member_type
        else:
            position = member_type + position
        logo = m.get("avatar")
        if logo:
            logo = logo.replace("https://", "http://")

        source_member = {
            "source": SOURCE,
            "sourceId": str(m["id"]),
            "name": m["name"],
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": None,
            "description": desc,
            "education": None,
            "work": None
        }
        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": type
        }
        try:
            parser_db_util.save_member_standard(
                source_member, download_crawler, source_company_member_rel)
        except:
            pass
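# The 20-character rule above, restated as a small pure function (same
# behavior, easier to test in isolation; member_type stands in for the
# type_map lookup):
def _split_position(position, desc, member_type):
    if len(position) > 20:
        desc = position if desc is None else desc + '\n' + position
        return member_type, desc
    return member_type + position, desc

# _split_position(u"CEO", None, u"founder ")
#   -> (u"founder CEO", None)
# _split_position(u"Chief Executive Officer & Chairman", None, u"founder ")
#   -> (u"founder ", u"Chief Executive Officer & Chairman")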
def parseMember_save(source_company_id, item, download_crawler): logger.info("parseMember_save") members = item["content"]["member"]["data"]["members"] for m in members: if not m.has_key("name"): continue logger.info(m["name"]) desc = m.get("intro") position = m.get("position", "") logo = m.get("avatar") if logo: logo = logo.replace("https://", "http://") source_member = { "source": SOURCE, "sourceId": str(m["id"]), "name": m["name"], "photo_url": logo, "weibo": None, "location": 0, "role": position, "description": desc, "education": None, "work": None } ptype = name_helper.position_check(position) source_company_member_rel = { "sourceCompanyId": source_company_id, "position": position, "joinDate": None, "leaveDate": None, "type": ptype } try: parser_db_util.save_member_standard(source_member, download_crawler, source_company_member_rel) # logger.info(source_member) # logger.info(source_company_member_rel) except: pass
# Variant: HTML manager list (".manager_list"); source site unspecified.
# download_crawler is used in the body, so it belongs in the signature.
def parseMember_save(source_company_id, item, download_crawler):
    if item is None:
        return
    logger.info("*** member ***")
    html = item["content"]
    d = pq(html)
    lis = d('.manager_list > li')
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # Normalize protocol-relative avatar URLs ("//...").
                if not logo_url.startswith("http"):
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p.item_manager_name > span').text()
                member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p.item_manager_title').text()
                member_desc = mem('div.item_manager_content').text()
                # print member_position
                # print member_name
                # print member_desc
                weibo = None
                if member_link is not None and 'weibo.com' in member_link:
                    weibo = member_link

                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': weibo,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None,
                    'source': SOURCE,
                    'sourceId': member_key,
                }
                ptype = name_helper.position_check(member_position)
                source_company_member_rel = {
                    'sourceCompanyId': source_company_id,
                    'position': member_position,
                    'joinDate': None,
                    'leaveDate': None,
                    'type': ptype
                }
                logger.info(json.dumps(source_member, ensure_ascii=False,
                                       cls=util.CJsonEncoder))
                parser_db_util.save_member_standard(
                    source_member, download_crawler, source_company_member_rel)
            except Exception, ex:
                logger.exception(ex)
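# name_helper.position_check is a shared project helper (implementation not
# shown). A hypothetical stand-in, assuming unicode input: the only member-
# type code visible in this file is 5010, hard-coded above for "董事长"
# (chairman); the 0 fallback is an invented placeholder, not the real default.
def _position_check_stub(position):
    if position and (u"董事长" in position or u"chairman" in position.lower()):
        return 5010
    return 0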