예제 #1
0
def parse_companyjobs(source_company_id, item, sourceId):
    """Parse the job entries of one crawled company page into job records.

    Args:
        source_company_id: Id of the recruiting company; stored stringified
            under ``recruit_company_id`` on every record.
        item: Crawled page; ``item["content"]`` is the UTF-8 encoded HTML.
        sourceId: Not used by this parser; kept so the signature stays
            unchanged for callers.

    Returns:
        A list with one dict per ``.job-info`` element that carries a link.
    """
    source_jobs = []
    logger.info("source_company_id is %s", source_company_id)

    d = pq(html.fromstring(item["content"].decode("utf-8")))
    for li in d('.job-info'):
        dj = pq(li)
        job_link = dj('a.title').attr("href")
        if not job_link:
            # Without a link there is no job key; skip the entry instead of
            # aborting the whole page with an AttributeError.
            logger.info("job entry without link, skipped")
            continue
        job_key = job_link.split("/")[-1].replace(".shtml", "")
        born_time = dj('time').attr('title')
        position = dj('a.title').attr("title")

        # ".condition" spans are read as: 1 = salary, 2 = city,
        # 3 = education, 4 = work experience.
        city = dj('.condition span:nth-child(2)').text().split('-')[0]
        work_year = dj('.condition span:nth-child(4)').text()
        education = dj('.condition span:nth-child(3)').text()
        logger.info("%s - %s - %s", city, work_year, education)

        salary = dj('.condition span:nth-child(1)').text()

        domain = 0

        location_id = 0
        location_new = parser_db_util.get_location(city)
        if location_new is not None:
            location_id = location_new["locationId"]

        education_type = 0
        if '大专' in education:
            education_type = 6020
        elif '本科' in education:
            education_type = 6030
        elif '硕士' in education:
            education_type = 6040
        elif '博士' in education:
            education_type = 6050

        workYear_type = 7000
        if '应届' in work_year:
            workYear_type = 7010
        elif '1年以下' in work_year:
            workYear_type = 7020
        elif '1年以上' in work_year or '2年以上' in work_year:
            workYear_type = 7030
        elif '3年以上' in work_year:
            workYear_type = 7040
        elif '5年以上' in work_year or '6年以上' in work_year or '8年以上' in work_year:
            workYear_type = 7050
        elif '10年以上' in work_year:
            workYear_type = 7060

        logger.info("born time: %s", born_time)
        try:
            born_time = born_time.replace("发布于", "")
            if born_time.find("月") >= 0:
                logger.info("neeewdate:%s", born_time)
                update_time = datetime.datetime.strptime(
                    born_time, "%Y年%m月%d日")
            else:
                update_time = datetime.datetime.now()
        except Exception as e:
            # Best effort: a missing or unparsable date falls back to "now".
            logger.info(e)
            update_time = datetime.datetime.now()

        source_job = {
            "source": SOURCE,
            "sourceId": job_key,
            "recruit_company_id": str(source_company_id),
            "position": position,
            "salary": salary,
            "description": None,
            "domain": domain,
            "locationId": location_id,
            "educationType": education_type,
            "workYearType": workYear_type,
            "startDate": update_time,
            "updateDate": update_time,
            "jobNature": None,
            "positionAdvantage": None,
            "companyLabelList": None,
            "financeStage": None,
            "district": None,
        }
        logger.info("job_key: %s", job_key)
        for field_name in source_job:
            logger.info("%s->%s", field_name, source_job[field_name])

        source_jobs.append(source_job)

    # The collected records were previously built but never handed back.
    return source_jobs
예제 #2
0
def parse_investor(item):
    """Build an investor record from a crawled item.

    Reads ``item["key"]`` and ``item["content"]["basic"]["data"]``.

    Returns:
        ``{"wrong": 1, "key": ...}`` when both the short name and the full
        name are blank; otherwise a dict with the investor's names, links,
        establishment date and the list of usable addresses.
    """
    logger.info("parse_investor")
    investor_key = item["key"]

    c = item["content"]["basic"]["data"]

    establish_date = None
    if "startDate" in c:
        # startDate is divided by 1000 before being handed to time.localtime.
        d = time.localtime(c["startDate"] / 1000)
        if d.tm_year > 1980:
            establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    addresses = []
    for ad in c.get("addresses", []):
        address1 = ad.get("city")
        address2 = ad.get("address")

        # Resolve the location from the city first; fall back to the street
        # address only while no location id has been found.
        location_id = 0
        for candidate in (address1, address2):
            if _investor_blank_to_none(candidate) is None:
                continue
            location = parser_db_util.get_location(formCityName(candidate))
            if location is not None:
                location_id = location["locationId"]
            if location_id != 0:
                break

        phone = _investor_blank_to_none(ad.get("phone"))
        email = _investor_blank_to_none(ad.get("email"))
        # Keep the entry only if it carries at least one usable contact field.
        if (_investor_blank_to_none(address2) is not None
                or phone is not None or email is not None):
            addresses.append({
                "locationId": location_id,
                "address": address2,
                "phone": phone,
                "email": email,
            })

    name = _investor_blank_to_none(c["nameAbbr"])
    fullName = _investor_blank_to_none(c["name"])
    if name is None and fullName is None:
        logger.info("*******************wrong thing")
        return {
            "wrong": 1,
            "key": investor_key
        }

    return {
        # The short name falls back to the full name when blank.
        "name": fullName if name is None else name,
        "fullName": fullName,
        "description": _investor_blank_to_none(c.get("intro")),
        "website": _investor_blank_to_none(c.get("website")),
        "logo": _investor_blank_to_none(c.get("logo")),
        "source": SOURCE,
        "sourceId": str(investor_key),
        "wechatId": _investor_blank_to_none(c.get("weixin")),
        "weibo": _investor_blank_to_none(c.get("weibo")),
        "enName": _investor_blank_to_none(c.get("enNameAbbr")),
        "enFullName": _investor_blank_to_none(c.get("enName")),
        "establishDate": establish_date,
        "addresses": addresses
    }


def _investor_blank_to_none(value):
    """Return *value* unchanged, or None if it is None or whitespace-only."""
    if value is None or value.strip() == "":
        return None
    return value
예제 #3
0
def parse_company(item):
    """Build a company record, including product artifacts, from an item.

    Reads ``item["postdata"]["id"]``, ``item["data"]["basic"]``, the optional
    ``item["data"]["productinfos"]`` list and, when a type-4010 link is
    found, ``item["url"]``.

    Returns:
        ``{"status": "No_Name"}`` when the product name or the company full
        name is missing; otherwise the company dict with its ``artifacts``.
    """
    logger.info("parse_company")
    company_key = item["postdata"]["id"]

    # company basic info
    c = item["data"]["basic"]

    tags_str = c["tags"].replace("|", ",")

    logo = c["icon"]
    if "product_default.png" in logo:
        # The placeholder image is treated as "no logo".
        logo = None

    establish_date = None
    if "open_time" in c:
        try:
            establish_date = datetime.datetime.strptime(c["open_time"], "%Y-%m-%d")
        except (TypeError, ValueError):
            # Missing or malformed date: leave establish_date unset.
            pass

    # Resolve the location from the city first, then from the province.
    location_id = 0
    for place in (c.get("city"), c.get("province")):
        if place is None or place.strip() == "":
            continue
        location = parser_db_util.get_location(place)
        if location is not None:
            location_id = location["locationId"]
        if location_id != 0:
            break

    fullName = c["company"]
    if fullName is None or fullName.strip() == "":
        fullName = None
    else:
        fullName = fullName.replace("_", "")
        # Cut everything after the last occurrence of u"公司".
        idx = fullName.rfind(u"公司")
        if idx != -1:
            fullName = fullName[:(idx + len(u"公司"))]
        fullName = name_helper.company_name_normalize(fullName)

    name = c["product"]
    desc = c["desc"].strip() if "desc" in c else ""
    brief = c["yewu"].strip() if "yewu" in c else ""

    if name is None or fullName is None:
        return {
            "status": "No_Name",
        }

    # Collect candidate links in their original priority order.
    candidates = [c[k] for k in ("gw_link", "source_gw_link") if k in c]
    if "productinfos" in item["data"]:
        for pi in item["data"]["productinfos"]:
            if "link" in pi:
                candidates.append(pi["link"])

    websites = []
    for link in candidates:
        if link.strip() != "" and link not in websites:
            websites.append(link)

    def make_artifact(artifact_type, link, domain):
        return {
            "type": artifact_type,
            "name": name,
            "description": brief,
            "link": link,
            "domain": domain
        }

    artifacts = []
    for website in websites:
        market_type, app_market, app_id = url_helper.get_market(website)
        if market_type == 4010:
            if item["url"] != website and website.find("qimingpian.com") == -1:
                flag, domain = url_helper.get_domain(website)
                if flag is not None:
                    # A False flag keeps the artifact but drops its domain.
                    artifacts.append(make_artifact(
                        4010, website, None if flag is False else domain))
        elif market_type == 4020 or market_type == 4030:
            # The original code forced the domain to None here, so these
            # types never produced an artifact; that behavior is preserved.
            pass
        elif market_type == 4040:
            if app_id is not None:
                artifacts.append(make_artifact(4040, website, app_id))
        elif market_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_db_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is not None:
                artifacts.append(make_artifact(4050, website, domain))

    return {
        "name": name,
        "fullName": fullName,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "artifacts": artifacts,
    }
예제 #4
0
def parse_company(item):
    """Build a company record from an item holding ``baseinfo`` data.

    Reads ``item["sourceId"]`` and ``item["baseinfo"]``, plus the optional
    ``item["memectBrief"]`` and ``item["jqkaBrief"]`` sections for the
    description and brief.

    Returns:
        The company dict. ``description`` and ``brief`` are None when neither
        optional section provides a usable value.
    """
    logger.info("parse_neeq_stock")
    company_key = item["sourceId"]

    # company basic info
    c = item["baseinfo"]

    field = c.get("industry", None)

    result = parser_db_util.get_location(c["area"].replace("市", ""))
    location_id = result["locationId"] if result is not None else None

    fullName = c["name"]
    logger.info("parsing :%s|%s", fullName, company_key)

    # The lookups below are best-effort: the sections are optional, so any
    # ordinary failure leaves the value unset. `except Exception` (instead of
    # a bare except) lets KeyboardInterrupt/SystemExit propagate.
    desc = None
    try:
        desc = item["memectBrief"][u"公司简介"]
    except Exception:
        pass

    if desc is None:
        try:
            desc = item["jqkaBrief"]["desc"]
        except Exception:
            pass

    brief = None
    try:
        # Only the first sentence is kept, and only if it is short enough.
        brief = item["memectBrief"][u"主营产品"].split("。")[0]
        logger.info(len(brief))
        if len(brief) > 99:
            brief = None
    except Exception:
        pass

    if brief is None:
        try:
            brief = item["jqkaBrief"]["brief"].split("。")[0]
            logger.info(len(brief))
            if len(brief) > 99:
                brief = None
            if brief is not None and brief.strip() == "-":
                brief = None
        except Exception:
            pass

    # The short name is stored with all ASCII digits removed.
    short_name = c["shortname"]
    for digit in "0123456789":
        short_name = short_name.replace(digit, "")

    phone = c.get("phone")
    if phone is not None:
        phone = str(phone)
        if phone.strip() == "":
            phone = None

    return {
        "name": short_name,
        "fullName": fullName.strip(),
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": c.get("address", None),
        "phone": phone,
        "establishDate": None,
        "logo": None,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "englishName": c.get("englishName")
    }
예제 #5
0
def parse_company(item):
    """Build a company record from a crawled HTML page.

    Reads ``item["content"]`` (UTF-8 encoded HTML) and ``item["key"]``.

    Returns:
        The company dict, or ``0`` when the page name does not pass
        ``name_helper.name_check`` as a company and no website link was
        found. When the name fails the check but a website exists, the
        normalized name is stored under ``fakeName`` and ``fullName`` is None.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # company basic info: de-duplicated tags, original order kept
    tags = []
    for tag in d('.word_list').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    logo = d('.peoimg img').attr('src')
    if logo:
        logo = logo.replace("https://", "http://")

    establish_date = None
    time_content = d('.time_content li:last-child')
    # `in` also matches the keyword at index 0, which `find(...) > 0` missed.
    if '成立' in d(time_content)('.upword').text():
        try:
            establish_date = datetime.datetime.strptime(
                d(time_content)('.time_up').text(), '%Y-%m-%d')
        except ValueError as e:
            # A malformed date should not abort parsing the whole company.
            logger.info(e)
            establish_date = None

    companyName = d('.company_div h5').text()
    city = name_helper.get_location_from_company_name(companyName)[0]
    location_id = 0
    if city is not None:
        location = parser_db_util.get_location(city)
        if location is not None:
            location_id = location["locationId"]

    fullName = companyName.replace("_", "")
    fullName = name_helper.company_name_normalize(fullName)

    desc = d('#intro_srocll p').text()
    website = ''
    product_lines = []
    for p in d('.procont_lis p'):
        node = d(p)
        text = node.text()
        href = node('a').attr('href')
        # A paragraph naming the official site with a link supplies the
        # website; every other paragraph goes into the product description.
        if '官网' in text and href is not None:
            website = href
            continue
        product_lines.append(text + '\n')
    productDesc = ''.join(product_lines)

    if not desc:
        desc = productDesc

    # NOTE(review): the repeated ':' split is kept from the original;
    # presumably one of them was a full-width colon -- confirm.
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]

    companyResult = {}
    # name_check's return types are not visible here, so the explicit
    # comparisons against True/False are preserved.
    if name_helper.name_check(companyName)[1] == True:
        if name_helper.name_check(shortName)[0] != False:
            # Use the company name when the short name shares at most two
            # characters with it.
            cnt = sum(1 for s in shortName if s in companyName)
            if cnt <= 2:
                shortName = companyName
    else:
        if not website:
            return 0
        companyResult['fakeName'] = fullName
        fullName = None

    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })

    return companyResult
예제 #6
0
def parse_company(item):
    """Build a company record from an item holding ``company_base`` data.

    Reads ``item["name"]``, ``item["key"]`` and
    ``item["content"]["company_base"]`` (``tags``, ``logo``,
    ``overview_fields``, ``locations``, ``description``).

    Returns:
        The company dict; ``fullName`` is always None for this source.
    """
    name = item['name']
    logger.info('parse_company:%s', name)

    c = item['content']['company_base']
    company_key = item['key']

    tags_str = ','.join([tag['value'] for tag in (c['tags'] or [])])

    logo = c['logo']
    if logo:
        logo = "https://crunchbase-production-res.cloudinary.com/image/upload/c_lpad,h_100,w_100,f_auto,b_white,q_auto:eco/%s" % logo

    establish_date = None
    if 'founded_on' in c['overview_fields']:
        founded_on = c['overview_fields']['founded_on']['value']
        # An empty or missing value now yields None instead of leaking the
        # falsy raw value into the record.
        if founded_on:
            # A real list is needed for indexing; map() returns an iterator
            # on Python 3.
            parts = [int(piece) for piece in founded_on.split('-')]
            if parts[0] > 1980:
                establish_date = datetime.datetime(parts[0], parts[1], parts[2])

    location_names = []
    if 'location_identifiers' in c['locations']:
        locations = c['locations']['location_identifiers']
        if locations:
            location_names = [locat['value'] for locat in locations]
    locations_str = ','.join(location_names)

    # 421 is the id used when the lookup below finds nothing
    # (its meaning is not visible in this function).
    location_id = 421
    country = '国外'
    if locations_str.lower().find('china') >= 0:
        country = '中国'
    location = parser_db_util.get_location(country)
    if location is not None:
        location_id = location["locationId"]

    # The brief is read from the "locations" section, as in the original.
    brief = c['locations'].get('short_description')

    desc = c['description'].get('description', "")

    return {
        "name": name,
        "fullName": None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }
예제 #7
0
파일: liepin.py 프로젝트: yujiye/Codes
def process(crawler_job, key, content, source_company_id):
    """Parse a JSON job listing and save the resulting job records.

    Args:
        crawler_job: Not used by this function.
        key: Not used by this function; each job's key is derived from
            its URL.
        content: JSON text whose ``data.list`` holds the positions.
        source_company_id: Id of the recruiting company; stored stringified
            under ``recruit_company_id`` on every record.

    Returns:
        None. Non-empty results are passed to ``save_job_mongo``.
    """
    j = json.loads(content)
    positions = j['data']['list']
    domain = 0

    jobs = []
    for p in positions:
        logger.info(p['url'])
        # The job key is the URL part between "job/" and ".shtml".
        job_key = p['url'].split('job/')[-1].split('.shtml')[0]

        location_id = 0
        location_new = parser_db_util.get_location(p['city'].split('-')[0])
        if location_new is not None:
            location_id = location_new["locationId"]

        education = p['eduLevel']
        education_type = 0
        if '大专' in education:
            education_type = 6020
        elif '本科' in education:
            education_type = 6030
        elif '硕士' in education:
            education_type = 6040
        elif '博士' in education:
            education_type = 6050

        work_year = p['workYear']
        workYear_type = 7000
        if '应届' in work_year:
            workYear_type = 7010
        elif '1年以下' in work_year:
            workYear_type = 7020
        elif '1年以上' in work_year or '2年以上' in work_year:
            workYear_type = 7030
        elif '3年以上' in work_year:
            workYear_type = 7040
        elif '5年以上' in work_year or '6年以上' in work_year or '8年以上' in work_year:
            workYear_type = 7050
        elif '10年以上' in work_year:
            workYear_type = 7060

        update_time = datetime.datetime.strptime(p['time'], "%Y年%m月%d日")

        source_job = {
            "source": SOURCE,
            "sourceId": job_key,
            "recruit_company_id": str(source_company_id),
            "position": p['title'],
            "salary": p['salary'],
            "description": None,
            "domain": domain,
            "locationId": location_id,
            "educationType": education_type,
            "workYearType": workYear_type,
            "startDate": update_time,
            "updateDate": update_time,
            "jobNature": None,
            "positionAdvantage": None,
            "companyLabelList": None,
            "financeStage": None,
            "district": None,
        }

        logger.info(json.dumps(source_job, ensure_ascii=False, cls=util.CJsonEncoder))
        jobs.append(source_job)

    logger.info(len(jobs))
    if jobs:
        save_job_mongo(jobs)
예제 #8
0
def parse_company(item):
    """Build a company record from ``company_base.data.company``.

    Reads ``item["key"]`` and ``item["content"]["company_base"]["data"]``
    (``company`` and ``tags``).

    Returns:
        ``{"status": "INIT"}`` when the company status is ``"INIT"``;
        otherwise the company dict, which includes the ``status`` value.
    """
    logger.info("parse_company")
    company_key = item["key"]

    # company basic info
    c = item["content"]["company_base"]["data"]["company"]
    # check if page is under development or is completed(CREATED)
    if c["status"] == "INIT":
        return {
            "status": c["status"],
        }

    tags = item["content"]["company_base"]["data"]["tags"]
    tags_str = ",".join([tag["name"] for tag in tags])

    logo = c["logo"]
    if logo:
        logo = logo.replace("https://", "http://")

    establish_date = None
    if "startDate" in c:
        # startDate is divided by 1000 before being handed to time.localtime.
        d = time.localtime(c["startDate"] / 1000)
        if d.tm_year > 1980:
            establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    # address2 is tried first; address1 only while no location was found.
    location_id = 0
    for code in (c.get("address2"), c.get("address1")):
        if code is None:
            continue
        city = kr36_cities.get(str(code), None)
        if city is not None:
            location = parser_db_util.get_location(formCityName(city))
            if location is not None:
                location_id = location["locationId"]
        if location_id != 0:
            break

    fullName = c["fullName"]
    fullName = fullName.replace("_", "")
    # Cut everything after the last occurrence of u"公司".
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)

    def stripped(field_key, default=None):
        # Stripped value of an optional field, or `default` when absent.
        return c[field_key].strip() if field_key in c else default

    productDesc = stripped("projectAdvantage")
    operationDesc = stripped("dataLights")
    modelDesc = stripped("projectPlan")
    compititorDesc = stripped("competitor")
    desc = stripped("intro", "")
    teamDesc = stripped("story")

    return {
        "status": c["status"],
        "name": c["name"],
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": modelDesc,
        "operationDesc": operationDesc,
        "teamDesc": teamDesc,
        "marketDesc": None,
        "compititorDesc": compititorDesc,
        "advantageDesc": None,
        "planDesc": None,
        "brief": c["brief"],
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": c.get("industry"),
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None
    }
예제 #9
0
def parse_company(item):
    """Build a company record from ``company_base.data``.

    Reads ``item["key"]`` and ``item["content"]["company_base"]["data"]``,
    including ``industryTag``, the optional ``companyIntroduce`` section and
    the ``scale`` head-count string.

    Returns:
        The company dict. ``headCountMin``/``headCountMax`` are None when the
        scale string is blank or cannot be parsed.
    """
    logger.info("parse_company")
    company_key = item["key"]

    # company basic info
    c = item["content"]["company_base"]["data"]

    tags_str = ",".join([tag["name"] for tag in c["industryTag"]])

    logo = c["logo"]
    if logo:
        logo = logo.replace("https://", "http://")

    establish_date = None
    if "startDate" in c:
        # startDate is divided by 1000 before being handed to time.localtime.
        d = time.localtime(c["startDate"] / 1000)
        if d.tm_year > 1980:
            establish_date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday)

    # address2 is tried first; address1 only while no location was found.
    location_id = 0
    for code in (c.get("address2"), c.get("address1")):
        if code is None:
            continue
        city = kr36_cities.get(str(code), None)
        if city is not None:
            location = parser_db_util.get_location(formCityName(city))
            if location is not None:
                location_id = location["locationId"]
        if location_id != 0:
            break

    fullName = c["fullName"]
    fullName = fullName.replace("_", "")
    # Cut everything after the last occurrence of u"公司".
    idx = fullName.rfind(u"公司")
    if idx != -1:
        fullName = fullName[:(idx + len(u"公司"))]
    fullName = name_helper.company_name_normalize(fullName)

    productDesc = None
    marketDesc = None
    if "companyIntroduce" in c:
        introduce = c["companyIntroduce"]
        product_service = introduce["productService"]
        if product_service is not None and product_service.strip() != "":
            productDesc = product_service
        user_market = introduce["userMarket"]
        if user_market is not None and user_market.strip() != "":
            marketDesc = user_market

    desc = c["intro"].strip() if "intro" in c else ""

    # Head count: "少于15" maps to 1..15, "a-b" to a range, and a single
    # number to a minimum only.
    headCount = c["scale"].replace("人", "")
    min_staff = None
    max_staff = None
    if headCount.strip() != "":
        if headCount == "少于15":
            min_staff = 1
            max_staff = 15
        else:
            staffarr = headCount.split('-')
            try:
                # int() ignores surrounding whitespace itself.
                min_staff = int(staffarr[0])
                if len(staffarr) > 1:
                    max_staff = int(staffarr[1])
            except ValueError:
                # Unparsable text: keep whatever was parsed so far.
                pass

    return {
        "name": c["name"],
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": productDesc,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": marketDesc,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": c["brief"],
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": min_staff,
        "headCountMax": max_staff
    }
예제 #10
0
def parse_companyjobs(source_company_id, item, sourceId):
    """Build source-job records from the crawled job-list JSON of one company.

    ``item["content"]`` is handled in two layouts:

    * version 2 -- a dict carrying ``"version": 2`` whose other keys are
      position-type labels, each mapping to a list of result pages. Jobs
      whose ``companyId`` (as a string) differs from ``sourceId`` are
      skipped, the domain code comes from the position-type label and
      ``startDate`` is always None.
    * anything else -- a single result page; the domain code comes from
      each job's ``positionFirstType`` and ``startDate`` from ``bornTime``.

    Args:
        source_company_id: stored as ``sourceCompanyId`` on every record.
        item: crawled document; only ``item["content"]`` is read.
        sourceId: company id compared with ``str(job["companyId"])`` in the
            version 2 layout; also used in the summary log line.

    Returns:
        list of dicts with the keys sourceId, sourceCompanyId, position,
        salary, description, domain, locationId, educationType,
        workYearType, startDate and updateDate.
    """
    # The label tables are scanned with ``==`` instead of being used as dict
    # keys so that string comparison behaves exactly like an if/elif chain,
    # whatever mix of byte and unicode strings the crawler delivers.
    domain_codes = (
        ('技术', 15010),
        ('产品', 15020),
        ('设计', 15030),
        ('运营', 15040),
        ('市场与销售', 15050),
        ('职能', 15060),
        ('金融', 15070),
    )
    education_codes = (
        ('大专', 6020),
        ('本科', 6030),
        ('硕士', 6040),
        ('博士', 6050),
    )
    work_year_codes = (
        ('应届毕业生', 7010),
        ('1年以下', 7020),
        ('1-3年', 7030),
        ('3-5年', 7040),
        ('5-10年', 7050),
        ('10年以上', 7060),
    )

    def code_for(value, table, default):
        # First code whose label equals ``value``; ``default`` when none does.
        for label, code in table:
            if value == label:
                return code
        return default

    def dated(stamp):
        # A value without '-' carries no date part, so today's date is
        # prepended. None is checked first and passed through untouched
        # (testing ``'-' not in None`` would raise TypeError).
        if stamp is None or '-' in stamp:
            return stamp
        today = time.strftime("%Y-%m-%d", time.localtime())
        return today + ' ' + stamp.strip()

    def build_job(job_content, city, domain, born_time):
        # Assemble one record from a raw job dict plus the values that the
        # two layouts derive differently.
        job_key = job_content["positionId"]

        location_id = 0
        location_new = parser_db_util.get_location(city)
        if location_new is not None:
            location_id = location_new["locationId"]

        source_job = {
            "sourceId": job_key,
            "sourceCompanyId": source_company_id,
            "position": job_content.get("positionName"),
            "salary": job_content["salary"],
            "description": None,
            "domain": domain,
            "locationId": location_id,
            "educationType": code_for(
                job_content.get("education"), education_codes, 0),
            "workYearType": code_for(
                job_content["workYear"], work_year_codes, 7000),
            "startDate": born_time,
            "updateDate": dated(job_content["createTime"]),
        }
        logger.info("job_key: %s", job_key)
        return source_job

    source_jobs = []
    logger.info("source_company_id is %s", source_company_id)

    content = item["content"]
    if "version" in content and content["version"] == 2:
        logger.info("version 2!!!!")
        for position_type in content:
            if position_type == "version":
                continue
            domain = code_for(position_type, domain_codes, 0)
            for jobpage in content[position_type]:
                jobs = jobpage['content']['data']['page']['result']
                logger.info("%s has %s jobs", position_type, len(jobs))
                for job_content in jobs:
                    if sourceId != str(job_content["companyId"]):
                        logger.info("sourceId is not correct")
                        continue
                    source_jobs.append(
                        build_job(job_content, job_content.get("city"),
                                  domain, None))
    else:
        for job_content in content['content']['data']['page']['result']:
            born_time = dated(job_content.get("bornTime"))
            domain = code_for(
                job_content["positionFirstType"], domain_codes, 0)
            source_jobs.append(
                build_job(job_content, job_content["city"], domain,
                          born_time))

    logger.info(
        json.dumps(source_jobs, ensure_ascii=False, cls=util.CJsonEncoder))
    logger.info("scid %s, sourceId %s currently has %s jobs",
                source_company_id, sourceId, len(source_jobs))
    return source_jobs
예제 #11
0
def parse_company(item):
    """Extract a source-company record from a crawled company profile page.

    Args:
        item: crawled document; ``item["key"]`` becomes ``sourceId``,
            ``item["content"]`` is the page HTML (decoded as UTF-8 for the
            placeholder check) and ``item["url"]`` is compared with the
            company website before a website artifact is emitted.

    Returns:
        None when ``item`` is None; ``{"status": "No_Name"}`` when the page
        is a placeholder, has no usable name, or fails
        ``name_helper.name_check``; otherwise the full company dict with
        ``"status": 1`` and an ``"artifacts"`` list. ``"round"`` is always
        None in the returned dict.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    page = item["content"]
    logger.info(company_key)
    d = pq(page)

    # The logo id is not resolved here; see parser_db_util.

    if page.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {"status": "No_Name"}

    title_link = d('.company_main > h1 > a')
    name = title_link.text()
    link = title_link.attr('href')
    fullName = name_helper.company_name_normalize(title_link.attr('title'))
    if name is None or fullName is None or name.find("拉勾") >= 0:
        return {"status": "No_Name"}
    # Prefer the full name when the short name is longer than it or blank.
    if len(name) > len(fullName) or name.strip() == "":
        name = fullName

    chinese, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {"status": "No_Name"}

    # A page without a logo <img> yields None instead of crashing on
    # ``None.startswith``.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        if not logo.startswith("http"):
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    desc = None
    if u"该公司尚未添加公司介绍" not in desc_text and len(desc_text) >= 10:
        desc_html = d('.company_intro_text > .company_content').html()
        # The content node may be absent even when the wrapper has text.
        if desc_html is not None:
            desc_html = desc_html.replace(
                '<span class="text_over">展开</span>', '')
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    try:
        field = d(
            '#basic_container > .item_content >ul > li:eq(0) > span').text()
        stage = d(
            '#basic_container > .item_content >ul > li:eq(1) > span').text()
        headCount = d(
            '#basic_container > .item_content >ul > li:eq(2) > span').text()
        location = d(
            '#basic_container > .item_content >ul > li:eq(3) > span').text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever was read before the failure.
        logger.exception("failed to read basic info of %s", company_key)

    # Keep only the text before the unit suffix. Done outside the try block
    # so that a head count without the suffix no longer discards location
    # and address.
    unit_pos = headCount.find(u'人')
    if unit_pos >= 0:
        headCount = headCount[0:unit_pos]
    headCount = headCount.replace("people", "")

    min_staff = None
    max_staff = None
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        # "a-b" gives both bounds, a single number only the lower bound.
        # Both are converted to int; unparsable text leaves both as None.
        bounds = headCount.split('-')
        try:
            min_staff = int(bounds[0].strip())
            if len(bounds) > 1:
                max_staff = int(bounds[1].strip())
        except ValueError:
            min_staff = None
            max_staff = None

    # Only the "no funding needed" stage label affects the result.
    funding_type = 0
    if stage == '不需要融资':
        funding_type = 8010

    location_id = 0
    location_new = parser_db_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s", website)

    def artifact(kind, domain):
        # Every artifact shares name, description and link of the company.
        return {
            "type": kind,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        }

    artifacts = []
    market_type, app_market, app_id = url_helper.get_market(website)
    if market_type == 4010:
        if item["url"] != website and website.find("lagou.com") == -1:
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                artifacts.append(artifact(4010, domain))
    elif market_type == 4040:
        if app_id is not None:
            artifacts.append(artifact(4040, app_id))
    elif market_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_db_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is not None:
            artifacts.append(artifact(4050, domain))
    # Types 4020 and 4030 never produce an artifact: no domain is derived
    # for them.

    return {
        "name": name,
        "fullName":
        fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "status": 1
    }
예제 #12
0
def parse_companyjobs(source_company_id, item, sourceId):
    """Build source-job records from crawled job-list HTML pages.

    Only the version 2 layout is handled: ``item["content"]`` is a dict with
    ``"version": 2`` whose other keys are position-type labels, each mapping
    to a list of UTF-8 encoded HTML pages. Every ``div.job-list> ul> li``
    entry on a page becomes one record.

    Args:
        source_company_id: stored (as a string) in ``recruit_company_id``.
        item: crawled document; only ``item["content"]`` is read.
        sourceId: not used by the code visible in this function.

    Returns:
        list of job dicts; empty when the content is not version 2.
    """

    def publish_time(text):
        # Turn the listing's publish label into a datetime. Labels without
        # the publish prefix, unknown labels and parse failures all fall
        # back to the current time.
        now = datetime.datetime.now()
        try:
            if text.find("发布于") < 0:
                return now
            text = text.replace("发布于", "")
            if text.find("月") >= 0:
                # The label carries month and day only; assume this year.
                dated = str(now.year) + "年" + text
                logger.info("neeewdate:%s", dated)
                parsed = datetime.datetime.strptime(dated, "%Y年%m月%d日")
                if parsed > now:
                    # A date later than today belongs to the previous year.
                    # Rolling the year back keeps month/day intact, which a
                    # fixed 365-day shift does not across a leap day.
                    try:
                        parsed = parsed.replace(year=parsed.year - 1)
                    except ValueError:
                        # Feb 29 has no counterpart in the previous year;
                        # 366 days back lands on Feb 28.
                        parsed = parsed - datetime.timedelta(days=366)
                return parsed
            if text.find("昨天") >= 0:
                return now - datetime.timedelta(days=1)
            return now
        except Exception as e:
            logger.info(e)
            return datetime.datetime.now()

    source_jobs = []
    logger.info("source_company_id is %s", source_company_id)

    pages_by_type = item["content"]
    if not ("version" in pages_by_type and pages_by_type["version"] == 2):
        return source_jobs

    logger.info("version 2!!!!")
    for position_type in pages_by_type:
        if position_type == "version":
            continue
        for content in pages_by_type[position_type]:
            d = pq((html.fromstring(content.decode("utf-8"))))

            domain = 0
            if position_type == '技术':
                domain = 15010
            elif position_type == '产品':
                domain = 15020
            elif position_type == '设计':
                domain = 15030
            elif position_type == '运营':
                domain = 15040
            elif position_type == '市场' or position_type == '销售':
                domain = 15050
            elif position_type.find('职能') >= 0:
                domain = 15060
            elif position_type == '金融':
                domain = 15070

            for li in d('div.job-list> ul> li'):
                dj = pq(li)
                job_link = dj('a').eq(0).attr("href")
                if job_link is None:
                    # Without a link there is no job key to store.
                    logger.info("job entry without link, skipped")
                    continue
                job_key = job_link.split("/")[-1].replace(".html", "")
                born_time = dj('div.info-publis> p').text()
                position = dj('div.title-box> div.job-title').text()

                # The summary line is "<city> <work years> <education>";
                # any other shape leaves all three as None.
                (city, work_year, education) = (None, None, None)
                lll = dj(
                    'div.job-primary> div.info-primary> p').text().strip()
                logger.info("lll %s", lll)
                parts = lll.split(" ")
                if len(parts) == 3:
                    city, work_year, education = parts
                logger.info("%s - %s - %s", city, work_year, education)
                salary = dj('div.title-box> span').text()

                location_id = 0
                location_new = parser_db_util.get_location(city)
                if location_new is not None:
                    location_id = location_new["locationId"]

                education_type = 0
                if education == '大专':
                    education_type = 6020
                elif education == '本科':
                    education_type = 6030
                elif education == '硕士':
                    education_type = 6040
                elif education == '博士':
                    education_type = 6050

                workYear_type = 7000
                if work_year == '应届毕业生':
                    workYear_type = 7010
                elif work_year == '1年以下':
                    workYear_type = 7020
                elif work_year == '1-3年':
                    workYear_type = 7030
                elif work_year == '3-5年':
                    workYear_type = 7040
                elif work_year == '5-10年':
                    workYear_type = 7050
                elif work_year == '10年以上':
                    workYear_type = 7060

                logger.info("born time: %s", born_time)
                update_time = publish_time(born_time)

                source_job = {
                    "source": 13055,
                    "sourceId": job_key,
                    "recruit_company_id": str(source_company_id),
                    "position": position,
                    "salary": salary,
                    "description": None,
                    "domain": domain,
                    "locationId": location_id,
                    "educationType": education_type,
                    "workYearType": workYear_type,
                    "startDate": update_time,
                    "updateDate": update_time,
                    "jobNature": None,
                    "positionAdvantage": None,
                    "companyLabelList": None,
                    "financeStage": None,
                    "district": None,
                }
                logger.info("job_key: %s", job_key)
                for i in source_job:
                    logger.info("%s->%s", i, source_job[i])

                source_jobs.append(source_job)

    # The collected records were previously discarded; hand them back like
    # the JSON-based parse_companyjobs variant does.
    return source_jobs
예제 #13
0
def parse_company(item):
    """Extract a source-company record from a crawled portfolio page.

    Args:
        item: crawled document; ``item["content"]`` is UTF-8 encoded HTML
            and ``item["key"]`` becomes ``sourceId``.

    Returns:
        dict describing the company. ``locationId`` is 0 when no location
        can be resolved and ``logo`` is None when the page has no logo image.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: de-duplicated, first occurrence wins, joined with commas.
    tags = []
    for tag in d('.portfolio-user-tag .label').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    # Read the attribute before touching it: concatenating onto a missing
    # src raised TypeError, and an already absolute URL got a second scheme.
    logo = d('.portfolio-user-photo img').attr('src')
    if logo:
        if not logo.startswith("http"):
            logo = "http:" + logo
        logo = logo.replace("https://", "http://")
        logo = logo.replace("@!msgs", "")
    else:
        logo = None

    companyName = d('.corp-name').text()

    # Resolve the location from the first word of the tag line, falling back
    # to a city guessed from the company name.
    location_id = 0
    location = None
    city = d('.portfolio-user-tag').text().split(' ')[0]
    if city is not None:
        location = parser_db_util.get_location(city)
    if location is None:
        city = name_helper.get_location_from_company_name(companyName)[0]
        if city is not None:
            location = parser_db_util.get_location(city)
    if location is not None:
        location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(
        companyName.replace("_", ""))

    desc = d('.portfolio-user-bio .text').text()
    productDesc = d('.portfolio-text').text()
    website = d('.user-contact a').text()
    # Fall back to the product text when the bio is empty.
    if not desc:
        desc = productDesc

    shortName = d('.portfolio-user-info h1').text()

    return {
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    }
예제 #14
0
def parse_company(item):
    """Build a source-company record from a crawled stock document.

    The function logs itself as "parse_szse_stock".

    Args:
        item: crawled document. Reads ``sourceId`` (logging only),
            ``baseinfo``, ``name``, ``stockwebsite`` and, when available,
            ``jqkaBrief``.

    Returns:
        dict describing the company. ``sourceId`` is the text after the
        last "=" of ``item["stockwebsite"]``; ``locationId`` is None when
        the city cannot be resolved.
    """
    logger.info("parse_szse_stock")
    company_key = item["sourceId"]

    c = item["baseinfo"]
    # Field: last space-separated token of the industry text, if present.
    try:
        field = c["industry"].split(" ")[-1]
    except (KeyError, AttributeError, TypeError):
        field = None

    result = parser_db_util.get_location(c["city"].replace("市", ""))
    location_id = result["locationId"] if result is not None else None

    fullName = item["name"]
    logger.info("parsing :%s|%s", fullName, company_key)

    # Description and brief are optional; missing or malformed data leaves
    # them as None.
    desc = None
    try:
        desc = item["jqkaBrief"]["desc"]
    except (KeyError, TypeError):
        pass

    brief = None
    try:
        # Only the first sentence is kept, and only if shorter than 100.
        brief = item["jqkaBrief"]["brief"].split("。")[0]
        logger.info(len(brief))
        if len(brief) > 99:
            brief = None
    except (KeyError, AttributeError, TypeError):
        pass

    sourceId = item["stockwebsite"].split("=")[-1]

    # A missing or blank registered address is reported as None.
    address = c.get("regLocation")
    if address is None or address.strip() == "":
        address = None

    return {
        # NOTE(review): this drops every "N" in the short name, not only a
        # leading one -- confirm that is intended.
        "name": c["shortname"].replace("N", ""),
        "fullName": fullName.strip(),
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": None,
        "source": SOURCE,
        "sourceId": sourceId,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "englishName": c["englishName"] if "englishName" in c else None
    }
예제 #15
0
def parse_company(item):
    """Build a source-company record from a crawled stock document.

    The function logs itself as "parse_sse_stock".

    Args:
        item: crawled document. Reads ``sourceId``, ``baseinfo`` and, when
            available, ``jqkaBrief``.

    Returns:
        dict describing the company. ``locationId`` is None when the area
        cannot be resolved; ``address`` and ``phone`` are None when missing
        or blank (a phone of "-" also counts as blank).
    """
    logger.info("parse_sse_stock")
    company_key = item["sourceId"]

    c = item["baseinfo"]
    field = c["CSRC_GREAT_CODE_DESC"]

    result = parser_db_util.get_location(c["AREA_NAME_DESC"].replace("市", ""))
    location_id = result["locationId"] if result is not None else None

    fullName = c["FULLNAME"]
    logger.info("parsing :%s|%s", fullName, company_key)

    # Description and brief are optional; missing or malformed data leaves
    # them as None.
    desc = None
    try:
        desc = item["jqkaBrief"]["desc"]
    except (KeyError, TypeError):
        pass

    brief = None
    try:
        # Only the first sentence is kept, and only if shorter than 100.
        brief = item["jqkaBrief"]["brief"].split("。")[0]
        logger.info(len(brief))
        if len(brief) > 99:
            brief = None
    except (KeyError, AttributeError, TypeError):
        pass

    address = c.get("COMPANY_ADDRESS")
    if address is None or address.strip() == "":
        address = None

    # The guard used to test the COMPANY_ADDRESS key, so a record with a
    # phone but no address lost the phone, and one with an address but no
    # phone raised KeyError. Test the phone key itself.
    phone = c.get("REPR_PHONE")
    if phone is not None:
        phone = str(phone)
        if phone.strip() in ["", "-"]:
            phone = None

    return {
        "name": c["shortname"],
        "fullName": fullName.strip(),
        "description": desc,
        "productDesc": None,
        "modelDesc": None,
        "operationDesc": None,
        "teamDesc": None,
        "marketDesc": None,
        "compititorDesc": None,
        "advantageDesc": None,
        "planDesc": None,
        "otherDesc": None,
        "brief": brief,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": address,
        "phone": phone,
        "establishDate": None,
        "logo": None,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": None,
        "tags": None,
        "headCountMin": None,
        "headCountMax": None,
        "englishName":
        c["FULL_NAME_IN_ENGLISH"] if "FULL_NAME_IN_ENGLISH" in c else None
    }
예제 #16
0
def parse_base(item):
    """Parse the basic company profile out of a crawled company page.

    Args:
        item: crawled record (or None). ``item["key"]`` is used as the
            source id and ``item["content"]`` is the page HTML handed to
            pyquery.

    Returns:
        None when ``item`` is None or when neither a short name nor a
        company name could be extracted. Otherwise a dict with the keys
        ``shortName``, ``fullName``, ``productName``, ``description``,
        ``brief``, ``round``, ``roundDesc``, ``companyStatus``,
        ``fundingType``, ``locationId``, ``establishDate``, ``logo``,
        ``sourceId``, ``field``, ``subField``, ``tags``, ``type`` and
        ``artifacts`` (a list of dicts with ``type``/``name``/``desc``/
        ``link``/``domain``).
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    d = pq(html)

    # The page title is either "<product>" or "<product>/<company short name>".
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if not product_name:
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    title_parts = product_name.split("/", 1)
    if len(title_parts) == 2:
        product_name = title_parts[0].strip()
        company_short_name = title_parts[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    # Full company name: first "des-more" row, falling back to the <h2>.
    # The two literals compared below are the page's "not available" markers.
    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name is None or company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            # Best effort: keep the empty name when the fallback lookup fails.
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # Establish date is shown as "<year>.<month>"; the day is always set to 1.
    establish_date = None
    established_text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
    result = util.re_get_result(r'(\d*)\.(\d*)', established_text)
    if result is not None:
        (year, month) = result
        try:
            # Any month outside 1..12 (or not a number) falls back to January.
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # e.g. an empty year: leave the date unknown instead of crashing.
            establish_date = None
    logger.info("establish date: %s" % establish_date)

    # Location is shown as "<province>·<city>"; try the city first, then
    # the province.
    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        for place in (city, province):
            location = parser_db_util.get_location(place)
            if location is not None:
                locationId = location["locationId"]
                break

    # Last resort: derive the location from the company name itself.
    if locationId == 0:
        loc1, _ = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            location = parser_db_util.get_location(loc1)
            if location is not None:
                locationId = location["locationId"]
    logger.info("locationId: %d" % locationId)

    # 2010 is the default status; 2020 is used when the page says "已关闭".
    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    # NOTE(review): both labels map to 8020, exactly as before -- confirm
    # whether the second label was meant to use a different code.
    if funding_text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)

    # Drop the brief only when it contains the placeholder. The previous
    # truthiness test on find() treated -1 ("not found") as a match and
    # wiped nearly every real brief.
    if brief.find("暂未收录") >= 0:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系","").replace('*****@*****.**',"").replace("itjuzi是一家数据服务公司","").strip()
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # Collect artifacts from every link in the link line. Each selector is
    # scanned once and each normalised URL is handled once, so the same
    # artifact is no longer emitted several times.
    artifacts = []
    seen_websites = set()
    placeholder_urls = ("http://%e6%9a%82%e6%97%a0", "http://tt")
    link_selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    for selector in link_selectors:
        for wa in d(selector):
            link_node = pq(wa)
            webs = []
            # A link may carry the URL in its href, in its text, or in both.
            for source in ("href", "text"):
                try:
                    if source == "href":
                        website = link_node.attr("href").strip()
                    else:
                        website = link_node.text().strip()
                    if website in placeholder_urls:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    if website in seen_websites:
                        continue
                    seen_websites.add(website)
                    webs.append(website)
                except Exception:
                    # Best effort: a missing href or an un-normalisable
                    # value just means this candidate is skipped.
                    continue

            for website in webs:
                artifact_type, app_market, app_id = url_helper.get_market(website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

                elif artifact_type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })

                elif artifact_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })

                elif artifact_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id
                        })

                elif artifact_type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        # For these two markets the stored apk name is used
                        # as the domain, when the app is known.
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # Funding round status
    roundStr = d('span.t-small.c-green').text().replace("(","").replace(")","").replace("获投状态:","").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }