Example #1
def get_new_car_pic(url: str):
    """

    :param url: 範例 https://c.8891.com.tw/audi/a1-sportback/HDPhoto.html
    :return: 該車型所有圖片url
    """
    pic_url_list = []  # 放圖片url
    ss = myutils.get_session()  # 可以換成requests.session()
    res = ss.get(url=url, headers=myutils.get_header())  # header裡只有user agent
    print("get response from", res.url)
    # print(res.text)
    scriptsoup = myutils.get_soup(res.text).find_all('script',
                                                     type="text/javascript")
    for script in scriptsoup:
        # print(script)
        tmp = str(script)
        if tmp.find("InitData") != -1:
            # print(tmp.index(": ["), tmp.index("]"))
            pid_str = tmp[tmp.index(": [") + 3:tmp.index("]")]
            pid_list = pid_str.split(",")
            print(pid_list)
            photo_lib_url = "https://c.8891.com.tw/photoLibrary-ajaxList.html?pid="
            pidstr = ""
            for idx, pid in enumerate(pid_list):
                pidstr += pid
                # how many photo URLs to fetch per request
                num_of_photo = 7
                # flush a batch every num_of_photo ids, and after the last id
                if (idx + 1) % num_of_photo == 0 or idx == len(pid_list) - 1:
                    # print(pidstr)
                    # query https://c.8891.com.tw/photoLibrary-ajaxList.html
                    r = ss.get(url=photo_lib_url +
                               myutils.url_encoding(pidstr),
                               headers=myutils.get_header())  # the commas in the URL need encoding
                    # print(r.url, "photo url result:")
                    # print(r.text)
                    try:
                        json_obj = json.loads(r.text)["data"]
                    except Exception as err:
                        print("error ", "~" * 20)
                        print(err)
                        print(r.text)
                        pidstr = ""
                        continue  # skip this batch instead of reusing a stale json_obj

                    for photo_json in json_obj:
                        photo_url = photo_json["smallPic"].replace(
                            r"\/", "/")  # strip the escaped slashes
                        pic_url_list.append(photo_url)
                    pidstr = ""
                else:
                    pidstr += ","
    return pic_url_list
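
All of these examples lean on a small myutils module that is never shown. A minimal sketch of the helpers used above, inferred from the call sites (the names are real, the bodies are assumptions), might look like:

# hypothetical myutils.py -- bodies guessed from how the helpers are called
import urllib.parse

import requests
from bs4 import BeautifulSoup


def get_session() -> requests.Session:
    # the examples treat it as a plain requests.Session()
    return requests.Session()


def get_header() -> dict:
    # per the comment in example #1, only a User-Agent is set
    return {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def get_soup(html: str) -> BeautifulSoup:
    return BeautifulSoup(html, "html.parser")


def url_encoding(text: str) -> str:
    # percent-encodes the commas in the pid list, e.g. "1,2" -> "1%2C2"
    return urllib.parse.quote(text)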
Example #2
def meta_search(kw, ss, url, cata, total_car_num):
    search_header = myutils.get_header()
    search_header[
        "Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8"
    search_header["Accept"] = "*/*"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Accept-Encoding"] = "br,gzip,deflate"
    search_header["Origin"] = "https://tw.usedcar.yahoo.com"
    search_header["Referer"] = url
    search_header["Connection"] = "keep-alive"
    search_header["Content-Length"] = "56"
    search_header["X-Requested-With"] = "XMLHttpRequest"
    post_data = {
        # this key is the devtools label for "MIME type", apparently copied verbatim from a captured request
        "MIME 類型": "application/x-www-form-urlencoded; charset=UTF-8",
        "cata": "000000515224",
        "cateid": cata,
        "action": "dataPrepare"
    }
    req = ss.post(url="https://tw.usedcar.yahoo.com/search/search_services",
                  headers=search_header,
                  data=post_data)
    json_data = json.loads(req.text)
    print("meta search---------------------")
    # print(json_data)
    car_search(ss, url, cata, total_car_num, kw)
Example #3
def yahoo_car():
    url = "https://tw.usedcar.yahoo.com"
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = BeautifulSoup(req.text, "html.parser")
    # print(soup.prettify())
    # car types
    car_type_list = soup.select("form select[name='catb'] option")
    car_type_dict = {
        t["value"]: t.text
        for t in car_type_list if len(t["value"]) > 0
    }
    # brands
    brand_list = soup.select("form select[name='catid'] option")
    brand_dict = {
        t["value"]: t.text
        for t in brand_list if len(t["value"]) > 0
    }

    input_data = {
        i["name"]: i["value"]
        for i in soup.select("form input[type='hidden']")
    }
    print(car_type_dict)
    print(brand_dict)
    action = soup.select_one("form")["action"]
    print("input data", input_data)
    for brand in brand_dict:
        search_page("000000515224", input_data, action, url, ss, kw=brand)
Example #4
def get_job_content(job_url):
    job_id = myutils.get_jobid_by_url(job_url)
    content_rul = "https://www.104.com.tw/job/ajax/content/" + job_id
    # 製作header
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Language"] = "zh-tw"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = job_url
    header["Accept-Encoding"] = "br, gzip, deflate"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["Connection"] = "keep-alive"
    req = ss.get(url=content_url, headers=header)
    # print(json.dumps(json.loads(req.text), indent=4, ensure_ascii=False))
    try:
        content_data = json.loads(req.text)
    except json.JSONDecodeError as err:
        print(err)
        print(job_url)
        print(req.text)
        raise  # content_data would be unbound below
    job_content = {}
    job_content["id"] = job_id
    job_content["job_name"] = content_data["data"]["header"]["jobName"]
    job_content["url"] = job_url
    job_content["company_name"] = content_data["data"]["header"]["custName"]
    job_content["company_url"] = content_data["data"]["header"]["custUrl"]
    job_content["contact"] = content_data["data"]["contact"]
    job_content["skill"] = content_data["data"]["condition"]["specialty"]
    job_content["job_detail"] = content_data["data"]["jobDetail"]["jobDescription"]
    print("get content url:", job_url, "success")
    return job_content
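
myutils.get_jobid_by_url is not shown either; assuming 104 job links look like "//www.104.com.tw/job/<id>?jobsource=...", as the snippets above suggest, a plausible sketch is:

# hypothetical helper; assumes the job id is the path segment after /job/
from urllib.parse import urlparse


def get_jobid_by_url(job_url: str) -> str:
    path = urlparse(job_url).path  # "/job/<id>", query string dropped
    return path.rstrip("/").split("/")[-1]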
Example #5
def get_page(page_num: int) -> dict:
    header = myutils.get_header()
    header["Accept"] = "application/json, text/javascript, */*; q=0.01"
    header["Accept-Encoding"] = "gzip, deflate, br"
    header["Accept-Language"] = "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7"
    header["Connection"] = "keep-alive"
    header["Host"] = "www.104.com.tw"
    header["Referer"] = first_url + "&order=1"
    header["Sec-Fetch-Dest"] = "empty"
    header["Sec-Fetch-Mode"] = "cors"
    header["Sec-Fetch-Site"] = "same-origin"
    header["X-Requested-With"] = "XMLHttpRequest"
    global keyword

    list_url = "https://www.104.com.tw/jobs/search/list?ro=0&kwop=7&keyword={}&order=15&asc=0&page={}&mode=s&jobsource=2018indexpoc"
    list_url = list_url.format(keyword, str(page_num))
    print("get page ", list_url)
    req = ss.get(url=list_url, headers=header)
    jd = json.loads(req.text)
    print(list_url, "status", jd["status"])
    # print(jd["data"]["list"])
    job_dict = {
        myutils.get_jobid_by_url(job["link"]["job"]): {
            "job_name": job["jobName"],
            "url": "https:" + job["link"]["job"],
        }
        for job in jd["data"]["list"]
    }
    # print(job_dict)
    return job_dict
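
Examples #4-#6 and #11 also share module-level globals (ss, keyword, first_url) that never appear in the snippets. Something like this preamble is assumed; the exact search URL is a guess modeled on the list URL above:

# hypothetical module preamble for the 104 snippets
import myutils

ss = myutils.get_session()
keyword = "python"  # placeholder search term
first_url = "https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword={}&mode=s&jobsource=2018indexpoc"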
Example #6
def main():
    global first_url
    global keyword
    first_url = first_url.format(keyword)

    req = ss.get(url=first_url,
                 headers=myutils.get_header())
    soup = get_soup(req.text)
    # use the real page count so the loop over pages 2..n below actually runs
    page_num = get_total_page(req.text)
    job_data = {}
    for idx, bs in enumerate(soup.select("article div.b-block__left")):
        # print(bs)
        # print(idx, idx, idx)
        job = bs.select("a.js-job-link")
        for j in job:
            # print("url", j["href"], idx)
            if j["href"].find("hotjob_chr") == -1:
                job_data[myutils.get_jobid_by_url(j["href"])] = {"url": "https:" + j["href"], "job_name": j.text}
            # print("job_name", j.text)
        # print("-----------------")
    print(job_data)

    job_result = []
    for job in job_data:
        job_url = job_data[job]["url"]
        # print(job_url)
        job_content = get_job_content(job_url)
        job_service.add_job(job_content)
        job_result.append(job_content)

    for i in range(2, page_num + 1):
        job_data = get_page(i)
        for job in job_data:
            job_url = job_data[job]["url"]
            # print(job_url)
            sleep_time = random.uniform(1, 2)
            print("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)
            job_content = get_job_content(job_url)
            job_service.add_job(job_content)
            job_result.append(job_content)
    # print(len([bs.select("a.js-job-link") for bs in soup.select("div.b-block__left")]))
    # count how often each skill name appears
    skill_dict = defaultdict(int)
    for job in job_result:
        for skill in job["skill"]:
            skill_dict[skill["description"]] += 1

    print(skill_dict)
    with open("./dict/skill.txt", "a") as file:
        file.write(json.dumps(skill_dict))
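
get_total_page is another missing helper. One plausible version, assuming the search page embeds a "totalPage" value in its inline JSON (adjust the pattern to whatever the page actually ships):

# hypothetical; scrape the page count out of the search HTML
import re


def get_total_page(html: str) -> int:
    match = re.search(r'"totalPage":\s*(\d+)', html)
    return int(match.group(1)) if match else 1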
Example #7
def get_new_car_type(url: str):
    header = myutils.get_header()
    header["referer"] = "https://c.8891.com.tw/Models"
    ss = myutils.get_session()
    res = ss.get(url=url, headers=header)
    brandsoup = myutils.get_soup(res.text)
    # build the model list (name -> link)
    car_type_dict = {
        t.text: t["href"]
        for t in brandsoup.select(
            "div.brand-list-main.IndexKindContent a.brand-list-type")
    }
    return car_type_dict
Example #8
def get_article():
    ss = myutils.get_session()
    # make header
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"

    url = ("http://www.carplushk.com/wp-admin/admin-ajax.php"
           "?id=&post_id=4036&slug=review"
           "&canonical_url=http%3A%2F%2Fwww.carplushk.com%2Fcategory%2Freview%2F"
           "&posts_per_page=12&page={}&offset=25&post_type=post"
           "&repeater=template_1&seo_start_page=1"
           "&preloaded=false&preloaded_amount=0"
           "&cta[cta]=true&cta[cta_position]=after:12"
           "&cta[cta_repeater]=template_3&cta[cta_theme_repeater]=null"
           "&category=review&order=DESC&orderby=date"
           "&action=alm_get_posts&query_type=standard")

    urlajax = url.format("0")
    print(urlajax)
    res = ss.get(url=urlajax, headers=header)
    data_dict = json.loads(res.text)
    try:
        total_post = int(data_dict["meta"]["totalposts"])
    except ValueError as err:
        print("*" * 50)
        print("total post is not a number:", err)
        return
    # for k in data_dict:
    #     print(k, " : ", data_dict[k])
    # soup = myutils.get_soup(data_dict["html"])
    # print(soup.prettify())

    # for s in soup.select("div.ajaxmoreblk a"):
    #     a = {"_id": s["href"], "title": s.text, "from": "http://www.carplushk.com", "type": "review"}
    #     article_list.append(a)
    # print(article_list)
    # result = mongo_service.insert_many("data", "car_article", article_list)
    # print(result)

    for i in range(total_post // 12 + 1):  # 12 posts per page
        article_list = []
        urlajax = url.format(i)
        res = ss.get(url=urlajax, headers=header)
        data_dict = json.loads(res.text)
        soup = myutils.get_soup(data_dict["html"])
        for s in soup.select("div.ajaxmoreblk a")[:-1]:
            a = {"_id": s["href"], "title": s.text, "from": "http://www.carplushk.com", "type": "review"}
            if not mongo_service.is_exist(idd=a["_id"], collection="car_article"):
                article_list.append(a)
            else:
                print(a["_id"], " already in article db")
        print(article_list)
        if len(article_list) > 0:
            result = mongo_service.insert_many("data", "car_article", article_list)
            print(result)
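
The mongo_service calls map cleanly onto pymongo. A minimal sketch matching the call sites in examples #8 and #15 (the connection string and the default collection are assumptions):

# hypothetical mongo_service.py built on pymongo
from pymongo import MongoClient

_client = MongoClient("mongodb://localhost:27017")  # assumed local instance


def get_mongo_conn():
    return _client


def is_exist(idd, collection="car", db_name="data"):
    # True when a document with this _id is already stored
    return _client[db_name][collection].find_one({"_id": idd}) is not None


def insert_many(db_name, collection, docs):
    return _client[db_name][collection].insert_many(docs).inserted_ids


def insert_data(db_name, collection, json_data):
    return _client[db_name][collection].insert_one(json_data).inserted_id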
Example #9
def download_pic(ss, car):
    pic_path = "./pic/{}/{}/{}/{}/{}/{}_{}_{}_{}".format(
        car["廠牌"], car.get("型號a", "0"), car.get("型號", "0"),
        car["auto_build_year"], car["mid"], car["廠牌"], car.get("型號a", "0"),
        car.get("型號", "0"), car["auto_build_year"])
    car_pic = car.pop("pic")
    car["pic"] = []
    for i, pic in enumerate(car_pic):
        q = ss.get(url=pic, headers=myutils.get_header())
        car["pic"].append({
            "url":
            pic,
            "file_path":
            myutils.write_pic_file(pic_path + "_{}.jpg".format(i), q.content)
        })
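
myutils.write_pic_file has to create the nested ./pic/... directories implied by pic_path before writing. A sketch under that assumption:

# hypothetical; writes the bytes and returns the path stored in car["pic"]
import os


def write_pic_file(file_path: str, content: bytes) -> str:
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(content)
    return file_path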
Example #10
def get_used_car_page(url):
    logger.info("{} get url:{}".format(__name__, url))
    ss = myutils.get_session()
    res = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    logger.info(str(soup.prettify()))
    car = {}
    car_type = soup.select("div.breadcrumb a.NormalLink")
    print(car_type)
    car["brand"] = car_type[2].text
    if len(car_type) >= 5:
        car["type"] = car_type[4].text
    car["type2"] = car_type[3].text
    car["title"] = soup.select_one(
        "div.right-info info-right-width div.infos-head-title span").text
    car["price"] = soup.select_one("div.car-price-box div#price b").text
Example #11
def do_search(key_word: str, page_num):
    global first_url
    if key_word is None or len(key_word) == 0:
        return "error keyword"
    if page_num is None:
        return "error page_num"
    page_num = int(page_num)  # convert only after the None check
    # fetch the first page of results
    first_page_url = first_url.format(key_word)
    req = ss.get(url=first_page_url,
                 headers=myutils.get_header())
    soup = get_soup(req.text)
    job_data = {}
    for idx, bs in enumerate(soup.select("article div.b-block__left")):
        # print(bs)
        # print(idx, idx, idx)
        job = bs.select("a.js-job-link")
        for j in job:
            # print("url", j["href"], idx)
            if j["href"].find("hotjob_chr") == -1:
                job_data[myutils.get_jobid_by_url(j["href"])] = {"url": "https:" + j["href"], "job_name": j.text}

    job_result = []
    for job in job_data:
        job_url = job_data[job]["url"]
        # print(job_url)
        job_content = get_job_content(job_url)
        job_service.add_job(job_content)
        job_result.append(job_content)
    # fetch page 2 onward
    key_word = parse.quote(key_word)
    first_url = first_url.format(key_word)
    if page_num != 0:
        for i in range(2, page_num + 1):
            job_data = get_page(keyword=key_word, page_num=i)
            for job in job_data:
                job_url = job_data[job]["url"]
                # print(job_url)
                sleep_time = random.uniform(1, 2)
                print("sleep {} sec".format(sleep_time))
                time.sleep(sleep_time)
                job_content = get_job_content(job_url)
                job_service.add_job(job_content)
                job_result.append(job_content)
    myutils.write_json_file(job_result, str(int(time.time())) + "job.json")
Example #12
def get_new_car_brand():
    # print("start")
    url = "https://c.8891.com.tw"
    ss = myutils.get_session()
    # get https://c.8891.com.tw/Models
    res = ss.get(url=url + "/Models", headers=myutils.get_header())
    soup = myutils.get_soup(res.text)
    # print(soup.select("div.scroll-area"))
    # build the car brand list
    new_car_brand_list = []
    for a in soup.select("div.scroll-area li"):
        new_car_brand = {}
        new_car_brand["country"] = a["country"]
        new_car_brand["brand_id"] = a["id"]
        atag = a.select_one("a")
        new_car_brand["brand"] = atag.text.strip()
        new_car_brand["link"] = url + atag["href"]
        new_car_brand_list.append(new_car_brand)
    return new_car_brand_list
Example #13
def search_page(cata, input_data, action, url, ss, kw=""):
    if len(kw) == 0:
        input_data["catb"] = cata
    input_data["kw"] = kw
    print(input_data)

    search_header = myutils.get_header()
    search_header[
        "Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    search_header["Accept-Encoding"] = "br, gzip, deflate"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Referer"] = "https://tw.usedcar.yahoo.com/"
    search_header["Connection"] = "keep-alive"

    searchreq = ss.get(url + action, params=input_data, headers=search_header)
    print("search page", searchreq.url)
    soup2 = myutils.get_soup(searchreq.text)
    total_car_num = soup2.select_one("div .infol.mei-u em").text
    print("total num:", total_car_num)
    meta_search(kw, ss, searchreq.url, cata, total_car_num)
Example #14
def get_article_content(url: str, ss):  # note: url goes unused; article URLs come from Mongo below
    conn = mongo_service.get_mongo_conn()
    db = conn["data"]
    coll = db["car_article"]
    cursor = coll.find({})

    # reuse the session passed in rather than opening a new one
    header = myutils.get_header()
    header["Accept"] = "application/json, text/plain, */*"
    header["Accept-Encoding"] = "gzip, deflate"
    header["Host"] = "www.carplushk.com"
    header["Accept-Language"] = "zh-tw"
    header["Referer"] = "http://www.carplushk.com/category/review/"
    header["Connection"] = "keep-alive"
    count = 1
    for art_url in cursor:
        art_dict = {}
        print(art_url["_id"], "\n")
        art_dict["_id"] = art_url["_id"]
        res = ss.get(url=art_url["_id"], headers=header)
        soup = myutils.get_soup(res.text)
        content = soup.select_one("div.entry-content.single-page")
        pdate = content.select_one("div.postdayau").text
        art_dict["post_time"] = datetime.datetime.strptime(pdate.split("By")[0].strip(), "%d %b, %Y")
        print(art_dict["post_time"])
        main_content = ""
        for tag in content:
            if tag.name == "p":
                if tag.text.find("Text & Photo") == -1:
                    main_content += tag.text
                    main_content += "\n"
            elif tag.name == "h2":
                main_content += "=t{}=t\n".format(tag.string)
        art_dict["content"] = main_content
        print(art_dict)
        count += 1
        if count == 5:  # stop after a few articles; looks like a debugging limit
            break
        time.sleep(random.randint(1, 5))
Example #15
def car_search(ss, url, cata, total_car_num, kw):
    each_page = 30
    search_header = myutils.get_header()
    search_header[
        "Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8"
    search_header["Accept"] = "*/*"
    search_header["Host"] = "tw.usedcar.yahoo.com"
    search_header["Accept-Language"] = "zh-tw"
    search_header["Accept-Encoding"] = "br,gzip,deflate"
    search_header["Origin"] = "https://tw.usedcar.yahoo.com"
    search_header["Referer"] = url
    search_header["Connection"] = "keep-alive"
    search_header["Content-Length"] = "268"
    search_header["X-Requested-With"] = "XMLHttpRequest"
    post_data = {
        "MIME 類型": "application/x-www-form-urlencoded; charset=UTF-8",
        "cata": "000000515224",
        "catb": cata,
        "undedup": 0,
        "unspc": 0,
        "areaa": "tw",
        "sort": 3,
        "total": total_car_num,
        "cp": 1,
        "ppa": each_page,
        "pa": 10,
        "type": "srplist",
        "vmode": 0,
        "action": "srplistquery",
        "kw": kw
    }
    if len(kw) > 0:
        post_data["catid"] = "000000515224"
    print("car search---------------------")
    # print(json_data)
    total_page = (int(total_car_num) // each_page) + 1
    for page in range(1, total_page + 1):
        print("total_page:", total_page, "current page :", page)
        post_data["cp"] = page
        try:
            req = ss.post(
                url="https://tw.usedcar.yahoo.com/search/search_services",
                headers=search_header,
                data=post_data)
            json_data = json.loads(req.text)
        except Exception as err:
            print("-" * 30)
            print(err)
            file_path = "./err/msg/{}.txt".format(kw + str(page))
            myutils.write_text_file(file_path=file_path, content=req.text)
            # error_log = {
            #     "err": err,
            #     "data": file_path
            # }
            # mongo_service.insert_data(collection="err", db_name="data", json_data=json.dumps(error_log))
            # raise err
            continue  # json_data is stale or unset here, so skip this page
        try:
            for car in json_data["data"][1:]:
                if not mongo_service.is_exist(car["mid"]):
                    print("cat id {} already existed".format(car["mid"]))
                    continue
                url = car["mlink"]
                r = ss.get(url, headers=myutils.get_header())
                print("get car detail : url", url)
                car_soup = myutils.get_soup(r.text)
                # print(car_soup.prettify())
                # brand / model breadcrumb
                car_brand = [a.text for a in car_soup.select("div.itemhd a")]
                car["新舊"] = car_brand[0]  # new/used
                car["車型"] = car_brand[1]  # body type
                car["廠牌"] = car_brand[2].replace("/", "[sl]")  # brand; "/" would break the pic path
                if len(car_brand) > 3:
                    car["型號a"] = car_brand[3]  # model (series)
                    car["型號"] = "fix_" + car_brand[3]
                    if len(car_brand) > 4:
                        car["型號"] = car_brand[4]  # model (trim)
                # car status fields
                car_status = []
                for i in car_soup.select("div#ycoptions ul#itemAttrs li")[0:3]:
                    for j in i:
                        car_status.extend(j.select("td"))
                # print("car_status", car_status)
                for i in range(0, len(car_status), 2):
                    if "hide" not in car_status[i]["class"]:
                        car[car_status[i].text] = car_status[i + 1].text
                # car equipment
                car_equipment = car_soup.select(
                    "div#ycoptions ul#itemAttrs li.col2 td span")
                print("car_equipment", car_equipment)
                for i in car_equipment:
                    car[i.text] = 1
                # car pictures
                car_pic = car_soup.select_one(
                    "div#ycitemslideshow div.sft input")
                car_pic = car_pic["value"].replace(
                    "'", '"')  # take the value attribute, swapping single quotes for double
                if len(car_pic) > 0:
                    car_pic = json.loads(car_pic)  # parse the text into objects
                    car_pic = [pic["i"] for pic in car_pic]
                # print("car_pic", car_pic)
                car["pic"] = car_pic
                # get_picture(ss, car_pic["href"])

                # write the pictures to disk
                # print(car)
                # download_pic(ss, car)
                car["_id"] = car.pop("mid")
                # car.remove_key("mid")
                # print(car)
                mongo_service.insert_data("data", "car", car)
                item_sleep_time = random.uniform(0, 2)
                print("item sleep :", item_sleep_time)
                time.sleep(item_sleep_time)
            page_sleep_time = random.uniform(0, 5)
            print("page sleep :", page_sleep_time)
            time.sleep(page_sleep_time)
        except Exception as err:
            print("-" * 20)
            # print(car)
            # print("car pic", car_pic)
            error_log = {"err": err, "data": car}
            mongo_service.insert_data(collection="err",
                                      json_data=error_log,
                                      db_name="data")
Example #16
def get_pic_page_url(url: str):
    ss = myutils.get_session()
    req = ss.get(url=url, headers=myutils.get_header())
    soup = myutils.get_soup(req.text)
    url_list = [a["href"] for a in soup.select("div.jp-bg-color.mt10 a")]
    return url_list[1]
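
Examples #12, #7, #16, and #1 chain into a brands -> models -> photo page -> picture URLs pipeline. A sketch of a driver, assuming the hrefs returned by get_new_car_type are site-relative (drop the prefix if they turn out to be absolute):

# hypothetical driver tying the 8891 snippets together
import random
import time

for brand in get_new_car_brand():  # example #12: absolute links
    for name, link in get_new_car_type(brand["link"]).items():  # example #7
        # assumption: these hrefs are site-relative; adjust if they are absolute
        photo_page = get_pic_page_url("https://c.8891.com.tw" + link)  # example #16
        pics = get_new_car_pic(photo_page)  # example #1
        print(brand["brand"], name, len(pics), "pictures")
        time.sleep(random.uniform(1, 3))  # be polite to the site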