Example #1
    def getcompany_url(self, job_url):
        logger = Logger(logname='error.log', logger="58com").getlog()
        company_list = {}
        try:
            data = proxy.proxy_request(job_url)
            soup = BeautifulSoup(data, 'html.parser')
            tags = soup.find_all('div', class_="comp_name")
            for tag in tags:
                company_name = tag.a.get_text()
                company_url = tag.a['href']
                # handle relative vs. absolute paths
                if company_url.startswith('http'):
                    company_list[company_name] = company_url
                else:
                    company_list[company_name] = "http://qy.58.com" + company_url

        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            logger.error("get compang url failed, url: %s", job_url)  #记录失败日志
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

        return company_list
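
Note: the proxy.proxy_request helper used throughout Examples #1-#3 and #7 is not shown here. Below is a minimal sketch of what these call sites appear to assume (a GET routed through an HTTP proxy that returns the raw page body); the proxy address, User-Agent, and timeout are placeholders, not the original implementation.

import urllib.request

def proxy_request(url, proxy_addr='127.0.0.1:8080', timeout=10):
    """Fetch url through an HTTP proxy and return the decoded page body."""
    handler = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
    opener = urllib.request.build_opener(handler)
    opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # placeholder UA
    with opener.open(url, timeout=timeout) as resp:
        return resp.read().decode('utf-8', errors='ignore')
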
Example #2
    def getcompany_info(self, name, url):
        logger = Logger(logname='error.log', logger="58com").getlog()
        ds = DataStore()
        try:
            company_text = []
            html = proxy.proxy_request(url)
            soup = BeautifulSoup(html, 'html.parser')
            tag = soup.find(class_="basicMsg")
            ul = tag.find("ul")
            li_tags = ul.find_all(name='li')
            strinfo = re.compile(r'\s')  # matches any whitespace character
            for li in li_tags:
                txt = strinfo.sub('', li.get_text())
                company_text.append(txt.split(':')[1])
            # fetch business registration info (currently disabled)
            #gongshang_info = tianyan.tianyan_search(name)
            #gongshang_info = ','.join(gongshang_info)
            ds.insert_database(name, company_text)

        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            logger.error("Get company info fail, company name: %s, url: %s",
                         name, url)  #记录解析失败的公司和url
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)
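
Note: Logger(logname=..., logger=...).getlog() is assumed to be a thin wrapper around the standard logging module that returns a logger writing to the given file. The sketch below is only consistent with how it is called above; the level and format are guesses.

import logging

class Logger:
    """Hypothetical wrapper returning a logging.Logger bound to a log file."""

    def __init__(self, logname, logger):
        self.logger = logging.getLogger(logger)
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:  # avoid stacking duplicate handlers
            handler = logging.FileHandler(logname)
            handler.setFormatter(logging.Formatter(
                '%(asctime)s %(name)s %(levelname)s %(message)s'))
            self.logger.addHandler(handler)

    def getlog(self):
        return self.logger
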
Example #3
    def get_url(self, start_url):
        logger = Logger(logname='error.log', logger="58com").getlog()
        url_dict = {}
        try:
            data = proxy.proxy_request(start_url)
            soup = BeautifulSoup(data, 'html.parser')
            tags = soup.find(id="sidebar-right")
            tags_li = tags.find_all('li')
            for tag in tags_li:
                a_tags = tag.find_all('a')
                job_class = a_tags[0].string
                job_urlname = a_tags[0].attrs.get('href')
                # handle relative vs. absolute paths
                if job_urlname.startswith('/'):
                    job_urlname = "http://bj.58.com" + job_urlname + "pn"
                url_dict[job_class] = job_urlname

        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            logger.error("Can not open start url: %s", start_url)
        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

        return url_dict
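
A hedged sketch of how the three methods above might be chained. The Spider58 class name, the start URL, the page range, and the "pn" page-number handling are assumptions inferred from Example #3, not code from the original project.

# Hypothetical driver loop
spider = Spider58()
for job_class, base_url in spider.get_url("http://bj.58.com/job.shtml").items():
    for page in range(1, 4):  # first few listing pages only
        job_url = base_url + str(page) if base_url.endswith("pn") else base_url
        for name, company_url in spider.getcompany_url(job_url).items():
            spider.getcompany_info(name, company_url)
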
Example #4
def get(search_element, max_product_count):

    # request for webpage and parse it
    url = 'https://www.amazon.in/s?k=' + search_element
    response = proxy_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    with open('web/amazon-temp.html', 'w') as file:
        file.write(soup.prettify())

    # check for empty result set
    products = soup.find_all('li', class_='s-result-item celwidget', limit=1)
    if not products:
        print('empty result set from amazon.in')
        with open('json/amazon_in.json', 'w') as file:
            json.dump([], file)
        exit()

    # create threads to search for each attribute in parallel
    name_link_thread = Thread(target=get_name_and_link, args=(soup, max_product_count))
    image_thread = Thread(target=get_image, args=(soup, max_product_count))
    price_thread = Thread(target=get_price, args=(soup, max_product_count))

    # search items
    name_link_thread.start()
    image_thread.start()
    price_thread.start()

    # wait for the threads to end
    name_link_thread.join()
    image_thread.join()
    price_thread.join()

    # create array storing dictionaries containing all product details
    product_arr = []
    for i in range(len(product_names)):
        product_arr.append({
            'name': product_names[i],
            'price': product_prices[i],
            'image': product_images[i],
            'link': product_links[i]
        })

    # storing this object into a JSON file
    with open('json/amazon_in.json', 'w') as file:
        json.dump(product_arr, file)

    print('amazon: done')
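
Note: unlike Examples #1-#3, the proxy_request here is read as response.text, i.e. it is expected to return a requests-style Response object rather than raw HTML. A minimal sketch of that variant follows; the proxy endpoints, headers, and timeout are placeholders.

import requests

PROXIES = {'http': 'http://127.0.0.1:8080',   # placeholder proxy endpoints
           'https': 'http://127.0.0.1:8080'}

def proxy_request(url):
    """GET url through the configured proxy and return the requests Response."""
    return requests.get(url,
                        proxies=PROXIES,
                        headers={'User-Agent': 'Mozilla/5.0'},
                        timeout=10)
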
Example #5
def get(search_element, max_product_count):

    # request web page and parse it
    url = 'https://www.snapdeal.com/search?keyword=' + search_element
    response = proxy_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get all product details
    products = soup.find_all('img',
                             class_='product-image',
                             limit=max_product_count)

    # check for empty result set
    if not products:
        print('empty result set from snapdeal.com')
        with open('json/snapdeal.json', 'w') as file:
            json.dump([], file)
        exit()

    # create threads to search for each attribute in parallel
    name_image_thread = Thread(target=get_name_and_image, args=(products,))
    link_thread = Thread(target=get_link, args=(soup, max_product_count))
    price_thread = Thread(target=get_price, args=(soup, max_product_count))

    # search items
    name_image_thread.start()
    link_thread.start()
    price_thread.start()

    # wait for the threads to end
    name_image_thread.join()
    link_thread.join()
    price_thread.join()

    # create array storing dictionaries containing all product details
    product_arr = []
    for i in range(len(product_names)):
        product_arr.append({
            'name': product_names[i],
            'price': product_price[i],
            'image': product_images[i],
            'link': product_link[i]
        })

    # store this object into a local JSON file
    with open('json/snapdeal.json', 'w') as file:
        json.dump(product_arr, file)

    print('snapdeal: done')
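
Note: the thread targets (get_price, get_link, get_name_and_image) and the module-level lists they fill (product_names, product_price, product_images, product_link) are defined elsewhere and not shown. A hypothetical sketch of one target, matching the names read back in Example #5; the CSS class is a placeholder and will differ on the live site.

product_price = []  # module-level list read back by get()

def get_price(soup, max_product_count):
    """Collect price strings for up to max_product_count results."""
    for tag in soup.find_all('span', class_='product-price', limit=max_product_count):
        product_price.append(tag.get_text(strip=True))

Since each list is written by exactly one thread and only read after join(), this pattern needs no explicit locking.
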
Example #6
def catch_all(path):
    '''
        All requests are caught by this route, unless explicitly caught by
        other more specific patterns.
        http://flask.pocoo.org/docs/0.12/design/#the-routing-system
    '''
    def fallback():
        return render_template('generic.html', context={'heading': "lil blog media uploader",
                                                        'message': "a static site helper"})

    try:
        proxied_response = proxy_request(request, path)
        if proxied_response:
            return proxied_response
        else:
            app.logger.warning("No response returned by proxied endpoint.")
            return fallback()
    except NameError:
        app.logger.warning("No proxy function available.")
        return fallback()
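
Note: the proxy_request(request, path) helper guarded by the NameError fallback above is defined elsewhere. A minimal sketch of one way it might forward the incoming Flask request to an upstream service using requests; the upstream address and header filtering are assumptions.

import requests
from flask import Response

UPSTREAM = 'http://localhost:9000'  # hypothetical upstream host

def proxy_request(request, path):
    """Forward the incoming Flask request upstream and wrap the reply."""
    upstream = requests.request(
        method=request.method,
        url='{}/{}'.format(UPSTREAM, path),
        headers={k: v for k, v in request.headers if k.lower() != 'host'},
        data=request.get_data(),
        params=request.args,
        timeout=10,
    )
    return Response(upstream.content,
                    status=upstream.status_code,
                    content_type=upstream.headers.get('Content-Type'))
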
Example #7
    def get_info(self, start_url):
        data_list, station_list, attr_list = [], [], []  # defaults if parsing fails
        try:
            data = proxy.proxy_request(start_url)
            soup = BeautifulSoup(data, 'lxml')
            total_data = soup.find_all(type="hidden")
            temp_num = 1
            for hidden in total_data:  # renamed to avoid shadowing the page data
                temp_data = hidden.get('value')
                if temp_num == 2:
                    data_list = temp_data.split('!!')
                elif temp_num == 3:
                    station_list = temp_data.split('!!')
                elif temp_num == 4:
                    attr_list = temp_data.split('!!')
                elif temp_num >= 5:
                    break
                temp_num += 1

        except Exception as e:
            print("exception:" + str(e))
            sleep(1)

        return data_list, station_list, attr_list