def getcompany_url(self, job_url):
    logger = Logger(logname='error.log', logger="58com").getlog()
    company_list = {}
    try:
        data = proxy.proxy_request(job_url)
        soup = BeautifulSoup(data, 'html.parser')
        tags = soup.find_all('div', class_="comp_name")
        for tag in tags:
            company_name = tag.a.get_text()
            company_url = tag.a['href']
            # handle relative vs. absolute paths
            if company_url.startswith('http'):
                company_list[company_name] = company_url
            else:
                company_list[company_name] = "http://qy.58.com" + company_url
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # log the failed request
        logger.error("get company url failed, url: %s", job_url)
    except Exception as e:
        print("exception:" + str(e))
        sleep(1)
    return company_list
def getcompany_info(self, name, url):
    logger = Logger(logname='error.log', logger="58com").getlog()
    ds = DataStore()
    try:
        company_text = []
        html = proxy.proxy_request(url)
        soup = BeautifulSoup(html, 'html.parser')
        tag = soup.find(class_="basicMsg")
        ul = tag.find("ul")
        li_tags = ul.find_all(name='li')
        strinfo = re.compile(r'\s')
        for li in li_tags:
            txt = strinfo.sub('', li.get_text())
            company_text.append(txt.split(':')[1])
        # fetch business-registration info
        # gongshang_info = tianyan.tianyan_search(name)
        # gongshang_info = ','.join(gongshang_info)
        ds.insert_database(name, company_text)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # log the company and url that failed to parse
        logger.error("Get company info fail, company name: %s, url: %s", name, url)
    except Exception as e:
        print("exception:" + str(e))
        sleep(1)
def get_url(self, start_url):
    logger = Logger(logname='error.log', logger="58com").getlog()
    url_dict = {}
    try:
        data = proxy.proxy_request(start_url)
        soup = BeautifulSoup(data, 'html.parser')
        tags = soup.find(id="sidebar-right")
        tags_li = tags.find_all('li')
        for tag in tags_li:
            a_tags = tag.find_all('a')
            job_class = a_tags[0].string
            job_urlname = a_tags[0].attrs.get('href')
            # handle relative vs. absolute paths
            if job_urlname.startswith('/'):
                job_urlname = "http://bj.58.com" + job_urlname + "pn"
                url_dict[job_class] = job_urlname
            else:
                url_dict[job_class] = job_urlname
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        logger.error("Can not open start url: %s", start_url)
    except Exception as e:
        print("exception:" + str(e))
        sleep(1)
    return url_dict
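# Sketch (assumption): proxy.proxy_request is not defined in this file. Since the
# three methods above pass its return value straight to BeautifulSoup and catch
# urllib.error.URLError, it is presumably a small urllib-based fetch helper living
# in a `proxy` module, roughly along these lines (illustrative only):
import urllib.request

def proxy_request(url, timeout=10):
    # fetch the page with a browser-like User-Agent and return the decoded body
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode('utf-8', errors='ignore')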
def get(search_element, max_product_count):
    # request the web page and parse it
    url = 'https://www.amazon.in/s?k=' + search_element
    response = proxy_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    with open('web/amazon-temp.html', 'w') as file:
        file.write(soup.prettify())

    # check for an empty result set
    products = soup.find_all('li', class_='s-result-item celwidget', limit=1)
    if 0 == len(products):
        print('empty result set from amazon.in')
        with open('json/amazon_in.json', 'w') as file:
            json.dump([], file)
        exit()

    # create threads to search for each attribute in parallel
    name_link_thread = Thread(target=get_name_and_link, args=(soup, max_product_count))
    image_thread = Thread(target=get_image, args=(soup, max_product_count))
    price_thread = Thread(target=get_price, args=(soup, max_product_count))

    # search items
    name_link_thread.start()
    image_thread.start()
    price_thread.start()

    # wait for the threads to end
    name_link_thread.join()
    image_thread.join()
    price_thread.join()

    # create array of dictionaries containing all product details
    product_arr = []
    for i in range(len(product_names)):
        product_arr.append({
            'name': product_names[i],
            'price': product_prices[i],
            'image': product_images[i],
            'link': product_links[i]
        })

    # store this object into a JSON file
    with open('json/amazon_in.json', 'w') as file:
        json.dump(product_arr, file)
    print('amazon: done')
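# Hypothetical sketch (assumption): get_name_and_link, get_image and get_price are
# defined elsewhere in this module and fill the module-level lists that the loop
# above indexes (product_names, product_links, ...). A worker of that shape might
# look roughly like this; the CSS selector is illustrative only, not the real one.
product_names, product_links = [], []

def get_name_and_link(soup, max_product_count):
    # collect up to max_product_count (title, href) pairs into the shared lists
    for anchor in soup.select('h2 a', limit=max_product_count):
        product_names.append(anchor.get_text(strip=True))
        product_links.append(anchor.get('href'))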
def get(search_element, max_product_count):
    # request the web page and parse it
    url = 'https://www.snapdeal.com/search?keyword=' + search_element
    response = proxy_request(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get all product details
    products = soup.find_all('img', class_='product-image', limit=max_product_count)

    # check for an empty result set
    if 0 == len(products):
        print('empty result set from snapdeal.com')
        with open('json/snapdeal.json', 'w') as file:
            json.dump([], file)
        exit()

    # create threads to search for each attribute in parallel
    name_image_thread = Thread(target=get_name_and_image, args=(products, ))
    link_thread = Thread(target=get_link, args=(soup, max_product_count))
    price_thread = Thread(target=get_price, args=(soup, max_product_count))

    # search items
    name_image_thread.start()
    link_thread.start()
    price_thread.start()

    # wait for the threads to end
    name_image_thread.join()
    link_thread.join()
    price_thread.join()

    # create array of dictionaries containing all product details
    product_arr = []
    for i in range(len(product_names)):
        product_arr.append({
            'name': product_names[i],
            'price': product_price[i],
            'image': product_images[i],
            'link': product_link[i]
        })

    # store this object into a local JSON file
    with open('json/snapdeal.json', 'w') as file:
        json.dump(product_arr, file)
    print('snapdeal: done')
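# Sketch (assumption): the Amazon and Snapdeal get() functions above import a
# proxy_request helper from elsewhere in the project and read `.text` from its
# return value, so it is presumably a thin wrapper around requests.get, roughly:
import requests

def proxy_request(url, timeout=10):
    # fetch the page with a browser-like User-Agent; callers use response.text
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)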
def catch_all(path):
    '''
    All requests are caught by this route, unless explicitly caught by other
    more specific patterns.
    http://flask.pocoo.org/docs/0.12/design/#the-routing-system
    '''
    def fallback():
        return render_template('generic.html',
                               context={'heading': "lil blog media uploader",
                                        'message': "a static site helper"})

    try:
        proxied_response = proxy_request(request, path)
        if proxied_response:
            return proxied_response
        else:
            app.logger.warning("No response returned by proxied endpoint.")
            return fallback()
    except NameError:
        app.logger.warning("No proxy function available.")
        return fallback()
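# Registration sketch (assumption): the route decorators for catch_all are not
# shown above. The docstring links to Flask's routing docs, whose standard
# catch-all pattern looks like the minimal, self-contained example below; the
# names demo_app / demo_catch_all are hypothetical.
from flask import Flask

demo_app = Flask(__name__)

@demo_app.route('/', defaults={'path': ''})
@demo_app.route('/<path:path>')
def demo_catch_all(path):
    # every path not matched by a more specific rule lands here
    return 'caught: /' + path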
def get_info(self, start_url):
    # initialize the results so the return statement is safe even if the request fails
    data_list, station_list, attr_list = [], [], []
    try:
        data = proxy.proxy_request(start_url)
        soup = BeautifulSoup(data, 'lxml')
        total_data = soup.find_all(type="hidden")
        temp_num = 1
        for data in total_data:
            temp_data = data.get('value')
            if temp_num == 2:
                data_list = temp_data.split('!!')
            if temp_num == 3:
                station_list = temp_data.split('!!')
            if temp_num == 4:
                attr_list = temp_data.split('!!')
            if temp_num >= 5:
                break
            else:
                temp_num += 1
    except Exception as e:
        print("exception:" + str(e))
        sleep(1)
    return data_list, station_list, attr_list
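# Standalone sketch (assumption): the positional checks above rely on the page
# carrying its data in the 2nd, 3rd and 4th <input type="hidden"> values, each a
# '!!'-joined list. A tiny static example of that parsing, with made-up values:
from bs4 import BeautifulSoup

_sample = '''
<input type="hidden" value="ignored">
<input type="hidden" value="a!!b">
<input type="hidden" value="s1!!s2">
<input type="hidden" value="x!!y">
'''
_hidden = BeautifulSoup(_sample, 'html.parser').find_all(type="hidden")
print([tag.get('value').split('!!') for tag in _hidden[1:4]])
# -> [['a', 'b'], ['s1', 's2'], ['x', 'y']]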