def offer_to_sell():
    """Crawl sell-offer listings per category and save one row per offer.

    Iterates the tb_category_info rows selected with ``pid__ne='0'``. For
    each row, listing pages are fetched starting at
    ``SELL_BASE_URL + cid + '/' + url`` and followed through the first
    ``next_pattern`` match until a page yields no match. Every URL matched
    by ``sell_url_pattern`` is fetched, parsed and saved as a
    ``tb_sell_info`` record.

    Any exception raised while handling a single offer URL is caught: the
    traceback is printed and the URL is appended to ``sell_error.txt``.
    Failures while fetching a listing page are not caught here.
    """
    categories = tb_category_info.select().where(pid__ne='0').execute()
    for category in categories:
        page_url = SELL_BASE_URL + category.cid + '/' + category.url
        while page_url:
            print(page_url)
            listing_html = session.get(page_url).text

            # Pick the next listing page up front; a falsy value ends the
            # loop once the offers of the current page have been handled.
            next_links = next_pattern.findall(listing_html)
            page_url = next_links[0] if next_links else None

            for offer_url in sell_url_pattern.findall(listing_html):
                try:
                    print(offer_url)
                    sell_id, product_name = sell_id_pattern.findall(offer_url)[0]
                    offer_html = session.get(offer_url).text
                    company_id = company_id_pattern.findall(offer_html)[0]
                    raw_description = sell_description_pattern.findall(offer_html)[0].strip()
                    sell_info = {
                        'cid': category.cid,
                        'sell_id': sell_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': strip_tags(raw_description).strip(),
                    }
                    print(sell_info)
                    tb_sell_info(**sell_info).save()
                except Exception:
                    # Best effort: keep crawling and record the failed URL.
                    print(traceback.format_exc())
                    with open('sell_error.txt', 'a') as error_log:
                        error_log.write(offer_url + '\n')
                    print('出错')
def products_info():
    """Crawl product listings per category and save one row per product.

    Iterates the tb_category_info rows selected with ``pid__ne='0'``. For
    each row, listing pages are fetched starting at
    ``PRODUCTS_BASE_URL + cid + '/' + url`` and followed through the first
    ``next_pattern`` match until a page yields no match. Every URL matched
    by ``product_url_pattern`` is fetched, parsed and saved as a
    ``tb_product_info`` record.

    Any exception raised while handling a single product URL is caught:
    the traceback is printed and the URL is appended to
    ``product_error.txt``. Failures while fetching a listing page are not
    caught here.
    """
    categories = tb_category_info.select().where(pid__ne='0').execute()
    for category in categories:
        page_url = PRODUCTS_BASE_URL + category.cid + '/' + category.url
        while page_url:
            print(page_url)
            listing_html = session.get(page_url).text

            # Pick the next listing page up front; a falsy value ends the
            # loop once the products of the current page have been handled.
            next_links = next_pattern.findall(listing_html)
            page_url = next_links[0] if next_links else None

            for item_url in product_url_pattern.findall(listing_html):
                try:
                    print(item_url)
                    product_id, product_name = product_id_pattern.findall(item_url)[0]
                    item_html = session.get(item_url).text
                    company_id = company_id_pattern.findall(item_html)[0]
                    raw_description = product_description_pattern.findall(item_html)[0].strip()
                    product_info = {
                        'cid': category.cid,
                        'product_id': product_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': clean_html(raw_description),
                    }
                    print(product_info)
                    tb_product_info(**product_info).save()
                except Exception:
                    # Best effort: keep crawling and record the failed URL.
                    print(traceback.format_exc())
                    with open('product_error.txt', 'a') as error_log:
                        error_log.write(item_url + '\n')
                    print('出错')
def company_crawler():
    """Crawl company listings per category and save one row per company.

    Iterates the tb_category_info rows selected with ``pid__ne='0'``. For
    each row, listing pages are fetched starting at
    ``COMPANY_BASE_URL + cid + '/' + url`` and followed through the first
    ``next_pattern`` match until a page yields no match. Every URL matched
    by ``company_url_pattern`` is fetched and parsed with
    ``company_info_pattern``; the first match supplies name, description,
    a profile block and nine contact fields, which are saved as a
    ``tb_company_info`` record.

    Any exception raised while handling a single company URL is caught and
    its traceback printed. The URL is appended to ``error.txt`` only when
    the exception message contains 'list index out of range'.
    """
    # (keyword looked for in a profile label, record field it fills).
    # Order matters: the first keyword found in the label wins.
    profile_fields = (
        ('Business', 'business_type'),
        ('Industry', 'industry_focus'),
        ('Services', 'services_products'),
        ('Year', 'year_established'),
        ('Employees', 'employees'),
        ('Annual', 'annual_revenue'),
        ('Geographic', 'geographic_markets'),
        ('Certificates', 'certificates'),
        ('Brand', 'brand_name'),
    )
    # Fields taken from consecutive capture groups, starting at index 3.
    contact_fields = (
        'contact_person',
        'company_address',
        'city',
        'province',
        'country',
        'zip',
        'phone_number',
        'fax_number',
        'homepage',
    )

    categories = tb_category_info.select().where(pid__ne='0').execute()
    for category in categories:
        page_url = COMPANY_BASE_URL + category.cid + '/' + category.url
        while page_url:
            print(page_url)
            listing_html = session.get(page_url).text

            # Pick the next listing page up front; a falsy value ends the
            # loop once the companies of the current page have been handled.
            next_links = next_pattern.findall(listing_html)
            page_url = next_links[0] if next_links else None

            for company_url in company_url_pattern.findall(listing_html):
                try:
                    company_info = {}
                    print(company_url)
                    company_id = company_id_pattern.findall(company_url)[0]
                    company_info['cid'] = category.cid
                    company_info['company_id'] = company_id

                    page = session.get(company_url)
                    groups = company_info_pattern.findall(page.text)
                    # An empty match list raises IndexError here, which the
                    # handler below records in error.txt.
                    record = groups[0]
                    company_info['company_name'] = record[0]
                    company_info['company_description'] = record[1].strip()

                    for entry in company_profile_pattern.findall(record[2]):
                        label = entry[0]
                        for keyword, field in profile_fields:
                            if keyword in label:
                                company_info[field] = entry[1]
                                break
                        else:
                            # Unrecognised profile label: show it for review.
                            print(entry)

                    for position, field in enumerate(contact_fields, 3):
                        company_info[field] = record[position]

                    print(company_info)
                    tb_company_info(**company_info).save()
                except Exception as e:
                    print(traceback.format_exc())
                    if 'list index out of range' in e.message:
                        with open('error.txt', 'a') as error_log:
                            error_log.write(company_url + '\n')
                    # NOTE(review): the source whitespace was collapsed; this
                    # print is assumed to run for every failure - confirm.
                    print('出错')