def get_links():
    # Collect brand links from the catalogue index page.
    brands = hu.get_brands_link(
        hu.get_html('http://www.profit-msk.ru/goods/zip/index.html'))
    for brand in brands:
        # For each brand, collect its model links and append them to the 'models' file.
        models = hu.get_models_link(hu.get_html(brand))
        df = pandas.DataFrame(models)
        df.to_csv('models', index=False, mode='a', header=False, sep=";")
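# Hedged usage sketch: the source does not show the module-level imports; `hu` is
# assumed to be a scraping helper module and `pandas` to be imported as-is.
if __name__ == '__main__':
    get_links()  # one run appends every brand's model links to the local 'models' file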
def parser_site():
    # data = pandas.read_csv('models', sep=';')
    data = fu.load_file('models')
    # models = data.values.tolist()
    for model in data:
        if model:
            # Pick a proxy and User-Agent for each request.
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            soup = hu.get_html(model[1], useragent, proxy)
            if soup == 404:
                continue
            brand_name, model_name, device_spec, device_data = hu.model_parser(
                soup, model[0])
            # Model names may contain '/', which is not allowed in file names.
            model_name = re.sub('/', ' ', model_name)
            base_dir = os.path.dirname(__file__)
            base_dir = f'{base_dir}\\parse\\{brand_name}'
            if not os.path.exists(base_dir):
                os.mkdir(base_dir)
            # Save the specification and the parts list as separate CSV files.
            df = pandas.DataFrame(device_spec)
            df.to_csv(f'{base_dir}\\{model_name}_spec.csv',
                      index=False, header=False, sep=";")
            df = pandas.DataFrame(device_data)
            df.to_csv(f'{base_dir}\\{model_name}_parts.csv',
                      index=False, header=False, sep=";")
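# Hedged sketch of the `hu.get_html` helper used above: it is not shown in the source,
# but the `soup == 404` check suggests it returns 404 on a failed request and parsed
# HTML otherwise. Assuming it wraps `requests` and BeautifulSoup:
import requests
from bs4 import BeautifulSoup

def get_html(url, useragent=None, proxy=None):
    # Fetch the page with optional headers/proxies; signal failure with 404.
    response = requests.get(url, headers=useragent, proxies=proxy, timeout=10)
    if response.status_code != 200:
        return 404
    return BeautifulSoup(response.text, 'html.parser')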
def get_topn_words_from_urls(urls, topn, save_reports=False):
    htmls = [html_utils.get_html(url) for url in urls]
    # Merge the article text of all reports into one string.
    summary_article = '\n'.join([parse_report_article(html) for html in htmls])
    if save_reports:
        with open('reports.txt', 'w+') as fout:
            fout.write(summary_article)
    return cut_text_utils.get_topn_words(summary_article, topn)
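# Hedged usage example: the URLs below are placeholders, not real report pages.
if __name__ == '__main__':
    report_pages = [
        'https://example.com/report-2018.html',
        'https://example.com/report-2019.html',
    ]
    print(get_topn_words_from_urls(report_pages, topn=20, save_reports=True))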
def analyze_info(self, url):
    """
    Parse the listing data on one page.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('.key-list .item-mod').items()
    for item in items:
        address = item.find('.address').text()
        # Collapse whitespace in the address.
        index = address.find('\xa0', 2)
        address = ' '.join(address.split())
        # District
        city = ''
        if index >= 2:
            city = address[2:index]
        # Price
        price_desc = item.find('.price').text() or item.find(
            '.price-txt').text()
        house_info = {
            # City
            'city': city,
            # Name
            # 'name': item.find('.lp-name h3').text(),
            'name': item.find('.items-name').text(),
            # Floor plan
            'house_type': ' '.join(item.find('.huxing').text().split()),
            # Address
            'address': address,
            # Address link
            'address_link': item.find('.address').attr('href'),
            # Tags
            'tags': item.find('.tag-panel').text(),
            # Price
            'price': price_desc,
            'price_nu': analysis_price(price_desc),
            # Rank
            'rank': item.find('.group-mark').text(),
            # Picture
            'pic': item.find('.pic img').attr('src'),
            # Picture link
            'pic_link': item.children('.pic').attr('href'),
            'report_date': self.report_date
        }
        # Add to the list.
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB.
    self.collection.insert(house_list)
    # If there is a next page, keep crawling it.
    next_url = doc('.list-page .next-page').attr('href')
    if next_url:
        # Use the current page as the Referer of the next request.
        self.referer = url
        time.sleep(2)
        self.new_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
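# `analysis_price` is called above but not defined in this section. A hedged sketch of
# what it plausibly does (an assumption, not the author's implementation): extract the
# first numeric value from the price text, returning 0 when no digits are present.
import re

def analysis_price(price_desc):
    match = re.search(r'\d+(?:\.\d+)?', price_desc or '')
    return float(match.group()) if match else 0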
def get_report_urls(summary_url):
    html = html_utils.get_html(summary_url)
    soup = BS(html, 'html.parser')
    reports_table = soup.select('#UCAP-CONTENT table tbody')[0]
    reports = [(atag.text, atag['href'])
               for trtag in reports_table.select('tr')
               for tdtag in trtag.select('td')
               if len(tdtag.select('a')) != 0
               for atag in tdtag.select('a')]
    # Drop the scraped 2017 URL and re-add it from the constant.
    report_urls = [x for x in reports if x[0] != '2017']
    report_urls.append(('2017', REPORT2017_URL))
    # Sort by year in ascending order.
    report_urls = sorted(report_urls, key=lambda item: item[0])
    return report_urls
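# Hedged end-to-end sketch combining the helpers in this section, assuming both live in
# (or are imported into) the same module; SUMMARY_URL is a placeholder, not the real page.
if __name__ == '__main__':
    SUMMARY_URL = 'https://example.com/reports/index.html'
    report_urls = get_report_urls(SUMMARY_URL)
    # Feed every report URL (the year part of each tuple is ignored) into the word counter.
    print(get_topn_words_from_urls([url for _, url in report_urls], topn=30))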
def analyze_info(self, url):
    """
    Parse the listing data on one page.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('#houselist-mod-new .list-item').items()
    for item in items:
        detail = ' '.join(
            item.find('.details-item').text().split()).split(' ')
        if len(detail) < 3:
            continue
        all_price_desc = item.find('.price-det').text()
        unit_price_desc = item.find('.unit-price').text()
        house_info = {
            # District
            'city': detail[2].split('-')[0],
            # Name
            'name': detail[1],
            # Floor plan: keep everything up to and including '造' (the build-year suffix).
            'house_type': detail[0][0:detail[0].find('造') + 1],
            # Address
            'address': detail[2],
            # Tags
            'tags': item.find('.tags-bottom').text(),
            # Total price
            'all_price': all_price_desc,
            'all_price_nu': analysis_price(all_price_desc),
            # Unit price
            'unit-price': unit_price_desc,
            'unit-price_nu': analysis_price(unit_price_desc),
            # Picture
            'pic': item.find('.item-img img').attr('src'),
            # Listing authenticity
            'authenticity': item.find('.house-title .house-icon').text(),
            'report_date': self.report_date
        }
        # Add to the list.
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB.
    self.collection.insert(house_list)
    # If there is a next page, keep crawling it.
    next_url = doc('.multi-page .aNxt').attr('href')
    if next_url:
        # Use the current page as the Referer of the next request.
        self.referer = url
        time.sleep(2)
        self.sale_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
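# Note: `Collection.insert` was deprecated in pymongo 3.x and removed in 4.x; on a
# recent driver the batch write inside analyze_info would become:
#     if house_list:
#         self.collection.insert_many(house_list)
# (the `if` guard also avoids the error pymongo raises on an empty bulk write).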
def get_topn_words(url, topn):
    html = html_utils.get_html(url)
    article = parse_report_article(html)
    return cut_text_utils.get_topn_words(article, topn)