def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            page_count = parser.get_pagination_index_models(soup)
            model_name = model['name']
            brands = re.findall(r'^[^\s]+', model_name)
            brand_name = brands[0]
            print(str(file_index) + ': ' + model_name +
                  ', page count - ' + str(page_count))
            erc_csv = parser.parser_errors(soup, brand_name, model_name)
            file_utils.save_error_code(erc_csv, brand_name, model_name)
            if page_count > 1:
                # page 1 was parsed above, so continue from page 2
                for index in range(2, page_count + 1):
                    soup = parser.get_html(
                        model['href'] + f'&page={index}', useragent, proxy)
                    erc_csv = parser.parser_errors(soup, brand_name, model_name)
                    file_utils.save_error_code(erc_csv, brand_name, model_name)

def get_save_links():
    url = 'https://printcopy.info/?mod=erc'
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    brand_list = parser.get_brand_model_links(
        parser.get_html(url, useragent, proxy), 'brandList')
    for brand in brand_list:
        print(brand['name'])
        proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
        useragent = {'User-Agent': get_proxy.get_useregent_list()}
        # assign the parsed HTML page to the soup variable
        soup = parser.get_html(brand['href'], useragent, proxy)
        page_count = parser.get_pagination_index_models(soup)
        print(page_count)
        model_link = parser.get_brand_model_links(soup, 'modelList')
        file_utils.save_model_links_csv(model_link, brand['name'], brand['name'])
        if page_count > 1:
            # page 1 was parsed above, so continue from page 2
            for index in range(2, page_count + 1):
                model_link = parser.get_brand_model_links(
                    parser.get_html(brand['href'] + f'&page={index}',
                                    useragent, proxy), 'modelList')
                file_utils.save_model_links_csv(model_link, brand['name'],
                                                brand['name'])

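# The two functions above rebuild the proxy and User-Agent dicts with the same
# two lines before every request. A small helper like the hypothetical
# build_request_headers() below could factor that out; it is only a sketch and
# assumes the existing get_proxy.get_proxies_list() and
# get_proxy.get_useregent_list() helpers, which return a single proxy address
# and a User-Agent string respectively.
def build_request_headers():
    """Return a (useragent, proxy) pair for one randomized request (sketch)."""
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    return useragent, proxy

# Possible usage inside the loops above:
#     useragent, proxy = build_request_headers()
#     soup = parser.get_html(model['href'], useragent, proxy)
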
def get_migration_index(city, move_type):
    """
    Fetch the migration scale index for a city.
    :param city: city or province name
    :param move_type: migration type ('move_in' or 'move_out')
    :return: None
    """
    url = MIGRATION_INDEX_BACE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&type=' + move_type
    print(city)
    print(url)
    restext = http_utils.get_html(url)
    # the response is JSONP; strip the callback wrapper before parsing
    internal_flow = json.loads(restext[3:-1])['data']['list']
    key = list(internal_flow.keys())
    value = list(internal_flow.values())
    tempdict = {'date': key, 'value': value}
    df = pd.DataFrame(tempdict)
    # 'ANSI' is a Windows-only codec alias (mbcs)
    df.to_csv('./data/' + city + '_' + move_type + '_migration_index.csv',
              encoding='ANSI', index=False)
    print(city + '_' + move_type + '_migration_index.csv' + ' has been saved!')

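# The [3:-1] slice above assumes the endpoint returns JSONP of the exact form
# cb({...}): three leading characters and one trailing parenthesis. A slightly
# more defensive variant (a sketch, not part of the original module) locates
# the braces instead of relying on fixed offsets:
def strip_jsonp(restext):
    """Extract the JSON object from a JSONP response like cb({...}) (sketch)."""
    start = restext.find('{')
    end = restext.rfind('}')
    return restext[start:end + 1]

# json.loads(strip_jsonp(restext))['data']['list'] would then replace the slice.
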
def get_migration_city(startDate, endDate, city, move_type):
    """
    Fetch city-level migration data.
    :param startDate: start date (YYYYMMDD)
    :param endDate: end date (YYYYMMDD)
    :param city: city or province name
    :param move_type: migration type ('move_in' or 'move_out')
    :return: None
    """
    apiUrl = CITY_RANK_BASE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&type=' + move_type + '&date={}'
    date = datetime.strptime(startDate, "%Y%m%d")
    end = datetime.strptime(endDate, "%Y%m%d")
    print(city)
    final_list = []
    while date <= end:
        currentDate = date.strftime('%Y%m%d')
        print(currentDate)
        date = date + timedelta(days=1)
        url = apiUrl.format(currentDate)
        print(url)
        restext = http_utils.get_html(url)
        # the response is JSONP; strip the callback wrapper before parsing
        migration_data = json.loads(restext[3:-1])['data']['list']
        result = [currentDate, city]
        for data in migration_data:
            result.append(data['city_name'])
            result.append(data['value'])
        final_list.append(result)
    if move_type == 'move_out':
        city1 = 'from_city'
        city2 = 'to_city'
    else:
        city1 = 'to_city'
        city2 = 'from_city'
    with open('./data/' + city + '_' + move_type + '_migration_city.csv', 'w',
              encoding='utf-8-sig', newline='') as outFileCsv:
        writer = csv.writer(outFileCsv)
        # header row: date, source/target city, then up to 100 (city, ratio) pairs
        result = ['date', city1]
        for i in range(1, 101):
            result.append(city2 + str(i))
            result.append('ratio' + str(i))
        writer.writerow(result)
        # data rows
        writer.writerows(final_list)
    print(city + '_' + move_type + '_migration_city.csv' + ' has been saved!')

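# Hypothetical usage of the two functions above. The city name is only an
# example and assumes it is a key of CITY_NUM; both functions write their CSVs
# into ./data as a side effect, so that directory must already exist:
#
#     get_migration_index('武汉市', 'move_in')
#     get_migration_city('20200101', '20200223', '武汉市', 'move_out')
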
def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        brand_name = brand[0]
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            model_name = model['name']
            print(str(file_index) + '. ' + model_name)
            modules = parser.get_modules(soup, 'pcToc')
            for module in modules:
                module_name = module['name']
                soup = parser.get_html(module['href'], useragent, proxy)
                file_utils.save_partcode(
                    parser.get_partcodes(soup, brand_name['brand'], model_name,
                                         module_name),
                    brand_name['brand'], model_name)

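# A note on the structure assumed above (inferred from this code, not from the
# file_utils source): load_links_brand() appears to yield, per brand, a list
# whose first element carries the brand name under the 'brand' key and whose
# remaining elements are model dicts with 'name' and 'href' keys, e.g.:
#
#     [{'brand': 'Canon', ...}, {'name': 'iR2520', 'href': '...'}, ...]
#
# Note that `for model in brand` also visits that first element, so it is
# expected to expose 'name' and 'href' as well (or be filtered upstream).
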
def get_html(self, url, timeout=20):
    # pick one random proxy among those currently marked as available
    proxy = random.sample(
        [p for p in self.proxy_list if p.is_available()], 1)[0]
    start_at = time.time()
    html = get_html(url, proxy=proxy.proxy, timeout=timeout)
    elapsed_sec = time.time() - start_at
    # record the outcome and response time against the chosen proxy
    if html is None:
        proxy.log_fail(elapsed_sec)
        return None
    else:
        proxy.log_success(elapsed_sec)
        return html

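# get_html() above only relies on a small interface for the entries of
# self.proxy_list. Below is a minimal sketch of such a record; the names are
# inferred from the calls above and the real class may well differ.
class ProxyRecord:
    def __init__(self, proxy):
        self.proxy = proxy          # e.g. {'http': 'http://1.2.3.4:8080'}
        self.fail_count = 0
        self.success_count = 0

    def is_available(self):
        # simple example policy: drop a proxy after three consecutive failures
        return self.fail_count < 3

    def log_fail(self, elapsed_sec):
        self.fail_count += 1

    def log_success(self, elapsed_sec):
        self.fail_count = 0
        self.success_count += 1
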
def get_internal_flow(city):
    """
    Fetch the intra-city travel intensity data.
    :param city: must be a city (only cities have intra-city travel intensity)
    :return: None
    """
    url = INTERNAL_FLOW_BACE_URL + 'dt=city&id=' + str(
        CITY_NUM[city]) + '&date=20200223'
    print(city)
    print(url)
    restext = http_utils.get_html(url)
    # the response is JSONP; strip the callback wrapper before parsing
    internal_flow = json.loads(restext[3:-1])['data']['list']
    key = list(internal_flow.keys())
    value = list(internal_flow.values())
    tempdict = {'date': key, 'value': value}
    df = pd.DataFrame(tempdict)
    df.to_csv('./data/' + city + '_internal_flow.csv', encoding='ANSI',
              index=False)
    print(city + '_internal_flow.csv' + ' has been saved!')

def get_answer(parsed, as_text=False, debug=False):
    # If the cache directory doesn't exist, create it
    if not path.isdir(cache_dir):
        if debug:
            print("[Info] Cache directory does not exist, creating one")
        try:
            makedirs(cache_dir)
        except OSError as e:
            print("[Warning] Failed to create the cache directory, "
                  "continuing without cache")
        except Exception as e:
            print("[Error] Unknown error. Message: '{0}'".format(str(e)))

    # Check whether a cached file exists
    try:
        cache_file_path = cache_file_base.format(parsed.country.lower())
    except AttributeError as e:
        err_msg = "[Error] No country provided, impossible to find an answer."
        raise ChatbotException(e, err_msg, parsed.question)

    cache_need_update = True
    if path.isfile(cache_file_path):
        # Check the cached file's timestamp
        cache_timestamp = datetime.fromtimestamp(
            path.getmtime(cache_file_path))
        diff = datetime.now() - cache_timestamp
        # If the file is more than one day old, update it; otherwise just load it
        if diff.days > 0:
            cache_need_update = True
        else:
            cache_need_update = False
            if as_text:
                infobox = _get_cached_webpage(cache_file_path)
            else:
                html = _get_cached_webpage(cache_file_path + "-html")
                infobox = BeautifulSoup(html, "html.parser")
            if debug:
                print("[Info] Page found in cache")

    # No cache, or the cache needs an update: download the page
    if cache_need_update:
        if debug:
            print("[Info] Updating cache for " + parsed.country)
        # Create url for indexmundi
        country = url_encode(parsed.country)
        url = index_mundi_base_url + country
        html = get_html(url)
        soup = BeautifulSoup(html, "html.parser")
        # Find infobox table
        infobox = soup.findAll("table", attrs={"class": "infobox"})
        try:
            infobox = infobox[0]
        except IndexError as e:
            # NOTE: the Kiribati page does not exist
            # NOTE: Vietnã must be spelled "vietname"
            err_msg = "[Error] Table not found"
            raise ChatbotException(e, err_msg, parsed.question)
        # Pre-process infobox text
        # IMPORTANT: DO NOT REMOVE THE K FROM THE NORMALIZATION!!! It looks
        # like a good idea at the time, but it will break everything later!
        if as_text:
            if debug:
                print("[Info] Saving the page as plain text")
            infobox = unicodedata.normalize("NFKC", infobox.text)
            infobox = separate_words(infobox)
            # infobox = re.sub(r"\n", r" ", infobox.lower())
            _cache_webpage(infobox, cache_file_path)
        else:
            if debug:
                print("[Info] Saving the page as html")
            infobox_text = infobox.decode_contents()
            _cache_webpage(infobox_text, cache_file_path + "-html")

    # If we are working with pure text (cleared html, other source of info,
    # text cache, etc.)
    if as_text:
        # Generate a lowercase infobox for use in comparisons
        # TODO: use spacy idx
        infobox_ = re.sub(r"[-–−]", r"-", infobox.lower())
        infobox_model = pt_model(infobox)
        canon_infobox_model = pt_model(infobox)
        unstoppable_infobox = " ".join(
            [word.text for word in infobox_model if not word.is_stop])
        canon_unstoppable_infobox = " ".join(
            [word.text for word in canon_infobox_model if not word.is_stop])
        # print(unstoppable_infobox, "\n\n", canon_unstoppable_infobox)
        ans = None
        # Try searching for the answer using the question's core first; if no
        # answer is found, search using the topic.
        # IMPORTANT: this isn't working and I don't know why, help
        # if parsed.core in infobox:
        #     _, start, end = find_between(infobox_, parsed.core.lower(), " - ")
        #     # Get everything between - blahblah - and assume it is the
        #     # correct answer
        #     # TODO: clean answer
        #     ans = infobox[start:end]

        # No answer found with the core, search with the topic
        if not ans and parsed.topic:
            perm = permutations(parsed.topic.lower().split(" "))
            for p in perm:
                query = " ".join(p)
                # print(repr(query))
                _, start, end = find_between(canon_unstoppable_infobox.lower(),
                                             query, " - ")
                ans = unstoppable_infobox[start:end]
                if ans:
                    break
        if not ans:
            return 'Topic not Found'
        ans = _process_answer(re.sub(r"[-–−]", r"-", ans))
        return ans.strip()
    # If we are working with raw html (structured data)
    else:
        old_cell = None
        topic_found = False
        for child in infobox.children:
            cells = child.findAll("td")
            for cell in cells:
                if debug:
                    print()
                    print(cell.prettify())
                    print()
                # Topic found in the previous cell, try to find the answer here
                if topic_found:
                    ans = cell.text
                    ans = _process_answer(re.sub(r"[-–−]", r"-", ans))
                    return ans
                topic_found = False
                if parsed.topic.lower() in cell.text.lower():
                    topic_found = True
                    old_cell = cell
                    print(cell.prettify())
                    print("[Info] Topic found in this cell, the answer may be "
                          "in the next one.")
                if debug:
                    print("Topic found? " + str(topic_found))
                    input("Press anything to continue...")
        return 'Topic not Found'

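# find_between() is used above but defined elsewhere in the project. Judging by
# the call sites, it returns a (match, start, end) triple for the text located
# between a query string and a delimiter. The helper below is only a rough
# sketch of a compatible signature, an assumption rather than the project's
# actual implementation:
def find_between(text, query, delimiter):
    """Return (match, start, end) for the span after `query` up to `delimiter`."""
    qpos = text.find(query)
    if qpos == -1:
        return "", 0, 0
    start = qpos + len(query)
    end = text.find(delimiter, start)
    if end == -1:
        end = len(text)
    return text[start:end], start, end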