def main():
    keywords = _load_data()
    for i, keyword in tqdm(enumerate(keywords[4:])):
        print(i, keyword)
        filename_num = str(i).zfill(3)  # zero-pad the index, e.g. 7 -> "007"
        filename = f"{filename_num}{keyword}.xlsx"
        result = search_keyword(keyword)
        to_xlsx(filename, result)
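# NOTE: to_xlsx / load_xlsx come from umihico_commons.xlsx_wrapper, whose source is
# not shown in this section. A minimal sketch of what they plausibly do, assuming an
# openpyxl backend (an assumption, not the wrapper's actual implementation):
from openpyxl import Workbook, load_workbook

def to_xlsx_sketch(filename, rows):
    # write an iterable of row sequences into a single-sheet .xlsx file
    wb = Workbook()
    ws = wb.active
    for row in rows:
        ws.append(list(row))
    wb.save(filename)

def load_xlsx_sketch(filename):
    # read every row of the first sheet back as tuples of cell values
    wb = load_workbook(filename)
    return [tuple(row) for row in wb.active.iter_rows(values_only=True)]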
def main():
    global names
    list_ = names.split(sep='\n')
    # 営業所 / 事業所 / 拠点 = sales office / business office / branch; 一覧 = list
    subword = f" AND ({' OR '.join(['営業所', '事業所', '拠点'])}) AND 一覧"
    search_words = [[kw + subword, ] for kw in list_]  # one query per row
    from umihico_commons.xlsx_wrapper import to_xlsx
    to_xlsx("search_words.xlsx", search_words)
    from pprint import pprint
    pprint(search_words)
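# Example of the generated query for a hypothetical company name "ExampleCorp":
#   ExampleCorp AND (営業所 OR 事業所 OR 拠点) AND 一覧
# i.e. "<name> AND (sales office OR business office OR branch) AND list".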
def merge_xlsxs():
    # header: company name, top URL, source URL, surrounding text, address, TEL
    merged_rows = [
        ["企業名", "トップurl", "情報取得url", "周辺テキスト", "住所", "TEL"],
    ]
    for i in tqdm(range(0, 12)):
        filename = f"eigyousho_final{i}.xlsx"
        merged_rows.extend(load_xlsx(filename))
    merged_rows = [(title, shorten_url(top_url), shorten_url(page_url), text, ad, tel)
                   for title, top_url, page_url, text, ad, tel in merged_rows]
    # split the output so each file stays at an Excel-friendly row count
    for i, chunk_rows in enumerate(chunks(merged_rows, 30000)):
        to_xlsx(f"final{i}.xlsx", chunk_rows)
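# NOTE: chunks() is referenced above but not defined in this section. A minimal
# sketch of the usual slicing generator it presumably is:
def chunks_sketch(rows, size):
    # yield consecutive slices of at most `size` items
    for i in range(0, len(rows), size):
        yield rows[i:i + size]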
def get_date_all_htmls():
    # ospath is assumed to be `from os import path as ospath`
    path_list = gen_path_list()
    output_rows = []
    for file_path in tqdm(path_list):
        try:
            name, date_, sec_code = get_metadata(file_path)
        except Exception:
            print(file_path)  # show which file failed before re-raising
            raise
        row = (ospath.dirname(file_path), name, date_, sec_code)
        output_rows.append(row)
    to_xlsx("009_metadata.xlsx", output_rows)
def main(command_int=-1):
    url_dict = get_url_dict()
    args_rows = [(title, url) for title, url in url_dict.items()]
    if command_int == -1:
        raise Exception(f"command_int:{command_int}")  # a real slice index is required
    # each invocation handles the 500-item slice selected by command_int
    l_slice_index = int(command_int) * 500
    r_slice_index = l_slice_index + 500
    args_rows = args_rows[l_slice_index:r_slice_index]
    filename = f"eigyousho_final{command_int}.xlsx"
    rows_list = map_multithreading(scrap_eigyosho, args_rows=args_rows)
    excel_rows = []
    for rows in rows_list:
        excel_rows.extend(rows)
    to_xlsx(filename, excel_rows)
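# NOTE: map_multithreading() is not defined in this section. A minimal sketch of a
# thread-pool mapper matching the call above (name, signature, and worker count are
# assumptions, not the project's actual implementation):
from concurrent.futures import ThreadPoolExecutor

def map_multithreading_sketch(func, args_rows, max_workers=10):
    # run func(*args) for each argument tuple; results keep the input order
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(lambda args: func(*args), args_rows))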
def main():
    # paths = gen_path_list(filename="007_merged_html_paths.csv")
    paths = [x[0] for x in load_xlsx("009_metadata_only_old.xlsx")]  # loaded but unused below
    _text_rows = load_xlsx("003_images_text_dict.xlsx")
    text_dict = {image_path: text for image_path, text in _text_rows}
    _title_rows = load_xlsx("008_image_titles.xlsx")
    # normalize path separators so the two spreadsheets' keys match on Windows
    title_dict = {image_path.replace('/', os.sep): title
                  for html_path, image_path, title in _title_rows}
    titletext_merged_rows = []
    for image_path, text in tqdm(text_dict.items()):
        title = title_dict[image_path]
        text = text.replace("__empty__", '')  # drop the placeholder used for empty text
        row = (image_path, title, text)
        titletext_merged_rows.append(row)
    to_xlsx("011_titletext_merged_rows.xlsx", titletext_merged_rows)
def save_scrapped_proxy():
    from umihico_commons.xlsx_wrapper import to_xlsx
    anonymous_proxy_list = []
    # collect proxies from both sources; a failure in one should not block the other
    try:
        anonymous_proxy_list.extend(get_cybersyndrome())
    except Exception:
        print_exc()
    try:
        anonymous_proxy_list.extend(get_freeproxylists())
    except Exception:
        print_exc()
    rows = [[p, ] for p in anonymous_proxy_list]  # one proxy per row
    to_xlsx("proxy.xlsx", rows)
def gen_raw_final():
    dicts = load_data()
    owner_title_base_dict = merge_same_id_same_title(dicts)
    rows = []
    field_names = set()
    for owner_name, productName_base_dict in owner_title_base_dict.items():
        for productName, same_productName_item_list in productName_base_dict.items():
            # skip auctions that started at 1 yen ("開始時の価格" = starting price)
            starting_prices = [d_["開始時の価格"] for d_ in same_productName_item_list]
            if any(text == "1円" for text in starting_prices):
                print("1円", "hit")
                continue
            common_row_keys = ["出品者", "productName"]  # 出品者 = seller
            common_row = {key: same_productName_item_list[0][key]
                          for key in common_row_keys}
            # skip battery listings ("バッテリー" = battery)
            if "バッテリー" in common_row['productName']:
                print("バッテリー", "hit")
                continue
            common_row["url"] = ("https://page.auctions.yahoo.co.jp/jp/auction/"
                                 + same_productName_item_list[0]['productID'])
            common_row["image_url"] = get_image_url(
                same_productName_item_list[0]['productID'])
            # collect per-item values, then join them into comma-separated strings
            joined_row_keys = ["sold_count", "price", "productID"]
            joined_row = {key: [] for key in joined_row_keys}
            for dict_ in same_productName_item_list:
                for key in joined_row_keys:
                    joined_row[key].append(dict_[key])
            common_row['sold_count_sum'] = str(
                sum(int(x) for x in joined_row['sold_count']))
            joined_row = {k: ','.join(v) for k, v in joined_row.items()}
            row = {**common_row, **joined_row}
            rows.append(row)
            field_names.update(row.keys())
            break  # keep only one title per owner in the final list
    field_names = list(field_names)  # freeze column order; a bare set has no stable order as a header row
    rows = [[r.get(fn, "") for fn in field_names] for r in rows]
    rows.insert(0, field_names)
    to_xlsx("raw_final.xlsx", rows)
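# Illustration of the joining step on hypothetical data: two listings of the same
# product with sold_count "2"/"3", price "500"/"600", productID "a1"/"b2" yield
#   joined_row = {"sold_count": "2,3", "price": "500,600", "productID": "a1,b2"}
#   common_row["sold_count_sum"] = "5"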
def get_bracket_all_htmls():
    path_list = gen_path_list(filename="007_merged_html_paths.csv")
    output_rows = []
    error_logs = []
    for i, file_path in tqdm(enumerate(path_list)):
        try:
            title_dict = get_bracket(file_path)
        except Exception:
            error_logs.append(file_path)
            error_logs.append(format_exc())
            continue  # without this, the previous file's title_dict would be reused
        for image_path, title in title_dict.items():
            output_rows.append((file_path, image_path, title))
        if i % 10000 == 0:
            to_xlsx("008_image_titles.xlsx", output_rows)  # periodic checkpoint
    to_xlsx("008_image_titles.xlsx", output_rows)
    for error_log in error_logs:
        print(error_log)
def gen_raw_final():
    paths = gen_path_list(filename="007_merged_html_paths.csv")
    # meta_rows = load_xlsx("009_metadata.xlsx")
    text_rows = load_xlsx("003_images_text_dict.xlsx")
    title_rows = load_xlsx("008_image_titles.xlsx")
    text_dict = {image_path: text for image_path, text in text_rows}
    output_rows = []
    for path in tqdm(paths):
        lxmlroot = tolxml(path)
        name, date_, sec_code = get_metadata(lxmlroot)
        dir_path, filename = os.path.split(path)
        # pick the image titles whose source html lives in this directory,
        # normalizing separators so the keys match text_dict on Windows
        title_dict = {image_path.replace('/', os.sep): title
                      for html_path, image_path, title in title_rows
                      if dir_path in html_path}
        for image_path, title in title_dict.items():
            text = text_dict[image_path].replace("__empty__", "")
            output_row = (dir_path, name, date_, image_path, title, text)
            output_rows.append(output_row)
    to_xlsx("010_raw_final.xlsx", output_rows)
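# NOTE: tolxml() is not defined in this section; it evidently parses a saved HTML
# file into an lxml root node. A minimal sketch assuming lxml.html and UTF-8 files
# (both assumptions):
from lxml import html

def tolxml_sketch(path):
    # parse the file's markup into an lxml HtmlElement tree root
    with open(path, encoding="utf-8") as f:
        return html.fromstring(f.read())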
def gen_excel_raw_data():
    filenames = ["wiki.txt", "yahoo_finace.txt"]
    srcs = ["wikipedia", "yahoo_finace"]  # "yahoo_finace" [sic] matches the existing file names
    for filename, src in zip(filenames, srcs):
        fieldnames = []
        fieldnames_counter = []
        rows = load_from_txt(filename)
        for dict_ in rows:
            fieldnames_counter.extend(dict_.keys())
            for key in dict_.keys():
                if key not in fieldnames:
                    fieldnames.append(key)
        counter_dict = Counter(fieldnames_counter)
        # keep only columns that occur in more than 1000 rows
        fieldnames = [name for name in fieldnames if counter_dict[name] > 1000]
        result = [fieldnames]
        for dict_ in rows:
            if dict_:
                result_row = [dict_.get(key, "") for key in fieldnames]
                result.append(result_row)
        to_xlsx(src + ".xlsx", result)
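# The frequency filter above keeps only columns seen often enough across the scraped
# rows. Toy illustration with hypothetical keys and a threshold of 1:
#   Counter(["name", "name", "price"])["name"] == 2 and ["price"] == 1,
# so only "name" would survive a `> 1` cut; the real code uses `> 1000`.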
def test_get_child_category_urls():
    parent_cat_url = ("https://auctions.yahoo.co.jp/closedsearch/closedsearch"
                      "?ei=UTF-8&p=&auccat=26318&slider=0")
    cat_url_list = get_child_cat_urls(parent_cat_url)
    to_xlsx("cat_url_list.xlsx", cat_url_list)
def _save_current_text_dict(text_dict):
    data = [(path, text) for path, text in text_dict.items()]
    to_xlsx("003_images_text_dict.xlsx", data)
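# Hypothetical usage (the dict presumably maps image paths to extracted text, per
# the loaders of "003_images_text_dict.xlsx" above):
#   _save_current_text_dict({"imgs/0001.png": "some extracted text"})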