Example #1
def main():
    keywords = _load_data()
    # Resume from the fifth keyword; enumerate() restarts at 0, so the
    # three-digit filename prefix also restarts at 000.
    for i, keyword in tqdm(enumerate(keywords[4:])):
        print(i, keyword)
        filename_num = str(i).zfill(3)
        filename = f"{filename_num}{keyword}.xlsx"
        result = search_keyword(keyword)
        to_xlsx(filename, result)
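
All of these examples write through to_xlsx (and read through load_xlsx) from umihico_commons.xlsx_wrapper, which is never shown. A minimal sketch of what the call sites imply, assuming thin openpyxl wrappers (the real signatures may differ):

from openpyxl import Workbook, load_workbook

def to_xlsx(filename, rows):
    # Assumed behavior: write an iterable of row iterables to one sheet.
    wb = Workbook()
    ws = wb.active
    for row in rows:
        ws.append(list(row))
    wb.save(filename)

def load_xlsx(filename):
    # Assumed behavior: read every row back as a tuple of cell values.
    wb = load_workbook(filename, read_only=True)
    ws = wb.active
    return [tuple(cell.value for cell in row) for row in ws.rows]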
Example #2
def main():
    from pprint import pprint
    from umihico_commons.xlsx_wrapper import to_xlsx
    global names  # names is a module-level, newline-separated keyword block
    keywords = names.split(sep='\n')
    # 営業所 / 事業所 / 拠点 = branch office / office / site; 一覧 = list.
    subword = f" AND ({' OR '.join(['営業所', '事業所', '拠点'])}) AND 一覧"
    search_words = [[kw + subword] for kw in keywords]
    to_xlsx("search_words.xlsx", search_words)
    pprint(search_words)
Example #3
def merge_xlsxs():
    # Header: company name, top URL, source URL, surrounding text, address, TEL.
    merged_rows = [
        ["企業名", "トップurl", "情報取得url", "周辺テキスト", "住所", "TEL"],
    ]
    for i in tqdm(range(12)):
        filename = f"eigyousho_final{i}.xlsx"
        merged_rows.extend(load_xlsx(filename))
    # Shorten the two URL columns (note: this also runs over the header row).
    merged_rows = [(title, shorten_url(top_url), shorten_url(page_url), text, ad, tel)
                   for title, top_url, page_url, text, ad, tel in merged_rows]
    # Split the merged sheet into 30,000-row output files.
    for i, chunk_rows in enumerate(chunks(merged_rows, 30000)):
        to_xlsx(f"final{i}.xlsx", chunk_rows)
Example #4
def get_date_all_htmls():
    path_list = gen_path_list()
    output_rows = []
    for file_path in tqdm(path_list):
        try:
            name, date_, sec_code = get_metadata(file_path)
        except Exception:
            # Report which file failed, then re-raise to stop the run.
            print(file_path)
            raise
        # ospath is assumed to be "from os import path as ospath".
        row = (ospath.dirname(file_path), name, date_, sec_code)
        output_rows.append(row)
    to_xlsx("009_metadata.xlsx", output_rows)
Example #5
def main(command_int=-1):
    url_dict = get_url_dict()
    args_rows = [(title, url) for title, url in url_dict.items()]
    if command_int == -1:
        # A shard index must be supplied; refuse to run on the default.
        raise Exception(f"command_int:{command_int}")
    # Each invocation processes one 500-row shard, so shards can run in
    # parallel and write separate output files.
    l_slice_index = int(command_int) * 500
    r_slice_index = l_slice_index + 500
    args_rows = args_rows[l_slice_index:r_slice_index]
    filename = f"eigyousho_final{command_int}.xlsx"
    rows_list = map_multithreading(scrap_eigyosho, args_rows=args_rows)
    # Each worker returns a list of rows; flatten them for the sheet.
    excel_rows = []
    for rows in rows_list:
        excel_rows.extend(rows)
    to_xlsx(filename, excel_rows)
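
map_multithreading() is also project-internal. Inferred from the call site (a function plus args_rows of argument tuples, returning one result per tuple), a sketch on concurrent.futures might look like this; the max_workers default is an assumption:

from concurrent.futures import ThreadPoolExecutor

def map_multithreading(func, args_rows, max_workers=8):
    # Hypothetical sketch: run func(*args) for each tuple on a thread
    # pool and return the results in input order, as executor.map does.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(lambda args: func(*args), args_rows))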
Example #6
def main():
    paths = [x[0] for x in load_xlsx("009_metadata_only_old.xlsx")]  # loaded but not used below
    _text_rows = load_xlsx("003_images_text_dict.xlsx")
    text_dict = {image_path: text for image_path, text in _text_rows}
    _title_rows = load_xlsx("008_image_titles.xlsx")
    # Normalise '/' to the local separator so both sheets share keys.
    title_dict = {image_path.replace('/', os.sep): title
                  for html_path, image_path, title in _title_rows}
    titletext_merged_rows = []
    for image_path, text in tqdm(text_dict.items()):
        title = title_dict[image_path]
        text = text.replace("__empty__", '')  # strip the "__empty__" placeholder
        row = (image_path, title, text)
        titletext_merged_rows.append(row)
    to_xlsx("011_titletext_merged_rows.xlsx", titletext_merged_rows)
Example #7
def save_scrapped_proxy():
    from traceback import print_exc
    from umihico_commons.xlsx_wrapper import to_xlsx
    anonymous_proxy_list = []
    # Collect proxies from both sources; a failure in one source
    # should not abort the other.
    try:
        anonymous_proxy_list.extend(get_cybersyndrome())
    except Exception:
        print_exc()
    try:
        anonymous_proxy_list.extend(get_freeproxylists())
    except Exception:
        print_exc()
    rows = [[p] for p in anonymous_proxy_list]  # one proxy per row
    to_xlsx("proxy.xlsx", rows)
Example #8
def gen_raw_final():
    dicts = load_data()
    owner_title_base_dict = merge_same_id_same_title(dicts)
    rows = []
    field_names = set()
    for owner_name, productName_base_dict in owner_title_base_dict.items():
        for productName, same_productName_item_list in productName_base_dict.items():
            # 開始時の価格 = starting price.
            starting_prices = [
                d_["開始時の価格"] for d_ in same_productName_item_list
            ]
            if any(text == "1円" for text in starting_prices):
                print("1円", "hit")  # skip auctions that started at 1 yen
                continue

            common_row_keys = ["出品者", "productName"]  # 出品者 = seller
            common_row = {
                key: same_productName_item_list[0][key]
                for key in common_row_keys
            }
            if "バッテリー" in common_row['productName']:
                print("バッテリー", "hit")  # skip battery (バッテリー) listings
                continue
            common_row["url"] = ("https://page.auctions.yahoo.co.jp/jp/auction/"
                                 + same_productName_item_list[0]['productID'])
            common_row["image_url"] = get_image_url(
                same_productName_item_list[0]['productID'])
            joined_row_keys = ["sold_count", "price", "productID"]
            joined_row = {key: [] for key in joined_row_keys}
            for dict_ in same_productName_item_list:
                for key in joined_row_keys:
                    joined_row[key].append(dict_[key])
            common_row['sold_count_sum'] = str(
                sum(int(x) for x in joined_row['sold_count']))
            joined_row = {k: ','.join(v) for k, v in joined_row.items()}
            row = {**common_row, **joined_row}

            rows.append(row)
            field_names.update(row.keys())
            break  # keep only one productName per owner in the final list
    # Freeze the set into a list so the header row and the column order
    # of every data row stay aligned.
    field_names = list(field_names)
    rows = [[r.get(fn, "") for fn in field_names] for r in rows]
    rows.insert(0, field_names)
    to_xlsx("raw_final.xlsx", rows)
Example #9
def get_bracket_all_htmls():
    path_list = gen_path_list(filename="007_merged_html_paths.csv")
    output_rows = []
    error_logs = []
    for i, file_path in tqdm(enumerate(path_list)):
        try:
            title_dict = get_bracket(file_path)
        except Exception:
            error_logs.append(file_path)
            error_logs.append(format_exc())
            continue  # otherwise title_dict from the previous file is reused
        for image_path, title in title_dict.items():
            row = (file_path, image_path, title)
            output_rows.append(row)
        # Checkpoint the partial output every 10,000 files.
        if i % 10000 == 0:
            to_xlsx("008_image_titles.xlsx", output_rows)
    to_xlsx("008_image_titles.xlsx", output_rows)
    for error_log in error_logs:
        print(error_log)
Example #10
def gen_raw_final():
    paths = gen_path_list(filename="007_merged_html_paths.csv")
    text_rows = load_xlsx("003_images_text_dict.xlsx")
    title_rows = load_xlsx("008_image_titles.xlsx")
    text_dict = {image_path: text for image_path, text in text_rows}
    output_rows = []
    for path in tqdm(paths):
        lxmlroot = tolxml(path)
        name, date_, sec_code = get_metadata(lxmlroot)
        dir_path, _ = os.path.split(path)  # only the directory part is used
        # Collect titles whose source HTML lives under this directory,
        # normalising '/' to os.sep so keys match text_dict.
        title_dict = {
            image_path.replace('/', os.sep): title
            for html_path, image_path, title in title_rows
            if dir_path in html_path
        }
        for image_path, title in title_dict.items():
            text = text_dict[image_path].replace("__empty__", "")
            output_row = (dir_path, name, date_, image_path, title, text)
            output_rows.append(output_row)
    to_xlsx("010_raw_final.xlsx", output_rows)
Example #11
def gen_excel_raw_data():
    # "yahoo_finace" spelling matches the existing data files.
    filenames = ["wiki.txt", "yahoo_finace.txt"]
    srcs = ["wikipedia", "yahoo_finace"]
    for filename, src in zip(filenames, srcs):
        fieldnames = []
        fieldnames_counter = []
        rows = load_from_txt(filename)
        for dict_ in rows:
            fieldnames_counter.extend(dict_.keys())
            for key in dict_.keys():
                if key not in fieldnames:
                    fieldnames.append(key)  # preserve first-seen order
        counter = Counter(fieldnames_counter)
        # Keep only columns that appear in more than 1,000 records.
        fieldnames = [name for name in fieldnames if counter[name] > 1000]
        result = [fieldnames]
        for dict_ in rows:
            if dict_:
                result_row = [dict_.get(key, "") for key in fieldnames]
                result.append(result_row)
        to_xlsx(src + ".xlsx", result)
Example #12
def test_get_child_category_urls():
    parent_cat_url = "https://auctions.yahoo.co.jp/closedsearch/closedsearch?ei=UTF-8&p=&auccat=26318&slider=0"
    cat_url_list = get_child_cat_urls(parent_cat_url)
    to_xlsx("cat_url_list.xlsx", cat_url_list)
Example #13
def _save_current_text_dict(text_dict):
    data = list(text_dict.items())  # (image_path, text) rows
    to_xlsx("003_images_text_dict.xlsx", data)