Exemplo n.º 1
0
def _get_wiki_urls():
    """Load company-page URLs from both URL files and return them as a
    de-duplicated set of absolute URLs.

    Relative links are prefixed with the Japanese Wikipedia host.

    Returns:
        set[str]: unique absolute URLs.
    """
    merged = {}
    # Later files overwrite earlier entries that share a company name.
    for filename in ["public_company_urls.txt", "all_company_urls.txt"]:
        merged.update(load_from_txt(filename))
    # Only the URLs matter here; the company-name keys are discarded.
    # A set comprehension deduplicates in one pass.
    return {
        url if url.startswith("http") else "https://ja.wikipedia.org" + url
        for url in merged.values()
    }
Exemplo n.º 2
0
def gen_excel_raw_data(min_count=1000):
    """Convert the scraped wiki / Yahoo Finance text dumps into .xlsx files.

    For each source file, collects the field names that occur in more than
    ``min_count`` rows (preserving first-seen order), then writes one
    spreadsheet per source with those fields as the header row and one row
    per non-empty record.

    Args:
        min_count: minimum number of rows a field must appear in to be kept
            as a column. Defaults to 1000, the previously hard-coded value.
    """
    filenames = ["wiki.txt", "yahoo_finace.txt"]
    srcs = ["wikipedia", "yahoo_finace"]
    for filename, src in zip(filenames, srcs):
        rows = load_from_txt(filename)
        # Count how many rows each field occurs in, while remembering the
        # order in which fields were first seen. Counter gives O(1) lookup,
        # replacing the original O(n^2) list-membership scan.
        field_counts = Counter()
        field_order = []
        for row in rows:
            for key in row:
                if key not in field_counts:
                    field_order.append(key)
                field_counts[key] += 1
        # Keep only fields frequent enough to be useful columns.
        fieldnames = [name for name in field_order
                      if field_counts[name] > min_count]
        result = [fieldnames]
        for row in rows:
            if row:  # skip empty records entirely
                result.append([row.get(key, "") for key in fieldnames])
        to_xlsx(src + ".xlsx", result)
Exemplo n.º 3
0
def load_first_dicts():
    """Load the unigram and bigram dictionaries from their text files.

    Returns:
        tuple: (contents of dict1gram.txt, contents of dict2gram.txt).
    """
    dict1gram = load_from_txt("dict1gram.txt")
    dict2gram = load_from_txt("dict2gram.txt")
    return dict1gram, dict2gram
Exemplo n.º 4
0
 def load_proxy(self):
     """Restore the persisted proxy pool from proxy.txt.

     Records the saved refresh timestamp on the instance and feeds the
     stored proxies back into the pool.
     """
     refreshed_at, proxy_list = load_from_txt("proxy.txt")
     self.last_proxy_refilled_time = refreshed_at
     self._add_proxies(proxy_list)
Exemplo n.º 5
0
 def refill_proxy(self):
     """Scrape a fresh proxy list, passing the previously saved proxies
     from proxy.txt along so they can be taken into account."""
     previous = load_from_txt("proxy.txt")
     self.scrap_new_proxy(old_proxies=previous)
Exemplo n.º 6
0
def load_stock_metadata():
    """Return the stock metadata persisted in stock_metadata.txt."""
    return load_from_txt("stock_metadata.txt")
Exemplo n.º 7
0
def load_proxy():
    """Return the proxy list persisted in proxys.txt."""
    return load_from_txt("proxys.txt")
Exemplo n.º 8
0
def get_url_dict():
    """Map article titles to their external-link URLs.

    Reads the scraped wiki records and keeps only entries that carry an
    external-link ("外部リンク") field.

    Returns:
        dict: {title: external-link value}.
    """
    rows = load_from_txt("wiki.txt")
    # `key in d` is the idiomatic (and direct) membership test; the
    # original `key in d.keys()` built an unnecessary view lookup.
    return {d["title"]: d["外部リンク"] for d in rows if "外部リンク" in d}