Example #1
# Standard-library imports assumed by this snippet; "ph" and
# "child_process_in_pool" are project-level helpers that are not shown here.
import csv
from multiprocessing import Pool

def process_csv_multiprocessing_in_pool(csv_file, txt_dir, pdf_dir):
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    with open(csv_file, encoding="utf-8") as f:

        file_rows = csv.reader(f)
        list_rows = list(file_rows)
        count = len(list_rows)
        print(count)

        params = []
        for index, row in enumerate(list_rows):
            params.append((row, txt_dir, pdf_dir, index))

        #print(params)

        # map() blocks until every tuple in params has been processed
        pool = Pool()
        pool.map(child_process_in_pool, params)
        pool.close()
        pool.join()

        print("all done...")
Example #2
# Standard-library imports assumed by this snippet; "ph" and "child_process"
# are project-level helpers that are not shown here.
import csv
from multiprocessing import Process

def process_csv_multiprocessing(csv_file, txt_dir, pdf_dir):
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    with open(csv_file, encoding="utf-8") as f:

        file_rows = csv.reader(f)
        list_rows = list(file_rows)
        count = len(list_rows)
        print(count)

        if count <= 256:
            # small input: handle all rows in the current process
            child_process(list_rows, 0, txt_dir, pdf_dir)
            return

        # multiprocessing: split the rows into 16 slices of roughly count/15
        # rows each; 16 * round(count/15) always exceeds count once count > 256,
        # so every row lands in exactly one slice
        processes = []
        start = 0
        step = round(count / 15)

        for i in range(16):
            stop = min(start + step, count)

            p_name = "process_no" + str(i)
            print(p_name)
            p = Process(name=p_name, target=child_process,
                        args=(list_rows[start:stop], start, txt_dir, pdf_dir))

            p.start()
            processes.append(p)

            start = stop

        for p in processes:
            p.join()

        print("all done...")
Example #3
# Standard-library imports assumed by this snippet; "ph" and "__save_to_txt"
# are project-level helpers that are not shown here.
import csv
from os import path

def process_csv(csv_file, txt_dir, pdf_dir):
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    with open(csv_file, encoding="utf-8") as f:

        file_rows = csv.reader(f)
        list_rows = list(file_rows)
        count = len(list_rows)
        print(count)

        # one .txt/.pdf pair per row, named by the row index
        for i, row in enumerate(list_rows):
            code, title, url = row
            file_name = str(i)
            txt_file = path.join(txt_dir, file_name + ".txt")
            pdf_file = path.join(pdf_dir, file_name + ".pdf")
            __save_to_txt(url, txt_file, pdf_file)
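
None of the three process_csv* variants is invoked anywhere in this listing. A usage sketch, assuming the empty-string arguments fall back to the ph path helpers exactly as the function bodies show:

# Usage sketch (hypothetical): empty strings fall back to ph.get_csv_file_path(),
# ph.get_txt_directory() and ph.get_pdf_directory().
if __name__ == "__main__":
    process_csv("", "", "")                              # sequential, row by row
    # process_csv_multiprocessing("", "", "")            # 16 Process workers
    # process_csv_multiprocessing_in_pool("", "", "")    # Pool.map over all rows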
Example #4
# csv is standard library; "ph", get_sse_disclosures() and get_szse_disclosures()
# are project-level helpers that are not shown here.
import csv

def save_to_csv(file_path):
    """
    Fetch SSE and SZSE disclosures and write them to a CSV file.
    If file_path is empty, ph.get_csv_file_path() supplies the default path.
    Returns the combined list of disclosures.
    """
    if file_path == "":
        file_path = ph.get_csv_file_path()

    print("csv file path is [%s]" % file_path)
    print("begin to get disclosures...")
    disclosures = get_sse_disclosures()
    print("got disclosures of sse...")
    szse = get_szse_disclosures()
    print("got disclosures of szse...")
    disclosures.extend(szse)
    with open(file_path, "w", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        for d in disclosures:
            csv_writer.writerow(d)
    print("saved to csv...")
    return disclosures
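
The rows written here feed the same CSV that process_csv in Example #3 reads, where each row is unpacked as a (code, title, url) triple. A small sketch of reading the file back under that assumption; this reader is not part of the original project.

# Sketch (hypothetical): read back the CSV written by save_to_csv and print
# each disclosure, assuming every row is a (code, title, url) triple.
import csv

def print_disclosures(file_path):
    with open(file_path, encoding="utf-8") as f:
        for code, title, url in csv.reader(f):
            print(code, title, url)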
Example #5
# csv is standard library; "ph" is a project-level path helper.
import csv

def __save_to_csv(data_list):
    file_path = ph.get_csv_file_path()
    with open(file_path, "w", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        for data in data_list:
            csv_writer.writerow(data)
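
A minimal usage sketch with made-up rows in the same (code, title, url) shape used elsewhere in the listing; the data below is purely illustrative and the output path is whatever ph.get_csv_file_path() returns.

# Hypothetical sample data; writes to the path returned by ph.get_csv_file_path().
sample_rows = [
    ("600000", "Annual report", "http://example.com/600000.pdf"),
    ("000001", "Quarterly report", "http://example.com/000001.pdf"),
]
__save_to_csv(sample_rows)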