def process_csv_multiprocessing_in_pool(csv_file, txt_dir, pdf_dir):
    """Process every row of a CSV file through a multiprocessing pool.

    Each row is handed to ``child_process_in_pool`` as a single tuple
    ``(row, txt_dir, pdf_dir, index)``, where ``index`` is the row's
    zero-based position in the CSV file.

    Args:
        csv_file: Path of the CSV file to read; ``""`` falls back to
            ``ph.get_csv_file_path()``.
        txt_dir: Directory forwarded to the workers; ``""`` falls back to
            ``ph.get_txt_directory()``.
        pdf_dir: Directory forwarded to the workers; ``""`` falls back to
            ``ph.get_pdf_directory()``.
    """
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires (matters for quoted fields with line breaks).
    with open(csv_file, encoding="utf-8", newline="") as f:
        list_rows = list(csv.reader(f))
    print(len(list_rows))

    params = [
        (row, txt_dir, pdf_dir, index)
        for index, row in enumerate(list_rows)
    ]

    # The context manager terminates the workers if map() raises, so a
    # failing task no longer leaves pool processes behind. On the normal
    # path close() + join() still shut the pool down gracefully first.
    with Pool() as pool:
        pool.map(child_process_in_pool, params)
        pool.close()
        pool.join()
    print("all done...")
def process_csv_multiprocessing(csv_file, txt_dir, pdf_dir):
    """Process the rows of a CSV file, fanning out to up to 16 processes.

    Files with at most 256 rows are handled by a single direct call to
    ``child_process``. Larger files are cut into consecutive slices and
    each slice is given to its own ``Process`` running ``child_process``
    with the arguments ``(rows, start, txt_dir, pdf_dir)``, where
    ``start`` is the index of the slice's first row in the whole file.

    Args:
        csv_file: Path of the CSV file to read; ``""`` falls back to
            ``ph.get_csv_file_path()``.
        txt_dir: Directory forwarded to ``child_process``; ``""`` falls
            back to ``ph.get_txt_directory()``.
        pdf_dir: Directory forwarded to ``child_process``; ``""`` falls
            back to ``ph.get_pdf_directory()``.
    """
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires (matters for quoted fields with line breaks).
    with open(csv_file, encoding="utf-8", newline="") as f:
        list_rows = list(csv.reader(f))
    count = len(list_rows)
    print(count)

    if count <= 256:
        # single process
        child_process(list_rows, 0, txt_dir, pdf_dir)
        return

    # multiprocessing: 15 slices of `step` rows, plus a 16th slice for
    # whatever remains after rounding.
    step = round(count / 15)
    processes = []
    start = 0
    for i in range(16):
        if start >= count:
            # The earlier slices already cover every row; do not spawn a
            # process just to hand it an empty list.
            break
        stop = min(start + step, count)
        p_name = "process_no" + str(i)
        print(p_name)
        p = Process(
            name=p_name,
            target=child_process,
            args=(list_rows[start:stop], start, txt_dir, pdf_dir),
        )
        p.start()
        processes.append(p)
        start = stop

    for p in processes:
        p.join()
    print("all done...")
def process_csv(csv_file, txt_dir, pdf_dir):
    """Process every row of a CSV file sequentially in this process.

    Each row must have exactly three columns; the third is passed as the
    URL to ``__save_to_txt`` together with a ``.txt`` path under
    ``txt_dir`` and a ``.pdf`` path under ``pdf_dir``. Both file names
    are the row's zero-based index in the CSV file.

    Args:
        csv_file: Path of the CSV file to read; ``""`` falls back to
            ``ph.get_csv_file_path()``.
        txt_dir: Directory for the ``.txt`` paths; ``""`` falls back to
            ``ph.get_txt_directory()``.
        pdf_dir: Directory for the ``.pdf`` paths; ``""`` falls back to
            ``ph.get_pdf_directory()``.

    Raises:
        ValueError: If a row does not contain exactly three columns.
    """
    if csv_file == "":
        csv_file = ph.get_csv_file_path()
    if txt_dir == "":
        txt_dir = ph.get_txt_directory()
    if pdf_dir == "":
        pdf_dir = ph.get_pdf_directory()

    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires (matters for quoted fields with line breaks).
    with open(csv_file, encoding="utf-8", newline="") as f:
        list_rows = list(csv.reader(f))
    print(len(list_rows))

    for i, row in enumerate(list_rows):
        # Only the URL is used; the unpacking still enforces the
        # three-column row layout.
        _code, _title, url = row
        file_name = str(i)
        txt_file = path.join(txt_dir, file_name + ".txt")
        pdf_file = path.join(pdf_dir, file_name + ".pdf")
        __save_to_txt(url, txt_file, pdf_file)
def save_to_csv(file_path):
    """Fetch the SSE and SZSE disclosures and write them to a CSV file.

    The result of ``get_sse_disclosures()`` is extended in place with the
    result of ``get_szse_disclosures()``; every entry of the combined
    collection is written as one CSV row. Progress messages are printed
    along the way.

    Args:
        file_path: Destination CSV path; ``""`` falls back to
            ``ph.get_csv_file_path()``. The file is overwritten.

    Returns:
        The combined disclosures (SSE entries followed by SZSE entries).
    """
    if file_path == "":
        file_path = ph.get_csv_file_path()
    print("csv file path is [%s]" % file_path)

    print("begin to get disclosures...")
    all_rows = get_sse_disclosures()
    print("got disclosures of sse...")
    szse_rows = get_szse_disclosures()
    print("got disclosures of szse...")
    all_rows.extend(szse_rows)

    with open(file_path, "w", encoding="utf-8", newline="") as out:
        csv.writer(out).writerows(all_rows)
    print("saved to csv...")

    return all_rows
def __save_to_csv(data_list):
    """Write each item of ``data_list`` as one row of the default CSV file.

    The destination is the path returned by ``ph.get_csv_file_path()``;
    the file is overwritten and encoded as UTF-8.

    Args:
        data_list: Iterable of rows, each one a sequence of field values.
    """
    target = ph.get_csv_file_path()
    with open(target, "w", encoding="utf-8", newline="") as out:
        csv.writer(out).writerows(data_list)