def main():
    """Download every pending PDF into ./pdf_download using worker threads.

    The mapping returned by getDownloadUrlAndFilename() is iterated as
    filename -> url. Entries whose filename is already listed by
    getAllFilename('./pdf_download') are dropped, and each remaining
    [filename, url] pair is put on a queue consumed by 100 CrawlThread
    workers.
    """
    # Create the pdf_download folder; exist_ok avoids the check-then-create race.
    os.makedirs('pdf_download', exist_ok=True)

    download_dict = getDownloadUrlAndFilename()
    # Alternative source, kept from the original for retrying failed downloads:
    # download_dict = getUrlAndFilenameFromTxt('./download_fail.txt')

    pdf_file_list = getAllFilename('./pdf_download')
    print(len(pdf_file_list))

    download_url_queue = queue.Queue()

    # Skip files that are already on disk. pop() with a default tolerates
    # files in the folder that are not part of the download list (a plain
    # `del` raised KeyError for those).
    for existing in pdf_file_list:
        download_dict.pop(existing, None)

    for filename, url in download_dict.items():
        download_url_queue.put([filename, url])

    thread_list = []
    for _ in range(100):
        thread = CrawlThread(download_url_queue)
        thread.start()
        thread_list.append(thread)

    # NOTE(review): Queue.join() only returns once task_done() has been called
    # for every queued item -- presumably CrawlThread does this; confirm.
    download_url_queue.join()
    for t in thread_list:
        t.join()
    print('MainThread End')
def main():
    """Export data for the files currently present in ./pdf_download.

    Each file name listed by getAllFilename('./pdf_download') is reduced to
    the text before its first '.', the number of resulting ids is printed,
    and the id list is handed to getDataByFileurlToExcel().
    """
    downloaded_names = getAllFilename('./pdf_download')

    # The id is whatever precedes the first dot of the file name.
    id_list = [name.split('.')[0] for name in downloaded_names]

    print(len(id_list))
    getDataByFileurlToExcel(id_list)
def rename():
    """Rename files in ./pdf_download from '<id>.<ext>' to '<uuid>.pdf'.

    The id -> uuid mapping is read from the `zhiku_data` table through the
    connection returned by connectDatabase(). The part of each file name
    before the first '.' is looked up as the id. Files with no matching
    (or empty) uuid are reported and left untouched instead of aborting the
    whole run with a KeyError.

    If the query fails, the error is printed and nothing is renamed. The
    cursor and connection are closed in every case.
    """
    db, cursor = connectDatabase()
    try:
        sql = 'SELECT id, uuid FROM zhiku_data'
        id_uuid_dict = {}
        try:
            cursor.execute(sql)
            for row in cursor.fetchall():
                # Keys are stringified so they compare equal to file-name stems.
                id_uuid_dict[str(row[0])] = row[1]
        except Exception as e:
            # Without the mapping no file can be renamed; stop here rather
            # than failing on the first lookup below.
            print(e)
            return

        folder = './pdf_download'
        for f in getAllFilename(folder):
            key = f.split('.')[0]
            new_stem = id_uuid_dict.get(key)
            if not new_stem:
                print('skip (no uuid for id): ' + f)
                continue
            new_filename = new_stem + '.pdf'
            print(new_filename)
            os.rename(os.path.join(folder, f),
                      os.path.join(folder, new_filename))
    finally:
        # Release database resources even when a rename raises.
        cursor.close()
        db.close()