def sending_process(file_name, in_transport):
    try:
        # First ask every server for its status and pick the most suitable host.
        src_size = os.path.getsize(file_name)
        host_index = get_best_server(src_size)
        if host_index is None:
            print("No suitable host found:", file_name)
            make_log("WARNING", "No suitable host found: " + file_name)
            # Bail out; the exception is caught below and the finally block
            # removes the file from in_transport.
            raise NotImplementedError
        # Connect to the chosen server.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        host = host_list[host_index]
        sock.connect((host, port))
        file_size = os.stat(file_name).st_size
        # Send the file size and file name as a fixed-size header. No extra flag
        # is sent for the file name; locally the flag denotes the target host's index.
        # The struct format must match the one the receiver unpacks;
        # get_system_bytes() switches between 4-byte and 8-byte length fields.
        head_format = '128sl' if get_system_bytes() else '128sq'
        file_head = struct.pack(head_format,
                                os.path.basename(file_name).encode(),
                                file_size)
        sock.sendall(file_head)
        print("\nStart transferring file:", file_name)
        sended_size = 0
        with open(file_name, "rb") as read_file:
            while True:
                # process_bar(float(sended_size) / file_size)
                file_data = read_file.read(10240)
                if not file_data:
                    break
                # sendall() retries until the whole chunk is on the wire.
                sock.sendall(file_data)
                sended_size += len(file_data)
        # Delete the compressed source file once it has been sent.
        os.remove(file_name)
        print()
        print("sending over:", file_name, '\n')
        make_log("INFO", "File sent: %s" % file_name)
        sock.close()
    except Exception as e:
        print(e)
    finally:
        in_transport.remove(file_name)
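# get_best_server() is defined elsewhere and not shown here. A minimal sketch,
# assuming each candidate host runs the answer() service below on answer_port,
# that the request is one flag character followed by the source size (matching
# the symbol[1:] parsing in answer()), and that a host replies "OK" when it can
# accept the file. The '#' flag and the "OK" convention are assumptions.
import socket

def get_best_server(src_size, timeout=5):
    """Hypothetical sketch: query every host in host_list for its status and
    return the index of the first host that reports it can take src_size bytes."""
    for index, host in enumerate(host_list):
        try:
            with socket.create_connection((host, answer_port), timeout=timeout) as sock:
                # One flag character followed by the source size, so the server
                # can parse the size as symbol[1:].
                sock.sendall(('#' + str(src_size)).encode())
                status = sock.recv(1024).decode()
        except OSError:
            continue  # host unreachable, try the next one
        if status.startswith("OK"):
            return index
    return None  # no suitable host found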
def answer():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostname()
    sock.bind((host, answer_port))
    sock.listen(5)
    while True:
        connection, address = sock.accept()
        # The request is one flag character followed by the source file size.
        symbol = connection.recv(1024).decode()
        src_size = int(symbol[1:])
        computer_status = get_status(src_size)
        connection.send(computer_status.encode())
        make_log("INFO", "Client request received, returning system status------")
        connection.close()
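# get_status() is defined elsewhere. A minimal sketch, assuming the status simply
# reports whether the receiving disk has room for the incoming file; the
# receive_root path and the "OK"/"FULL" reply format are assumptions chosen to
# match the get_best_server sketch above.
import shutil

def get_status(src_size, receive_root='receive'):
    """Hypothetical sketch: answer "OK" when the receive directory's disk has
    enough free space for src_size bytes, otherwise "FULL"."""
    usage = shutil.disk_usage(receive_root)
    return "OK" if usage.free > src_size else "FULL"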
def compress_folder(folder, in_compress):
    try:
        folder = os.path.abspath(folder)
        # Compress the folder into an archive.
        make_log("INFO", "Start compressing data: " + folder)
        print("Start compressing data:", folder)
        create_zip(folder)
        # Delete the folder afterwards.
        # shutil.rmtree(folder)
        make_log("INFO", "Data compression finished: " + folder)
        print("\nCompression finished:", folder)
    except Exception as e:
        print(e)
    finally:
        # Remove this folder from the in_compress list.
        in_compress.remove(folder)
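# create_zip() is imported from elsewhere. A minimal sketch using the standard
# zipfile module, assuming the archive is written next to the folder with a
# .zip suffix (which matches the file_name[:-4] handling on the receiving side).
import os
import zipfile

def create_zip(folder):
    """Hypothetical sketch: pack every file under `folder` into folder + '.zip'."""
    zip_name = folder + '.zip'
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(folder):
            for name in files:
                full_path = os.path.join(root, name)
                # Store paths relative to the folder so the archive unpacks cleanly.
                zf.write(full_path, os.path.relpath(full_path, folder))
    return zip_name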
def monitor(thread_num=5):
    # Files that are currently being processed.
    in_monitor = []
    while True:
        time.sleep(3)
        if not os.path.exists(mask_file):
            make_log("ERROR", "mask file does not exist")
            print("mask file does not exist!")
            exit(1)
        # Read the mask file. If it is empty, keep monitoring; otherwise process
        # the files. mask_str holds every file that needs processing.
        with open(mask_file, 'r') as read_mask:
            mask_str = read_mask.readlines()
        # new_monitor holds every file that is not being processed yet.
        new_monitor = [file for file in mask_str if file not in in_monitor]
        # Work out how many new worker threads may be started.
        free_thread_nums = thread_num - thread_nums("monitor")
        max_thread_num = min(free_thread_nums, len(new_monitor))
        # Start max_thread_num threads to process the data.
        for i in range(max_thread_num):
            compress_thread = threading.Thread(target=monitor_data,
                                               args=(new_monitor[i].strip(), in_monitor),
                                               name="monitor")
            # Record the file in the in_monitor list.
            in_monitor.append(new_monitor[i])
            compress_thread.start()
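# thread_nums() is defined elsewhere. A minimal sketch, assuming it simply counts
# live threads whose name matches, which is all the free-thread calculation above
# needs.
import threading

def thread_nums(name):
    """Hypothetical sketch: number of currently running threads with the given name."""
    return sum(1 for t in threading.enumerate() if t.name == name)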
# -*- coding: UTF-8 -*-
import threading
from data_queue import monitor
from answer import answer
from m_socket import receive
from logs import make_log

"""
answer_thread   receives client requests and returns this machine's status,
                helping the client decide which host to send data to;
                it listens on port 12346.
receive_thread  receives the files sent by the client and appends a record
                for each one to mask_file.
mask_thread     monitors mask_file and runs data cleaning and processing.
"""

if __name__ == '__main__':
    # Thread that answers status requests.
    answer_thread = threading.Thread(target=answer, args=(), name='answer')
    answer_thread.start()
    make_log("INFO", "Listener started-------------")
    print("Listener started-------------")
    # Thread that receives files.
    receive_thread = threading.Thread(target=receive, args=(), name='receive')
    receive_thread.start()
    make_log("INFO", "File receiver started--------------")
    print("File receiver started--------------")
    # Thread that monitors the mask file.
    mask_thread = threading.Thread(target=monitor, args=(5, ), name='mask')
    mask_thread.start()
    make_log("INFO", "File processor started--------------")
    print("File processor started--------------")
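# logs.py is not shown. A minimal sketch of make_log(), built on the standard
# logging module; the log file name and the message format are assumptions.
import logging

logging.basicConfig(filename='transfer.log',
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    level=logging.INFO)

def make_log(level, message):
    """Map the project's string levels ("INFO", "WARNING", "ERROR") onto logging."""
    logging.log(getattr(logging, level.upper(), logging.INFO), message)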
""" 总文件,检测待发送文件,传输文件; """ from transport import transport from compress import compress from logs import make_log import threading if __name__ == '__main__': # 开启压缩数据线程,用于压缩数据; compress_thread = threading.Thread(target=compress, args=(5, )) compress_thread.start() print("数据压缩进程已启动,准备压缩数据") make_log("INFO", "数据压缩进程已启动,准备压缩数据") # 开启传输数据线程,用于传输数据; transport_thread = threading.Thread(target=transport, args=(5, )) transport_thread.start() print("数据传输进程已启动,准备传输数据") make_log("INFO", "数据传输进程已启动,准备压缩数据")
def monitor_data(file, in_monitor):
    try:
        print("Cleaning data:", file)
        make_log("INFO", "Cleaning data: " + file)
        # todo Parse the database name and date from the file name and use them
        # for cleaning and loading into the warehouse.
        basename = os.path.basename(file)
        database_name = get_database(basename)
        month = get_date(basename)
        # Unpack the archive and get the directory it was extracted to.
        unzip_dir = zip_file.unzip_file(file)
        # todo Cleaning and loading; still to be debugged.
        # merge_dir = process_dir(unzip_dir, date)
        # hive(merge_dir, database_name, date)
        time.sleep(15)
        # make_log("INFO", "Cleaning finished: " + file)
        with write_protect.write_lock:
            # Re-read the mask file because other threads may have modified it.
            with open(mask_file, 'r') as read_mask:
                mask_str = read_mask.readlines()
            mask_str = [mask for mask in mask_str if mask.strip() != file]
            with open(mask_file, 'w') as write_mask:
                write_mask.write(''.join(mask_str))
        # Delete everything: the archive, the extracted files and the cleaned files.
        shutil.rmtree(os.path.dirname(file))
        print("Cleaning finished: " + file)
        make_log("INFO", "Cleaning finished: " + file)
    except subprocess.CalledProcessError:
        print("Data loading did not finish:", file)
        make_log("ERROR", "Data loading did not finish: " + file)
    except FileNotFoundError:
        print("file not found")
        make_log("ERROR", "File does not exist: " + file)
        # Remove this file's record from the mask file.
        with write_protect.write_lock:
            # Re-read the mask file because other threads may have modified it.
            with open(mask_file, 'r') as read_mask:
                mask_str = read_mask.readlines()
            mask_str = [mask for mask in mask_str if mask.strip() != file]
            with open(mask_file, 'w') as write_mask:
                write_mask.write(''.join(mask_str))
    finally:
        in_monitor.remove(file + '\n')
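# zip_file.unzip_file() is the counterpart of create_zip(). A minimal sketch
# using the standard zipfile module; the extraction directory name (archive
# name plus '_unzip') is an assumption.
import os
import zipfile

def unzip_file(file):
    """Hypothetical sketch: extract `file` next to itself and return the new directory."""
    unzip_dir = os.path.splitext(file)[0] + '_unzip'
    with zipfile.ZipFile(file) as zf:
        zf.extractall(unzip_dir)
    return unzip_dir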
def receive_thread(connection):
    try:
        connection.settimeout(600)
        # The header format must match the one packed by the sender ('128sl' here).
        file_info_size = struct.calcsize('128sl')
        buf = connection.recv(file_info_size)
        if buf:
            file_name, file_size = struct.unpack('128sl', buf)
            file_name = file_name.decode().strip('\00')
            # Pick the least-used disk to store the file.
            disk_list = get_min_disk()
            # todo Switch to this path in production; create a new directory
            # under receive to avoid name clashes.
            # file_new_dir = os.path.join('/HDATA', str(disk_list),
            #                             'receive', get_database(file_name), file_name[:-4])
            file_new_dir = os.path.join('receive', get_database(file_name),
                                        file_name[:-4])
            if not os.path.exists(file_new_dir):
                os.makedirs(file_new_dir)
            file_new_name = os.path.join(file_new_dir, file_name)
            received_size = 0
            w_file = open(file_new_name, 'wb')
            print("start receiving file:", file_name)
            make_log("INFO", "Start receiving file: " + file_name)
            out_contact_times = 0
            while received_size < file_size:
                r_data = connection.recv(10240)
                received_size += len(r_data)
                w_file.write(r_data)
                # process_bar.process_bar(float(received_size) / file_size)
                # Count consecutive reads that returned no data.
                if not r_data:
                    out_contact_times += 1
                else:
                    out_contact_times = 0
                # After 1000 empty reads the connection is considered dead.
                if out_contact_times == 1000:
                    w_file.close()
                    raise socket.timeout
                    # # Alternatively, delete the partially received data:
                    # print('Connection lost, removing the unfinished file')
                    # make_log("ERROR", "Connection lost, removing the unfinished file")
                    # shutil.rmtree(file_new_dir)
                    # exit(1)
            w_file.close()
            print("Receiving finished!\n")
            make_log("INFO", "Transfer finished: %s" % file_new_dir)
            # Append a record to the mask file: one line per file holding the
            # absolute path of the received archive (the database name can be
            # derived from it).
            # Benefits:
            # 1. The loading step itself needs no lock, so concurrent writes to
            #    the warehouse file are avoided.
            # 2. After a restart, cleaning and loading can resume from the file.
            # Note the write lock protecting the mask file.
            with write_protect.write_lock:
                with open(mask_file, 'a') as record_mask:
                    record_mask.write(os.path.abspath(file_new_name) + '\n')
    except socket.timeout:
        print("Connection timed out!")
    finally:
        connection.close()
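# receive() (imported from m_socket in the listener's main module above) is not
# shown. A minimal sketch, assuming it listens on the same `port` the sender
# connects to and hands each accepted connection to receive_thread() in its own
# thread.
import socket
import threading

def receive():
    """Hypothetical sketch: accept loop that spawns one receive_thread per connection."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((socket.gethostname(), port))
    sock.listen(5)
    while True:
        connection, address = sock.accept()
        worker = threading.Thread(target=receive_thread, args=(connection,),
                                  name="receive")
        worker.start()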