# -*- coding: utf-8 -*-
# NOTE: the path constants (SUCCESS_DATA_PATH, ERROR_DATA_PATH,
# IMPORT_EXCEL_PATH, JS_PATH, json_file_path, store_name) and the helpers
# get_list/LoggingPool are defined elsewhere in the project.
import logging
import multiprocessing
import os

import requests
from ftplib import FTP


def create_event(success_q, error_q):
    """Drain success_q and error_q, append the results to the txt files."""
    fp_success = open(SUCCESS_DATA_PATH, 'a+')
    fp_error = open(ERROR_DATA_PATH, 'a+')
    logging.info(u'total, success:%s, fail:%s' % (
        str(success_q.qsize()), str(error_q.qsize())))
    # drain the error queue and write it out in one batch
    error_list = []
    while not error_q.empty():
        error_list.append(error_q.get())
    error_str = '\n'.join([str(i) for i in error_list])
    fp_error.write(error_str + '\n')
    fp_error.close()
    # drain the success queue, flushing to disk every 200 records
    success_list = []
    success_count = 0
    while not success_q.empty():
        success_list.append(success_q.get())
        success_count += 1
        if success_count >= 200 or success_q.empty():
            success_file_str = '\n'.join([str(i) for i in success_list])
            fp_success.write(success_file_str + '\n')
            success_count = 0
            success_list = []
    logging.info(u'create_event done')
    fp_success.close()
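A quick way to sanity-check create_event is to feed it a pair of Manager queues by hand. This is a minimal sketch, assuming SUCCESS_DATA_PATH and ERROR_DATA_PATH point at writable files; the dict payloads are hypothetical examples of what the crawl produces:

import multiprocessing

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    success_q, error_q = manager.Queue(), manager.Queue()
    success_q.put({'id': 1, 'track_state': 'delivered'})   # hypothetical row
    error_q.put({'id': 2, 'error_reason': 'no_html'})      # hypothetical row
    create_event(success_q, error_q)  # appends one batch line to each file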
def do_track(data_list=None, file_name=IMPORT_EXCEL_PATH):
    """
    Get the data from excel, crawl it, and write the results to the
    success file; the success file is then read back and written to excel.
    """
    logging.info('do_process---------------')
    # pass file_name through instead of hard-coding IMPORT_EXCEL_PATH,
    # otherwise the file_name argument is silently ignored
    do_process(process_num=8, time_out=60000, data_list=data_list,
               file_name=file_name)
    logging.info('finally done--------------')
def get_result(d, time_out=60000):
    """
    Fetch the page html with requests (or casperjs for javascript-heavy
    sites), then parse it to get the transfer result.
    """
    state, last_event, list_all_event, track_time = '', '', '', ''
    try:
        if d['transfer_way'] in ['ZTO', ]:
            # plain page: a direct GET is enough
            html = requests.get(d['url'], timeout=6).content
        else:
            # javascript-rendered page: shell out to casperjs
            cmd = '/usr/local/bin/casperjs "%s" "%s" "%s" ' % (
                JS_PATH, d['url'], time_out)
            r = os.popen(cmd)
            html = r.read()
            r.close()
    except Exception, e:
        logging.info(u'get html fail,' + str(e))
        return ''
    # (the parsing of html into state/last_event/list_all_event/track_time
    # follows here in the full source; it is not shown in this excerpt)
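One design note on the casperjs branch: os.popen swallows the process exit status, so a failed render looks like an empty page. A variant using subprocess (shown here as an alternative sketch, not the original code) raises on a non-zero exit instead:

import subprocess

# raises subprocess.CalledProcessError if casperjs exits non-zero
html = subprocess.check_output(
    ['/usr/local/bin/casperjs', JS_PATH, d['url'], str(time_out)])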
def ftp_up(filename=json_file_path, store_name=store_name):
    """Upload a local file to the FTP server under store_name."""
    ftp = FTP()
    # debug level: 2 logs every command and response in detail, 0 is off
    ftp.set_debuglevel(2)
    # 'host', 'user' and 'password' are placeholders; fill in the real
    # server details (connect() expects the port as an int, not a string)
    ftp.connect('host', 21)
    # login; with empty credentials ftplib falls back to anonymous login
    ftp.login('user', 'password')
    # block size for the binary transfer
    bufsize = 1024
    # open the local file read-only in binary mode
    file_handler = open(filename, 'rb')
    ftp.storbinary('STOR %s' % store_name, file_handler, bufsize)
    ftp.set_debuglevel(0)
    file_handler.close()
    ftp.quit()
    logging.info('ftp_up----------')
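A minimal usage sketch, assuming the placeholder server details above are filled in; 'result.json' is a hypothetical file name:

try:
    ftp_up(filename='result.json', store_name='result.json')
except Exception, e:
    # network or auth failure; log it rather than crash the pipeline
    logging.info('ftp upload failed: %s' % e)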
def data_handle(data, success_q, error_q, time_out=15000):
    """Crawl each entry and route the result to success_q or error_q."""
    for i, d in enumerate(data):
        try:
            result = get_result(d, time_out)
            logging.info(result)
        except Exception, e:
            d['error_reason'] = 'Exception' + str(e)
            error_q.put(d)
            continue
        if result:
            if result['track_event'] or result['track_state']:
                # carry the row id and carrier over onto the result
                result['id'] = d['id']
                result['transfer_way'] = d['transfer_way']
                success_q.put(result)
            else:
                d['error_reason'] = 'no_data'
                error_q.put(d)
        else:
            d['error_reason'] = 'no_html'
            error_q.put(d)
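Each entry handed to data_handle is a dict; judging from the fields the code reads and writes, the expected shape is roughly the following (the url value is a hypothetical example):

d = {
    'id': 1024,             # row id, copied onto the successful result
    'transfer_way': 'ZTO',  # carrier code; ZTO is fetched via requests
    'url': 'http://example.com/track?no=123',  # hypothetical tracking url
}
# on failure the same dict gains an 'error_reason' key and goes to error_q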
def do_process(process_num=8, time_out=20000, data_list=None,
               file_name=IMPORT_EXCEL_PATH):
    """
    Split the data across a multiprocessing pool, crawl it, then hand the
    result queues to create_event, which writes the txt files.
    """
    # get the data either from rabbitmq or from the excel file
    if data_list:
        logging.info(u'get the logistic data from rabbitmq')
        data_list = get_list(data_list=data_list)
    else:
        logging.info(u'get the logistic data from excel')
        data_list = get_list(file_name=file_name)
    length = len(data_list)
    logging.info(u'total data length:%s' % str(length))
    if length <= process_num:
        process_num = 1
    # chunk the data; slicing past the end of a list is safe, so the final
    # slice simply picks up the remainder (the original nloop bookkeeping
    # could drop the tail rows when length was not a multiple of step)
    step = (length / process_num) or 1
    pool = LoggingPool(processes=process_num)
    manager = multiprocessing.Manager()
    error_q = manager.Queue()
    success_q = manager.Queue()
    for n in range(0, length, step):
        data = data_list[n:n + step]
        pool.apply_async(data_handle, (data, success_q, error_q, time_out))
    pool.close()
    pool.join()
    # write the queued results to txt
    create_event(success_q, error_q)
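A minimal entry point to run the whole pipeline from the default excel file; a sketch, assuming the path constants at the top of the module are configured:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # crawl the rows in IMPORT_EXCEL_PATH and write the txt result files
    do_track()
    # optionally push the generated json up to the ftp server afterwards
    # ftp_up()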