def main():
    """Crawl CEAIR flight-status JSONP for every flight listed in the input
    file and append the crawled status, a 3-hour-delay verdict and the raw
    JSON to each input line in the result file.

    Usage: script <input_path> <result_path>. With any other argument
    count, usage() is called and hard-coded defaults are used.
    """
    if len(sys.argv) != 3:
        usage()
        # NOTE(review): falls back to a developer-machine path — confirm this
        # default is still wanted. Raw string: the original relied on Python 2
        # leaving unknown escapes (\P, \M, \c, \u, \h) untouched.
        input_path = r"E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt"
        result_path = "result.txt"
    else:
        input_path = sys.argv[1]
        result_path = sys.argv[2]

    proxy_list = proxy_ip.get_proxy_list_from_file()
    # Cache-busting timestamp in milliseconds, shifted back one day.
    ts = int((time.time() - 3600 * 24) * 1000)

    # The site encodes yesterday / today / tomorrow as "-", "." and "+"
    # ("%2B" URL-encoded); map each concrete departure date to its flag.
    dt = datetime.datetime.now()
    dt_m1 = dt - datetime.timedelta(1)
    dt_p1 = dt + datetime.timedelta(1)
    # BUGFIX: was "%%2B" — but the flag is inserted via %s substitution,
    # which does NOT collapse %%, so the URL carried a literal "%%2B".
    # main2's working probe URL sends "%2B".
    dt_flag_dic = {
        dt.strftime('%Y-%m-%d'): ".",
        dt_m1.strftime('%Y-%m-%d'): "-",
        dt_p1.strftime('%Y-%m-%d'): "%2B",
    }

    fo = open(result_path, 'w')
    f = open(input_path, 'r')
    try:
        for line in f:
            line = line.rstrip("\n")
            arr = line.split("\t")
            # Columns 4..7 of the TSV: flight number, departure airport,
            # arrival airport, departure date (YYYY-MM-DD).
            flightno, dep_code, arr_code, dep_date = arr[3:]
            if dep_date not in dt_flag_dic:
                continue  # site only serves yesterday/today/tomorrow
            url = ("http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml"
                   "?qType=0&flightTime=%s&queryCxr=%s&queryFlightno=%s&_=%s"
                   % (dt_flag_dic[dep_date], flightno[0:2], flightno[2:], ts))
            s1 = crawl_url(url, proxy_list)
            while s1 is None:
                time.sleep(1)  # back off instead of busy-spinning on failures
                s1 = crawl_url(url, proxy_list)
            # Response is JSONP: strip the "callback( ... )" wrapper.
            js1_str = s1[s1.find("(") + 1:s1.rfind(")")]
            js1 = json.loads(js1_str)
            seg_list = []
            for flight_seg in js1:
                seg_list += flight_seg["actualFlightShowList"]
            for seg in seg_list:
                if (seg["flightno"] == flightno
                        and seg["depAirportCode"] == dep_code
                        and seg["arrAirportCode"] == arr_code
                        and seg["std"][0:8] == dep_date.replace("-", "")):
                    ata = datetime.datetime.strptime(seg["ata"], "%Y%m%d %H:%M")
                    sta = datetime.datetime.strptime(seg["sta"], "%Y%m%d %H:%M")
                    # BUGFIX: .seconds ignores the .days component and wraps
                    # for negative deltas (an early arrival looked up to ~24h
                    # late); total_seconds() is the true signed lag.
                    late_hours = (ata - sta).total_seconds() / 3600.
                    if late_hours >= 3:
                        fact_result = "延误"
                    else:
                        fact_result = seg["status"]
                    fo.write("%s\t%s\t%s\t%s\t%s\n"
                             % (line, seg["status"].encode("utf8"),
                                fact_result, str(late_hours), js1_str))
                    fo.flush()
            time.sleep(1)  # be polite between flights
    finally:
        f.close()
        fo.close()
def main():
    """For each flight in the input TSV, query CEAIR's flight-status JSONP
    endpoint and write the input line plus status, delay verdict, hours late
    and raw JSON to the result file.

    Usage: script <input_path> <result_path>; otherwise usage() is invoked
    and developer defaults are substituted.
    """
    if len(sys.argv) != 3:
        usage()
        # Raw string so the Windows path does not depend on Python 2's
        # tolerance for unknown backslash escapes. TODO(review): drop this
        # developer-machine default?
        input_path = r"E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt"
        result_path = "result.txt"
    else:
        input_path, result_path = sys.argv[1], sys.argv[2]

    proxy_list = proxy_ip.get_proxy_list_from_file()
    ts = int((time.time() - 3600 * 24) * 1000)  # ms timestamp, minus one day

    # Map the three dates the endpoint can serve to its relative-day flags:
    # "-" = yesterday, "." = today, "%2B" ("+") = tomorrow.
    # BUGFIX: the flag was "%%2B"; %s substitution does not collapse %%, so
    # requests went out with an invalid literal "%%2B" (compare main2, whose
    # format-string literal yields "%2B").
    now = datetime.datetime.now()
    one_day = datetime.timedelta(1)
    dt_flag_dic = {
        now.strftime('%Y-%m-%d'): ".",
        (now - one_day).strftime('%Y-%m-%d'): "-",
        (now + one_day).strftime('%Y-%m-%d'): "%2B",
    }

    fo = open(result_path, 'w')
    f = open(input_path, 'r')
    try:
        for line in f:
            line = line.rstrip("\n")
            arr = line.split("\t")
            # TSV columns 4..7: flight no, dep airport, arr airport, dep date.
            flightno, dep_code, arr_code, dep_date = arr[3:]
            if dep_date not in dt_flag_dic:
                continue
            url = ("http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml"
                   "?qType=0&flightTime=%s&queryCxr=%s&queryFlightno=%s&_=%s"
                   % (dt_flag_dic[dep_date], flightno[0:2], flightno[2:], ts))
            s1 = crawl_url(url, proxy_list)
            while s1 is None:
                time.sleep(1)  # avoid a tight retry spin on proxy failures
                s1 = crawl_url(url, proxy_list)
            # JSONP body: keep only what sits inside the callback parentheses.
            js1_str = s1[s1.find("(") + 1:s1.rfind(")")]
            seg_list = []
            for flight_seg in json.loads(js1_str):
                seg_list += flight_seg["actualFlightShowList"]
            for seg in seg_list:
                matches = (seg["flightno"] == flightno
                           and seg["depAirportCode"] == dep_code
                           and seg["arrAirportCode"] == arr_code
                           and seg["std"][0:8] == dep_date.replace("-", ""))
                if not matches:
                    continue
                ata = datetime.datetime.strptime(seg["ata"], "%Y%m%d %H:%M")
                sta = datetime.datetime.strptime(seg["sta"], "%Y%m%d %H:%M")
                # BUGFIX: (ata-sta).seconds drops whole days and wraps when
                # the flight is early; total_seconds() is the signed delay.
                late_hours = (ata - sta).total_seconds() / 3600.
                if late_hours >= 3:
                    fact_result = "延误"
                else:
                    fact_result = seg["status"]
                fo.write("%s\t%s\t%s\t%s\t%s\n"
                         % (line, seg["status"].encode("utf8"), fact_result,
                            str(late_hours), js1_str))
                fo.flush()
            time.sleep(1)  # throttle between requests
    finally:
        f.close()
        fo.close()
def main(): if len(sys.argv) != 2: usage() result_path = "result.txt" else: result_path = sys.argv[1] fo = open(result_path, 'w') f = open( "E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt", 'r') # re.S, the symbol '.' matches any character including \n p1 = re.compile( """class="state">.*?<div class=".*?">(.*?)</div>.*?</div>""", re.S) ind = 1 proxy_list = proxy_ip.get_proxy_list_from_file() for line in f: if ind < 0: ind += 1 continue print ind, line.strip() ind += 1 arr = line.rstrip("\n").split("\t") # arr1 = arr[1].split(",") url = "http://www.umetrip.com/mskyweb/fs/fc.do?flightNo=%s&date=%s&channel=" % ( arr[0], arr[1]) s1 = crawl_url(url, proxy_list) while s1 is None: print "s1 is None, sleep..." time.sleep(1) s1 = crawl_url(url, proxy_list) # print s1 l1 = re.findall(p1, s1) if len(l1) >= 1: result = l1[0].strip() else: result = "no_found" fo.write("%s %s\n" % (" ".join(arr), result)) fo.flush() time.sleep(1) f.close() fo.close()
def main2(): ts = int((time.time() - 3600 * 24) * 1000) url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%%2B&queryCxr=MU&queryFlightno=5757&_=%s" % ts url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=.&queryCxr=MU&queryFlightno=5319&_=%s" % ts url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=-&queryCxr=MU&queryFlightno=5319&_=%s" % ts # url = "http://www.baidu.com/" proxy_list = proxy_ip.get_proxy_list_from_file() s1 = None print "s1" print len(proxy_list) while s1 is None: print "s1 is none." s1 = crawl_url(url, proxy_list) print s1 js1_str = s1[s1.find("(") + 1:s1.rfind(")")] print js1_str js1 = json.loads(js1_str) for flight_seg in js1: print flight_seg["flightno"],flight_seg["actualFlightShowList"][0]["depAirportCode"],\ flight_seg["actualFlightShowList"][0]["arrAirportCode"],\ flight_seg["actualFlightShowList"][0]["status"]
def main2(): ts = int((time.time()-3600*24)*1000) url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%%2B&queryCxr=MU&queryFlightno=5757&_=%s" % ts url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=.&queryCxr=MU&queryFlightno=5319&_=%s" % ts url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=-&queryCxr=MU&queryFlightno=5319&_=%s" % ts # url = "http://www.baidu.com/" proxy_list = proxy_ip.get_proxy_list_from_file() s1 = None print "s1" print len(proxy_list) while s1 is None: print "s1 is none." s1 = crawl_url(url, proxy_list) print s1 js1_str = s1[s1.find("(")+1:s1.rfind(")")] print js1_str js1 = json.loads(js1_str) for flight_seg in js1: print flight_seg["flightno"],flight_seg["actualFlightShowList"][0]["depAirportCode"],\ flight_seg["actualFlightShowList"][0]["arrAirportCode"],\ flight_seg["actualFlightShowList"][0]["status"]
def main(): if len(sys.argv) != 2: usage() result_path = "result.txt" else: result_path = sys.argv[1] fo = open(result_path, 'w') f = open("E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt", 'r') # re.S, the symbol '.' matches any character including \n p1 = re.compile("""class="state">.*?<div class=".*?">(.*?)</div>.*?</div>""", re.S) ind = 1 proxy_list = proxy_ip.get_proxy_list_from_file() for line in f: if ind < 0: ind += 1 continue print ind, line.strip() ind += 1 arr = line.rstrip("\n").split("\t") # arr1 = arr[1].split(",") url = "http://www.umetrip.com/mskyweb/fs/fc.do?flightNo=%s&date=%s&channel=" % (arr[0], arr[1]) s1 = crawl_url(url, proxy_list) while s1 is None: print "s1 is None, sleep..." time.sleep(1) s1 = crawl_url(url, proxy_list) # print s1 l1 = re.findall(p1, s1) if len(l1) >= 1: result = l1[0].strip() else: result = "no_found" fo.write("%s %s\n" % (" ".join(arr), result)) fo.flush() time.sleep(1) f.close() fo.close()