Пример #1
0
def main():
    """Crawl ceair.com's flight-status JSONP endpoint for every flight row
    in the input TSV and append the scraped status to each line of the
    result file (Python 2 script).

    Each input line is tab-separated; columns 3.. are
    (flightno, dep_airport, arr_airport, dep_date "YYYY-MM-DD").
    Only flights departing yesterday/today/tomorrow are queried, because
    that is all the site serves.
    """
    if len(sys.argv) != 3:
        usage()
        # Developer fallback paths; raw string so the backslash-letter
        # sequences in the Windows path cannot be mangled as escapes.
        input_path = r"E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt"
        result_path = "result.txt"
    else:
        input_path = sys.argv[1]
        result_path = sys.argv[2]

    proxy_list = proxy_ip.get_proxy_list_from_file()
    # Millisecond timestamp of "24h ago", used as the cache-busting `_` param.
    ts = int((time.time() - 3600 * 24) * 1000)

    dt = datetime.datetime.now()
    dt_m1 = dt - datetime.timedelta(1)
    dt_p1 = dt + datetime.timedelta(1)
    dt_str = dt.strftime('%Y-%m-%d')
    dt_m1_str = dt_m1.strftime('%Y-%m-%d')
    dt_p1_str = dt_p1.strftime('%Y-%m-%d')
    # flightTime flag the site expects: "." = today, "-" = yesterday,
    # "%%2B" (url-encoded "+", doubled to survive the later %-format)
    # = tomorrow.
    dt_flag_dic = {dt_str: ".", dt_m1_str: "-", dt_p1_str: "%%2B"}

    fo = open(result_path, 'w')
    f = open(input_path, 'r')
    try:
        for line in f:
            line = line.rstrip("\n")
            arr = line.split("\t")
            flightno, dep_code, arr_code, dep_date = arr[3:]
            if dep_date not in dt_flag_dic:
                continue
            url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%s&queryCxr=%s&queryFlightno=%s&_=%s" \
                  % (dt_flag_dic[dep_date], flightno[0:2], flightno[2:], ts)
            s1 = crawl_url(url, proxy_list)
            while s1 is None:
                # BUGFIX: back off between retries instead of busy-looping
                # (matches the retry style of the umetrip crawler).
                time.sleep(1)
                s1 = crawl_url(url, proxy_list)
            # Strip the JSONP wrapper callback( ... ) down to the JSON array.
            js1_str = s1[s1.find("(") + 1:s1.rfind(")")]
            js1 = json.loads(js1_str)
            seg_list = []
            for flight_seg in js1:
                seg_list += flight_seg["actualFlightShowList"]
            for seg in seg_list:
                if seg["flightno"] == flightno \
                        and seg["depAirportCode"] == dep_code \
                        and seg["arrAirportCode"] == arr_code \
                        and seg["std"][0:8] == dep_date.replace("-", ""):
                    ata = datetime.datetime.strptime(seg["ata"], "%Y%m%d %H:%M")
                    sta = datetime.datetime.strptime(seg["sta"], "%Y%m%d %H:%M")
                    # BUGFIX: `.seconds` ignores the day component, so an
                    # early arrival (negative delta) reported ~23h "late"
                    # and a 25h delay reported as 1h. total_seconds()
                    # (Python 2.7+) counts the whole signed delta.
                    late_hours = (ata - sta).total_seconds() / 3600.
                    if late_hours >= 3:
                        fact_result = "延误"
                    else:
                        fact_result = seg["status"]
                    fo.write("%s\t%s\t%s\t%s\t%s\n" %
                             (line, seg["status"].encode("utf8"), fact_result,
                              str(late_hours), js1_str))
                    fo.flush()
            time.sleep(1)
    finally:
        # BUGFIX: close both handles even when a crawl/parse step raises.
        f.close()
        fo.close()
Пример #2
0
def main():
    """Crawl ceair.com's flight-status JSONP endpoint for every flight row
    in the input TSV and append the scraped status to each line of the
    result file (Python 2 script).

    Each input line is tab-separated; columns 3.. are
    (flightno, dep_airport, arr_airport, dep_date "YYYY-MM-DD").
    Only flights departing yesterday/today/tomorrow are queried, because
    that is all the site serves.
    """
    if len(sys.argv) != 3:
        usage()
        # Developer fallback paths; raw string so the backslash-letter
        # sequences in the Windows path cannot be mangled as escapes.
        input_path = r"E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt"
        result_path = "result.txt"
    else:
        input_path = sys.argv[1]
        result_path = sys.argv[2]

    proxy_list = proxy_ip.get_proxy_list_from_file()
    # Millisecond timestamp of "24h ago", used as the cache-busting `_` param.
    ts = int((time.time() - 3600 * 24) * 1000)

    dt = datetime.datetime.now()
    dt_m1 = dt - datetime.timedelta(1)
    dt_p1 = dt + datetime.timedelta(1)
    dt_str = dt.strftime('%Y-%m-%d')
    dt_m1_str = dt_m1.strftime('%Y-%m-%d')
    dt_p1_str = dt_p1.strftime('%Y-%m-%d')
    # flightTime flag the site expects: "." = today, "-" = yesterday,
    # "%%2B" (url-encoded "+", doubled to survive the later %-format)
    # = tomorrow.
    dt_flag_dic = {dt_str: ".", dt_m1_str: "-", dt_p1_str: "%%2B"}

    fo = open(result_path, 'w')
    f = open(input_path, 'r')
    try:
        for line in f:
            line = line.rstrip("\n")
            arr = line.split("\t")
            flightno, dep_code, arr_code, dep_date = arr[3:]
            if dep_date not in dt_flag_dic:
                continue
            url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%s&queryCxr=%s&queryFlightno=%s&_=%s" \
                  % (dt_flag_dic[dep_date], flightno[0:2], flightno[2:], ts)
            s1 = crawl_url(url, proxy_list)
            while s1 is None:
                # BUGFIX: back off between retries instead of busy-looping
                # (matches the retry style of the umetrip crawler).
                time.sleep(1)
                s1 = crawl_url(url, proxy_list)
            # Strip the JSONP wrapper callback( ... ) down to the JSON array.
            js1_str = s1[s1.find("(") + 1:s1.rfind(")")]
            js1 = json.loads(js1_str)
            seg_list = []
            for flight_seg in js1:
                seg_list += flight_seg["actualFlightShowList"]
            for seg in seg_list:
                if seg["flightno"] == flightno \
                        and seg["depAirportCode"] == dep_code \
                        and seg["arrAirportCode"] == arr_code \
                        and seg["std"][0:8] == dep_date.replace("-", ""):
                    ata = datetime.datetime.strptime(seg["ata"], "%Y%m%d %H:%M")
                    sta = datetime.datetime.strptime(seg["sta"], "%Y%m%d %H:%M")
                    # BUGFIX: `.seconds` ignores the day component, so an
                    # early arrival (negative delta) reported ~23h "late"
                    # and a 25h delay reported as 1h. total_seconds()
                    # (Python 2.7+) counts the whole signed delta.
                    late_hours = (ata - sta).total_seconds() / 3600.
                    if late_hours >= 3:
                        fact_result = "延误"
                    else:
                        fact_result = seg["status"]
                    fo.write("%s\t%s\t%s\t%s\t%s\n" %
                             (line, seg["status"].encode("utf8"), fact_result,
                              str(late_hours), js1_str))
                    fo.flush()
            time.sleep(1)
    finally:
        # BUGFIX: close both handles even when a crawl/parse step raises.
        f.close()
        fo.close()
Пример #3
0
def main():
    if len(sys.argv) != 2:
        usage()
        result_path = "result.txt"
    else:
        result_path = sys.argv[1]
    fo = open(result_path, 'w')
    f = open(
        "E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt",
        'r')
    # re.S, the symbol '.' matches any character including \n
    p1 = re.compile(
        """class="state">.*?<div class=".*?">(.*?)</div>.*?</div>""", re.S)
    ind = 1
    proxy_list = proxy_ip.get_proxy_list_from_file()
    for line in f:
        if ind < 0:
            ind += 1
            continue
        print ind, line.strip()
        ind += 1
        arr = line.rstrip("\n").split("\t")
        # arr1 = arr[1].split(",")
        url = "http://www.umetrip.com/mskyweb/fs/fc.do?flightNo=%s&date=%s&channel=" % (
            arr[0], arr[1])
        s1 = crawl_url(url, proxy_list)
        while s1 is None:
            print "s1 is None, sleep..."
            time.sleep(1)
            s1 = crawl_url(url, proxy_list)
        # print s1
        l1 = re.findall(p1, s1)
        if len(l1) >= 1:
            result = l1[0].strip()
        else:
            result = "no_found"
        fo.write("%s %s\n" % (" ".join(arr), result))
        fo.flush()
        time.sleep(1)

    f.close()
    fo.close()
Пример #4
0
def main2():
    ts = int((time.time() - 3600 * 24) * 1000)
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%%2B&queryCxr=MU&queryFlightno=5757&_=%s" % ts
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=.&queryCxr=MU&queryFlightno=5319&_=%s" % ts
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=-&queryCxr=MU&queryFlightno=5319&_=%s" % ts
    # url = "http://www.baidu.com/"
    proxy_list = proxy_ip.get_proxy_list_from_file()
    s1 = None
    print "s1"
    print len(proxy_list)
    while s1 is None:
        print "s1 is none."
        s1 = crawl_url(url, proxy_list)
    print s1
    js1_str = s1[s1.find("(") + 1:s1.rfind(")")]
    print js1_str
    js1 = json.loads(js1_str)
    for flight_seg in js1:
        print flight_seg["flightno"],flight_seg["actualFlightShowList"][0]["depAirportCode"],\
            flight_seg["actualFlightShowList"][0]["arrAirportCode"],\
            flight_seg["actualFlightShowList"][0]["status"]
Пример #5
0
def main2():
    ts = int((time.time()-3600*24)*1000)
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=%%2B&queryCxr=MU&queryFlightno=5757&_=%s" % ts
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=.&queryCxr=MU&queryFlightno=5319&_=%s" % ts
    url = "http://www.ceair.com/addservice/new-aoc!queryNewFlightStatus.shtml?qType=0&flightTime=-&queryCxr=MU&queryFlightno=5319&_=%s" % ts
    # url = "http://www.baidu.com/"
    proxy_list = proxy_ip.get_proxy_list_from_file()
    s1 = None
    print "s1"
    print len(proxy_list)
    while s1 is None:
        print "s1 is none."
        s1 = crawl_url(url, proxy_list)
    print s1
    js1_str = s1[s1.find("(")+1:s1.rfind(")")]
    print js1_str
    js1 = json.loads(js1_str)
    for flight_seg in js1:
        print flight_seg["flightno"],flight_seg["actualFlightShowList"][0]["depAirportCode"],\
            flight_seg["actualFlightShowList"][0]["arrAirportCode"],\
            flight_seg["actualFlightShowList"][0]["status"]
Пример #6
0
def main():
    if len(sys.argv) != 2:
        usage()
        result_path = "result.txt"
    else:
        result_path = sys.argv[1]
    fo = open(result_path, 'w')
    f = open("E:\PycharmProjects\My\crawler\umetrip\hangbianqingqiu.20150720.100.txt", 'r')
    # re.S, the symbol '.' matches any character including \n
    p1 = re.compile("""class="state">.*?<div class=".*?">(.*?)</div>.*?</div>""", re.S)
    ind = 1
    proxy_list = proxy_ip.get_proxy_list_from_file()
    for line in f:
        if ind < 0:
            ind += 1
            continue
        print ind, line.strip()
        ind += 1
        arr = line.rstrip("\n").split("\t")
        # arr1 = arr[1].split(",")
        url = "http://www.umetrip.com/mskyweb/fs/fc.do?flightNo=%s&date=%s&channel=" % (arr[0], arr[1])
        s1 = crawl_url(url, proxy_list)
        while s1 is None:
            print "s1 is None, sleep..."
            time.sleep(1)
            s1 = crawl_url(url, proxy_list)
        # print s1
        l1 = re.findall(p1, s1)
        if len(l1) >= 1:
            result = l1[0].strip()
        else:
            result = "no_found"
        fo.write("%s %s\n" % (" ".join(arr), result))
        fo.flush()
        time.sleep(1)

    f.close()
    fo.close()