Example No. 1
def scraping_IP_int(head, raw_data):
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)  # safe wrapper around json.loads (see Example No. 18)
    if raw_json is None:  ## parse error
        return None
    if isinstance(raw_json['Responses']['GoodsList'], str):  ## no result data returned
        return None
        return None
    raw_list = []
    fare_goods = raw_json['Responses']['GoodsList']['Goods']
    if isinstance(fare_goods, dict):  ## a single record arrives as a dict, not a list
        fare_goods = [fare_goods]
    for fare_set in fare_goods:
        air_itns = fare_set['AirAvail']['StartAvail']['AirItn']
        if isinstance(air_itns, dict):  ## a single record arrives as a dict, not a list
            air_itns = [air_itns]
        for air_itn in air_itns:
            seg_detail_t = air_itn['seg_detail_t']
            raw_list.append([
                seg_detail_t['car_code'], seg_detail_t['main_flt'],
                fare_set['StartDT'], seg_detail_t['dep_city'],
                seg_detail_t['arr_city'], seg_detail_t['dep_date_time'][8:],
                seg_detail_t['arr_date_time'][8:], fare_set['SaleFare'],
                fare_set['Qcharge'], fare_set['Tax'],
                seg_detail_t['no_of_avail_seat']
            ])
    log('end scraping Interpark crawled data')
    return raw_list
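A usage sketch chaining this scraper with the crawler from Example No. 2 below; the airline code, route, and date are placeholders, and the head argument is unused here, so None suffices:

raw = crawling_IP_int('ZE', 'ICN', 'NRT', '20170525')  # placeholder arguments
rows = scraping_IP_int(None, raw)  # list of [carrier, flight, date, dep, arr, ...] rows, or None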
Example No. 2
def crawling_IP_int(airline, dpt, arr, dpt_date):
    log('Crawling Interpark international schedule site')
    url = "http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx"
    param = {
        'FLEX': 'N',
        'Soto': 'N',
        'ptype': 'I',
        'SeatAvail': 'Y',
        'comp': 'Y',
        'JSON': 'Y',
        'enc': 'u',
        'BEST': 'Y',
        'Change': '',
        'StayLength': '',
        'SeatType': 'A',
        'trip': 'OW',  # one-way ('OW') vs round-trip
        'adt': '1',
        'chd': '0',
        'inf': '0',
        'SplitNo': '100',  # number of records to fetch
        'AirLine': airline,
        'dep0': dpt,  # origin
        'arr0': arr,  # destination
        'depdate0': dpt_date,  # departure date
    }
    ## strip the leading and trailing characters; keeping them causes a JSON format error
    return simple_crawling(url, param, method='get', json=False)[1:-1]
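simple_crawling itself is not shown in these examples. A minimal sketch, assuming it wraps requests and returns the decoded response body; the real helper may add retries, encoding handling, or logging:

import requests

def simple_crawling(url, param, head=None, method='get', json=True):
    ## hypothetical sketch of the shared helper used throughout these examples
    if method == 'get':
        resp = requests.get(url, params=param, headers=head)
    else:
        resp = requests.post(url, data=param, headers=head)
    resp.raise_for_status()
    return resp.json() if json else resp.text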
Example No. 3
def crawling_LJ_dom(dpt, arr, dpt_date):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling jinair homepage schedule site')
    url = "https://www.jinair.com/RSV/RSV_WebResult.aspx"
    head = {
        'Referer': 'https://www.jinair.com/RSV/Reservation.aspx',
    }
    param = {
        'TASK': 'NormalFare',
        'OWListId': 'ctl00_ContentPlaceHolder1_fltlstDownLine',
        'OWDepDate': dpt_date,
        'OWDep': dpt,
        'OWArr': arr,
        'MemberClass': 'I',
        'DisCode': '',
        'MbrGb': 'N'
    }
    payload = '<REQUEST><TASK>{TASK}</TASK><OWListId>{OWListId}</OWListId><OWDepDate>{OWDepDate}</OWDepDate>'
    payload += '<OWDep>{OWDep}</OWDep><OWArr>{OWArr}</OWArr><MemberClass>{MemberClass}</MemberClass>'
    payload += '<DisCode>{DisCode}</DisCode><MbrGb>{MbrGb}</MbrGb></REQUEST>'

    return payload_crawling(url,
                            payload.format(**param),
                            head=head,
                            method='post',
                            json=False)
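payload_crawling is likewise defined elsewhere; a minimal sketch, assuming it posts the formatted XML string as the raw request body:

import requests

def payload_crawling(url, payload, head=None, method='post', json=True):
    ## hypothetical sketch: send a raw (here XML) body rather than form fields
    resp = requests.request(method, url, data=payload.encode('utf-8'), headers=head)
    resp.raise_for_status()
    return resp.json() if json else resp.text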
Example No. 4
def email_sender(to, subject, html, attachs=None):
    log(['receiver', to, ' ## email -', subject])
    email_user = '******'  # credentials should come from settings
    email_pwd = 'bk@813102'  # hard-coded; should also move to settings
    msg = MIMEMultipart('alternative')
    msg['From'] = email_user
    msg['To'] = to
    msg['Subject'] = Header(s=subject, charset="utf-8")
    msg.attach(MIMEText(html, 'html', _charset="utf-8"))

    # attachments
    if attachs:
        for attach in attachs:
            part = MIMEBase('application', 'octet-stream')
            with open(attach, 'rb') as f:  # close the handle after reading
                part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header(
                'Content-Disposition',
                'attachment; filename="%s"' % os.path.basename(attach))
            msg.attach(part)

    s = smtplib.SMTP("spam.eastarjet.com", 587)
    s.ehlo()
    #s.starttls()
    #s.ehlo()
    #s.login(email_user, email_pwd)
    s.sendmail(email_user, to, msg.as_string())
    s.close()
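A usage sketch; the recipient, subject, body, and attachment path are placeholders:

email_sender('ops@example.com', 'Daily fare report',
             '<p>Crawled fares attached.</p>',
             attachs=['/tmp/fares_20170525.xlsx'])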
Example No. 5
def crawling_7C(dpt, arr, dpt_date, dom_int):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling jejuair homepage schedule site')
    session_url = "https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do"
    session_head = {
        'Referer': 'http://www.jejuair.net/jejuair/main.jsp',
    }

    url = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do'
    head = {
        'Referer': 'https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do',
    }
    param = {
        'AdultPaxCnt': '1',
        'ChildPaxCnt': '0',
        'InfantPaxCnt': '0',
        'RouteType': dom_int,  ## 'D' domestic, 'I' international
        'SystemType': 'IBE',
        'Language': 'KR',
        'DepStn': dpt,
        'ArrStn': arr,
        'SegType': 'DEP',
        'TripType': 'OW',
        'DepDate': dpt_date,
        'Index': '1'  # international flights only
    }
    return session_crawling(session_url,
                            url,
                            param,
                            session_head=session_head,
                            head=head,
                            method='post',
                            json=False)
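session_crawling is also an external helper; a minimal sketch, assuming it first visits a landing page to collect session cookies and then issues the real search on the same requests.Session:

import requests

def session_crawling(session_url, url, param, session_head=None, head=None,
                     method='post', json=True):
    ## hypothetical sketch: warm up the session, then run the search
    sess = requests.Session()
    sess.get(session_url, headers=session_head)
    if method == 'get':
        resp = sess.get(url, params=param, headers=head)
    else:
        resp = sess.post(url, data=param, headers=head)
    resp.raise_for_status()
    return resp.json() if json else resp.text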
Example No. 6
def crawling_func(func, *argv):
    try:
        return func(*argv)
    except TypeError as te:
        return str(te)
    except ConnectionError as ce:
        return str(ce)
    except:  # anything unexpected: log it, then re-raise
        log("Unexpected error: {}".format(sys.exc_info()[0]), level=logging.ERROR)
        raise
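A usage sketch: run any crawler through the wrapper so that argument mistakes and connection failures come back as an error string instead of killing the batch:

raw = crawling_func(crawling_BX_dom, 'GMP', 'CJU', '20170525')
## on TypeError or ConnectionError, raw holds the error message string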
Example No. 7
def get_crawl_site_func(dom_int, site):
    for site_code in CRAWL_SITE_CODES:
        #log(site_code)
        if site_code.dom_int == dom_int and site_code.site == site:
            try:
                log(site_code)
                func = eval(site_code.func)
            except NameError as e:
                log(e, level=logging.ERROR)
                return None, None  # keep the two-value contract on the error path
            return func, site_code.isairline
    return None, None
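CRAWL_SITE_CODES is defined elsewhere; judging by the attribute accesses, each entry carries dom_int, site, the crawler's function name, and an isairline flag. A hypothetical sketch of its shape:

from collections import namedtuple

SiteCode = namedtuple('SiteCode', 'dom_int site func isairline')
CRAWL_SITE_CODES = [
    SiteCode('D', 'IP', 'crawling_IP_dom', False),  # illustrative entries only
    SiteCode('I', 'IP', 'crawling_IP_int', False),
    SiteCode('D', '7C', 'crawling_7C', True),
]

Inside the loop, globals().get(site_code.func) would resolve the same names without eval.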
Example No. 8
def get_scrap_site_func(dom_int, site):
    for site_code in SCRAP_SITE_CODES:
        #log(site_code)
        if site_code.dom_int == dom_int and site_code.site == site:
            try:
                log(site_code)
                func = eval(site_code.func)
            except NameError as e:
                log(e, level=logging.ERROR)
                return None
            return func
    return None
Example No. 9
def crawling_KE_dom(dpt, arr, dpt_date):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling koreanair homepage schedule site')
    session_url = "https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange"
    session_head = {
        'Referer':
        'https://kr.koreanair.com/korea/ko.html',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }

    ## RESTful endpoint, e.g. https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-25-2017-0000
    url = "https://www.koreanair.com/api/fly/revenue/from/{dpt}/to/{arr}/on/{mm}-{dd}-{yyyy}-0000"
    url_param = {
        'dpt': dpt,
        'arr': arr,
        'yyyy': dpt_date[:4],
        'mm': dpt_date[4:6],
        'dd': dpt_date[6:]
    }
    url = url.format(**url_param)
    head = {
        'page-id': '/booking/dow.html',  ## required header
        'uidd': '83^51%8638461@384712',  ## required header
        'Referer':
        'https://www.koreanair.com/korea/ko/booking/dow.html',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    param = {  ## parameters are fixed; only '_' carries the query-time timestamp
        'flexDays': '2',
        'scheduleDriven': 'false',
        'purchaseThirdPerson': '',
        'domestic': 'true',
        'isUpgradeableCabin': 'false',
        'adults': '1',
        'children': '0',
        'infants': '0',
        'cabinClass': 'ECONOMY',
        'adultDiscounts': '',
        'adultInboundDiscounts': '',
        'childDiscounts': '',
        'childInboundDiscounts': '',
        'infantDiscounts': '',
        'infantInboundDiscounts': '',
        '_': str(int(datetime.now().timestamp())),
    }
    return session_crawling(session_url,
                            url,
                            param,
                            session_head=session_head,
                            head=head,
                            method='get',
                            json=False)
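The yyyymmdd input is re-split into the mm-dd-yyyy form the API expects, for example:

## '20170525' -> https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-25-2017-0000
raw = crawling_KE_dom('GMP', 'CJU', '20170525')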
Example No. 10
def crawling_IP_dom(airline, dpt, arr, dpt_date):
    log('Crawling Interpark domestic schedule site')
    url = 'http://domair.interpark.com/api/booking/airJourney.do'
    param = {
        'format': 'json',  # JSON response
        'dep': dpt,  # origin
        'arr': arr,  # destination
        'depDate': dpt_date,  # departure date, yyyymmdd
        'airlineCode': airline,  # airline code
        'tripDivi': '0',  # trip type: 0 one-way, 1 round-trip
        'adt': '1',
        'chd': '0',
        'inf': '0'
    }
    return simple_crawling(url, param, method='get', json=False)
Example No. 11
def crawling_BX_dom(dpt, arr, dpt_date):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling airbusan homepage schedule site')
    url = "https://www.airbusan.com/web/bookingApi/domesticAvail"
    head = {
        'Referer': 'https://www.airbusan.com/web/individual/booking/domestic',
    }
    param = {
        'depDate': dpt_date,
        'depCity': dpt,
        'arrCity': arr,
        'bookingCategory': 'Individual',
        'foc': 'N',
        'bookingClass': 'ES'
    }

    return simple_crawling(url, param, head=head, method='get', json=False)
Example No. 12
def crawling_TW(dpt, arr, dpt_date, dom_int):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling twayair homepage schedule site')
    session_url = "https://www.twayair.com/booking/availabilityList.do"
    session_head = {
        'Referer': 'https://www.twayair.com/main.do',
    }

    url = 'https://www.twayair.com/booking/ajax/searchAvailability.do'
    head = {
        'Referer':
        'https://www.twayair.com/booking/availabilityList.do',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    param = {
        'origin': dpt,
        'destination': arr,
        'origin1': dpt,
        'destination1': arr,
        'origin2': arr,
        'destination2': dpt,
        'onwardDateStr': dpt_date,
        'returnDateStr': dpt_date,
        'today': datetime.today().strftime('%Y%m%d'),
        #'searchAvailId':searchAvailId,
        #'currencyCode':'KRW',          'pointOfPurchase':'KR',
        'travelType': 'OW',  # 'RT' for round-trip
        'domesticYn': dom_int,  ## 'Y' domestic, 'N' international
        'paxTypeCountStr': '1,0,0',
        'searchType': 'byDate',
        'orderByOW': '',
        'orderByRT': '',
        'fareBasisCodeOW': '',
        'fareBasisCodeRT': '',
        'arrivCntryCode': '',
        'promotionCode': '',
    }
    return session_crawling(session_url,
                            url,
                            param,
                            session_head=session_head,
                            head=head,
                            method='get',
                            json=False)
Example No. 13
def crawling_ZE_dom_int(dpt, arr, dpt_date):
    ## fetch one-way domestic (or international) fares for the given origin, destination, and departure date
    log('Crawling eastarjet homepage schedule site')
    session_url = "https://www.eastarjet.com/newstar/PGWBA00001"
    session_head = {
        'Referer':
        'https://www.eastarjet.com/newstar/PGWBA00001',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    sess = requests.Session()
    sess.get(session_url, headers=session_head)
    time.sleep(1)  ## wait one second before the search call
    url = 'https://www.eastarjet.com/json/dataService'
    head = {
        'Referer':
        'https://www.eastarjet.com/newstar/PGWBA00002',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }

    flightSearch = '{"viewType":"","fly_type":"2","person1":"1","person2":"0","person3":"0",\
    "residentCountry":"KR","currency":"","promotion_cd":"",\
    "flySection":[{"departure_cd":"{departure_cd}","arrival_cd":"{arrival_cd}","departure_date_cd":"{departure_date_cd}"}]}'

    #req = requests.post(url,json=payload,headers=head)
    payload = get_ZE_payload(
        flightSearch.replace('{departure_cd}',
                             dpt).replace('{arrival_cd}',
                                          arr).replace('{departure_date_cd}',
                                                       dpt_date))
    return jsonpayload_crawling(url,
                                payload,
                                session=sess,
                                head=head,
                                method='post',
                                json=False)
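Because the template string contains literal JSON braces, str.format would trip over them, hence the chained str.replace calls. An equivalent sketch that sidesteps templating by serializing a dict (dpt, arr, and dpt_date as in the function above):

import json

flight_search = {
    "viewType": "", "fly_type": "2",
    "person1": "1", "person2": "0", "person3": "0",
    "residentCountry": "KR", "currency": "", "promotion_cd": "",
    "flySection": [{"departure_cd": dpt, "arrival_cd": arr,
                    "departure_date_cd": dpt_date}],
}
payload = get_ZE_payload(json.dumps(flight_search))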
Example No. 14
def scraping_IP_dom(head, raw_data):
    ## validate raw_data, then scrape the rows and return them as a list
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)  # safe wrapper around json.loads (see Example No. 18)
    if raw_json is None:  ## parse error
        return None
    if raw_json['replyHeader']['errorCode'] == '1':  ## error response
        log('## Error : crawling data not found!', logging.WARNING)
        return None
    loop_field = "['replyAvailFare']['availFareSet']"
    parse_info = {
        'airline':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['carCode']",  #i
        'date':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['depDate']",  #i
        'flt':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['mainFlt']",  #i
        'dpt':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['depCity']",  #i
        'dpt_time':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['depTime']",  #i
        'arr':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['arrCity']",  #i
        'arr_time':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['arrTime']",  #i
        'fare1':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['fare']",  #i,j
        'fare2':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['fare']",  #i,j+1
        'tax':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['airTax']",  #i
        'fuel':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['fuelChg']",  #i
        'seat1':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['noOfAvailSeat']",  #i,j
        'seat2':
        "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['noOfAvailSeat']",  #i,j+1
    }

    parsed_list = parsing_json_data_to_dict(raw_json, loop_field, parse_info)
    if len(parsed_list) == 0:
        log('scraping data not found!', logging.WARNING)
        return None
    scraped_list = [[d['airline'], d['flt'], d['date'], d['dpt'], d['arr'],
                     d['dpt_time'], d['arr_time'], d['fare1'], d['fuel'],
                     d['tax'], d['seat1']] for d in parsed_list] + \
                   [[d['airline'], d['flt'], d['date'], d['dpt'], d['arr'],
                     d['dpt_time'], d['arr_time'], d['fare2'], d['fuel'],
                     d['tax'], d['seat2']] for d in parsed_list]
    log('end scraping Interpark crawled data')
    return scraped_list
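parsing_json_data_to_dict is defined elsewhere; given the string paths above, a plausible sketch evaluates each template against the parsed JSON, one row per availFareSet entry. This is hypothetical and the real helper may differ:

def parsing_json_data_to_dict(raw_json, loop_field, parse_info):
    items = eval('raw_json' + loop_field)  # e.g. raw_json['replyAvailFare']['availFareSet']
    parsed = []
    for i in range(len(items)):
        row = {}
        for key, path in parse_info.items():
            try:
                row[key] = eval('raw_json' + path.format(i))
            except (KeyError, IndexError, TypeError):
                row[key] = None  # tolerate missing class details
        parsed.append(row)
    return parsed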
Example No. 15
def move_scraped_file(file, fold):
    if file is None or fold is None:
        log('check your file or folder![{},{}]'.format(file, fold),
            level=logging.WARNING)
        return None
    ## check that both the file and the target folder exist
    if not os.path.isfile(file):
        log('file not found![{}]'.format(file), level=logging.WARNING)
        return None
    if not os.path.isdir(fold):
        log('folder not found![{}]'.format(fold), level=logging.WARNING)
        return None
    ## if a file with the same name already exists in the target folder, delete it first
    if os.path.isfile(os.path.join(fold, os.path.split(file)[-1])):
        os.remove(os.path.join(fold, os.path.split(file)[-1]))
    return move_file(file, fold)
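move_file is an external helper; a minimal sketch, assuming it wraps shutil.move and returns the destination path:

import os
import shutil

def move_file(file, fold):
    ## hypothetical sketch of the helper move_scraped_file delegates to
    return shutil.move(file, os.path.join(fold, os.path.basename(file)))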
Example No. 16
def email_multi_sender(to_list, subject, html, attachs=None):
    log('start multi email sender')
    for to in to_list:
        email_sender(to, subject, html, attachs=attachs)
    log('end multi email sender')
Example No. 17
def get_crawled_file_list(crawl_dir):
    file_list = get_files(crawl_dir, check=FILE_NAME_HEAD)
    if file_list is None:
        log('there are no files to scrape in the {} folder!'.format(crawl_dir),
            level=logging.WARNING)
    return file_list
Example No. 18
def json_loads(json_str):
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as je:  # subclass of ValueError (Python 3.5+)
        log(je, logging.ERROR)
        return None
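A usage sketch:

json_loads('{"a": 1}')  # -> {'a': 1}
json_loads('not json')  # logs a JSONDecodeError, returns None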