def scraping_IP_int(head, raw_data):
    """Scrape an Interpark international crawl result into a list of fare rows.

    Args:
        head: unused here; kept for signature parity with the other scraping_* functions.
        raw_data: raw JSON text returned by the Interpark international crawl.

    Returns:
        A list of rows [carrier, flight, date, dep, arr, dep_time, arr_time,
        fare, fuel_charge, tax, avail_seats], or None when the payload cannot
        be parsed or contains no goods list.
    """
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)
    if raw_json is None:  # JSON parse error
        return None
    # When there is no data the API returns a bare string instead of an object.
    if isinstance(raw_json['Responses']['GoodsList'], str):
        return None
    raw_list = []
    fare_goods = raw_json['Responses']['GoodsList']['Goods']
    if isinstance(fare_goods, dict):  # a single item arrives as a dict, not a list
        fare_goods = [fare_goods]
    for fare_set in fare_goods:
        air_itns = fare_set['AirAvail']['StartAvail']['AirItn']
        if isinstance(air_itns, dict):  # same single-item normalization
            air_itns = [air_itns]
        for air_itn in air_itns:
            seg = air_itn['seg_detail_t']
            raw_list.append([
                seg['car_code'],
                seg['main_flt'],
                fare_set['StartDT'],
                seg['dep_city'],
                seg['arr_city'],
                seg['dep_date_time'][8:],  # drop the yyyymmdd prefix, keep the time part
                seg['arr_date_time'][8:],
                fare_set['SaleFare'],
                fare_set['Qcharge'],
                fare_set['Tax'],
                seg['no_of_avail_seat']])
    log('end scraping Interpark crawled data')
    return raw_list
def crawling_IP_int(airline, dpt, arr, dpt_date):
    """Fetch one-way international availability from Interpark's smart-air search.

    Args:
        airline: carrier code filter.
        dpt: departure city code.
        arr: arrival city code.
        dpt_date: departure date.

    Returns:
        The raw response text with its first and last character stripped.
    """
    log('Crawling Interpark domastic schedule site')
    url = "http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx"
    param = dict(
        FLEX='N',
        Soto='N',
        ptype='I',
        SeatAvail='Y',
        comp='Y',
        JSON='Y',
        enc='u',
        BEST='Y',
        Change='',
        StayLength='',
        SeatType='A',
        trip='OW',        # one-way / round-trip selector
        adt='1',
        chd='0',
        inf='0',
        SplitNo='100',    # page size of the result set
        AirLine=airline,
        dep0=dpt,         # departure
        arr0=arr,         # arrival
        depdate0=dpt_date)  # departure date
    # Strip the leading and trailing character; they break the JSON format.
    return simple_crawling(url, param, method='get', json=False)[1:-1]
def crawling_LJ_dom(dpt, arr, dpt_date):
    """Fetch one-way domestic fares from the Jin Air homepage for a route/date."""
    log('Crawling jinair homepage schedule site')
    url = "https://www.jinair.com/RSV/RSV_WebResult.aspx"
    head = {
        'Referer': 'https://www.jinair.com/RSV/Reservation.aspx',
    }
    fields = {
        'TASK': 'NormalFare',
        'OWListId': 'ctl00_ContentPlaceHolder1_fltlstDownLine',
        'OWDepDate': dpt_date,
        'OWDep': dpt,
        'OWArr': arr,
        'MemberClass': 'I',
        'DisCode': '',
        'MbrGb': 'N',
    }
    # XML request envelope expected by the RSV endpoint.
    payload = (
        '<REQUEST><TASK>{TASK}</TASK><OWListId>{OWListId}</OWListId><OWDepDate>{OWDepDate}</OWDepDate>'
        '<OWDep>{OWDep}</OWDep><OWArr>{OWArr}</OWArr><MemberClass>{MemberClass}</MemberClass>'
        '<DisCode>{DisCode}</DisCode><MbrGb>{MbrGb}</MbrGb></REQUEST>'
    ).format(**fields)
    return payload_crawling(url, payload, head=head, method='post', json=False)
def email_sender(to, subject, html, attachs=None):
    """Send an HTML email with optional file attachments via the company SMTP relay.

    Args:
        to: recipient address.
        subject: subject line (UTF-8 encoded via Header).
        html: HTML body text.
        attachs: optional iterable of file paths to attach.
    """
    log(['receiver', to, ' ## email -', subject])
    email_user = '******'  # TODO: load sender account from configuration
    # SECURITY: hard-coded password committed to source. Currently unused because
    # login is disabled below, but it must be moved to secure configuration.
    email_pwd = 'bk@813102'
    msg = MIMEMultipart('alternative')
    msg['From'] = email_user
    msg['To'] = to
    msg['Subject'] = Header(s=subject, charset="utf-8")
    msg.attach(MIMEText(html, 'html', _charset="utf-8"))
    # Attachments
    if attachs:
        for attach in attachs:
            part = MIMEBase('application', 'octet-stream')
            with open(attach, 'rb') as fh:  # fix: previously leaked the file handle
                part.set_payload(fh.read())
            encoders.encode_base64(part)
            part.add_header(
                'Content-Disposition',
                'attachment; filename="%s"' % os.path.basename(attach))
            msg.attach(part)
    s = smtplib.SMTP("spam.eastarjet.com", 587)
    try:
        s.ehlo()
        # s.starttls()
        # s.ehlo()
        # s.login(email_user, email_pwd)
        s.sendmail(email_user, to, msg.as_string())
    finally:
        s.close()  # fix: close the connection even when sendmail raises
def crawling_7C(dpt, arr, dpt_date, dom_int):
    """Fetch one-way availability (domestic or international) from the Jeju Air site.

    Args:
        dpt: departure station code.
        arr: arrival station code.
        dpt_date: departure date.
        dom_int: route type — 'D' for domestic, 'I' for international.
    """
    log('Crawling jejuair homepage schedule site')
    # First hit the availability-init page to establish a session.
    session_url = "https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do"
    session_head = {
        'Referer': 'http://www.jejuair.net/jejuair/main.jsp',
    }
    url = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do'
    head = {
        'Referer': 'https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do',
    }
    param = dict(
        AdultPaxCnt='1',
        ChildPaxCnt='0',
        InfantPaxCnt='0',
        RouteType=dom_int,   # 'D' domestic, 'I' international
        SystemType='IBE',
        Language='KR',
        DepStn=dpt,
        ArrStn=arr,
        SegType='DEP',
        TripType='OW',
        DepDate=dpt_date,
        Index='1')           # used by the international search
    return session_crawling(session_url, url, param, session_head=session_head, head=head, method='post', json=False)
def crawling_func(func, *argv):
    """Invoke a crawling function, converting expected failures to their string form.

    TypeError and ConnectionError are treated as expected and returned as
    strings; anything else is logged and re-raised.
    """
    try:
        return func(*argv)
    except (TypeError, ConnectionError) as err:
        return str(err)
    except:
        # Unexpected failure: record it, then propagate to the caller.
        log("Unexpected error:", sys.exc_info()[0])
        raise
def get_crawl_site_func(dom_int, site):
    """Look up the crawling function registered for (dom_int, site) in CRAWL_SITE_CODES.

    Returns:
        (func, isairline) on success; (None, None) when no matching entry
        exists or the registered function name does not resolve.
    """
    for site_code in CRAWL_SITE_CODES:
        if site_code.dom_int == dom_int and site_code.site == site:
            try:
                log(site_code)
                # NOTE: eval of a registry-supplied name — safe only while
                # CRAWL_SITE_CODES is trusted internal data.
                func = eval(site_code.func)
            except NameError as e:
                log(e, level=logging.ERROR)
                # fix: callers unpack two values; a bare None would crash them
                return None, None
            return func, site_code.isairline
    return None, None
def get_scrap_site_func(dom_int, site):
    """Return the scraping function registered for (dom_int, site), or None."""
    for entry in SCRAP_SITE_CODES:
        if entry.dom_int != dom_int or entry.site != site:
            continue
        try:
            log(entry)
            # Resolve the registered function name to the actual callable.
            func = eval(entry.func)
        except NameError as err:
            log(err, level=logging.ERROR)
            return None
        return func
    return None
def crawling_KE_dom(dpt, arr, dpt_date):
    """Fetch one-way domestic availability from the Korean Air revenue API."""
    log('Crawling koreanair homepage schedule site')
    ua = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    session_url = "https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange"
    session_head = {
        'Referer': 'https://kr.koreanair.com/korea/ko.html',
        'User-Agent': ua,
    }
    # REST-style endpoint, e.g.
    # https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-25-2017-0000
    url = "https://www.koreanair.com/api/fly/revenue/from/{dpt}/to/{arr}/on/{mm}-{dd}-{yyyy}-0000".format(
        dpt=dpt,
        arr=arr,
        yyyy=dpt_date[:4],
        mm=dpt_date[4:6],
        dd=dpt_date[6:])
    head = {
        'page-id': '/booking/dow.html',   # required header
        'uidd': '83^51%8638461@384712',   # required header
        'Referer': 'https://www.koreanair.com/korea/ko/booking/dow.html',
        'User-Agent': ua,
    }
    # Query parameters are fixed; only '_' varies (request timestamp).
    param = {
        'flexDays': '2',
        'scheduleDriven': 'false',
        'purchaseThirdPerson': '',
        'domestic': 'true',
        'isUpgradeableCabin': 'false',
        'adults': '1',
        'children': '0',
        'infants': '0',
        'cabinClass': 'ECONOMY',
        'adultDiscounts': '',
        'adultInboundDiscounts': '',
        'childDiscounts': '',
        'childInboundDiscounts': '',
        'infantDiscounts': '',
        'infantInboundDiscounts': '',
        '_': str(int(datetime.now().timestamp())),
    }
    return session_crawling(session_url, url, param, session_head=session_head, head=head, method='get', json=False)
def crawling_IP_dom(airline, dpt, arr, dpt_date):
    """Fetch one-way domestic availability from Interpark's booking API."""
    log('Crawling Interpark domastic schedule site')
    url = 'http://domair.interpark.com/api/booking/airJourney.do'
    param = dict(
        format='json',        # response format
        dep=dpt,              # departure
        arr=arr,              # arrival
        depDate=dpt_date,     # departure date, yyyymmdd
        airlineCode=airline,  # carrier code
        tripDivi='0',         # 0 = one-way, 1 = round trip
        adt='1',
        chd='0',
        inf='0')
    return simple_crawling(url, param, method='get', json=False)
def crawling_BX_dom(dpt, arr, dpt_date):
    """Fetch one-way domestic availability from the Air Busan booking API."""
    log('Crawling airbusan homepage schedule site')
    url = "https://www.airbusan.com/web/bookingApi/domesticAvail"
    head = {
        'Referer': 'https://www.airbusan.com/web/individual/booking/domestic',
    }
    param = dict(
        depDate=dpt_date,
        depCity=dpt,
        arrCity=arr,
        bookingCategory='Individual',
        foc='N',
        bookingClass='ES')
    return simple_crawling(url, param, head=head, method='get', json=False)
def crawling_TW(dpt, arr, dpt_date, dom_int):
    """Fetch one-way availability (domestic or international) from the T'way site.

    Args:
        dpt: departure station code.
        arr: arrival station code.
        dpt_date: departure date.
        dom_int: 'Y' for a domestic route, 'N' for international.
    """
    log('Crawling twayair homepage schedule site')
    ua = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    # Establish a session on the availability page first.
    session_url = "https://www.twayair.com/booking/availabilityList.do"
    session_head = {
        'Referer': 'https://www.twayair.com/main.do',
    }
    url = 'https://www.twayair.com/booking/ajax/searchAvailability.do'
    head = {
        'Referer': 'https://www.twayair.com/booking/availabilityList.do',
        'User-Agent': ua,
    }
    param = {
        'origin': dpt,
        'destination': arr,
        'origin1': dpt,
        'destination1': arr,
        'origin2': arr,          # reverse leg fields (one-way search still sends them)
        'destination2': dpt,
        'onwardDateStr': dpt_date,
        'returnDateStr': dpt_date,
        'today': datetime.today().strftime('%Y%m%d'),
        'pointOfPurchase': 'KR',
        'travelType': 'OW',      # 'RT' for round trip
        'domesticYn': dom_int,   # 'Y' domestic, 'N' international
        'paxTypeCountStr': '1,0,0',  # adults,children,infants
        'searchType': 'byDate',
        'orderByOW': '',
        'orderByRT': '',
        'fareBasisCodeOW': '',
        'fareBasisCodeRT': '',
        'arrivCntryCode': '',
        'promotionCode': '',
    }
    return session_crawling(session_url, url, param, session_head=session_head, head=head, method='get', json=False)
def crawling_ZE_dom_int(dpt, arr, dpt_date):
    """Fetch one-way fares (domestic or international) from the Eastar Jet site.

    Opens a session against PGWBA00001, waits a second, then posts a JSON
    payload (built by get_ZE_payload) to the dataService endpoint.
    """
    log('Crawling eastarjet homepage schedule site')
    session_url = "https://www.eastarjet.com/newstar/PGWBA00001"
    session_head = {
        'Referer': 'https://www.eastarjet.com/newstar/PGWBA00001',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    sess = requests.Session()
    sess.get(session_url, headers=session_head)
    time.sleep(1)  # 1-second delay before the actual search request
    url = 'https://www.eastarjet.com/json/dataService'
    head = {
        'Referer': 'https://www.eastarjet.com/newstar/PGWBA00002',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    # JSON template; {departure_cd}/{arrival_cd}/{departure_date_cd} are filled
    # via str.replace below because str.format would choke on the JSON braces.
    flightSearch = '{"viewType":"","fly_type":"2","person1":"1","person2":"0","person3":"0",\
"residentCountry":"KR","currency":"","promotion_cd":"",\
"flySection":[{"departure_cd":"{departure_cd}","arrival_cd":"{arrival_cd}","departure_date_cd":"{departure_date_cd}"}]}'
    #req = requests.post(url,json=payload,headers=head)
    payload = get_ZE_payload(
        flightSearch.replace('{departure_cd}', dpt).replace('{arrival_cd}', arr).replace('{departure_date_cd}', dpt_date))
    return jsonpayload_crawling(url, payload, session=sess, head=head, method='post', json=False)
def scraping_IP_dom(head, raw_data):
    """Scrape an Interpark domestic crawl result into a list of fare rows.

    Args:
        head: unused here; kept for signature parity with the other scraping_* functions.
        raw_data: raw JSON text returned by the Interpark domestic crawl.

    Returns:
        A list of rows [carrier, flight, date, dep, arr, dep_time, arr_time,
        fare, fuel_charge, tax, avail_seats] — one row per fare class (the two
        classDetail entries produce two rows per segment) — or None on parse
        error, API error, or empty result.
    """
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)
    if raw_json is None:  # JSON parse error
        return None
    if raw_json['replyHeader']['errorCode'] == '1':  # API-side error
        print('## Error : crawling data not found!')
        return None
    # Path expressions consumed by parsing_json_data_to_dict; '{}' is the
    # per-item loop index within loop_field.
    loop_field = "['replyAvailFare']['availFareSet']"
    parse_info = {
        'airline': "['replyAvailFare']['availFareSet'][{}]['segFare']['carCode']",
        'date': "['replyAvailFare']['availFareSet'][{}]['segFare']['depDate']",
        'flt': "['replyAvailFare']['availFareSet'][{}]['segFare']['mainFlt']",
        'dpt': "['replyAvailFare']['availFareSet'][{}]['segFare']['depCity']",
        'dpt_time': "['replyAvailFare']['availFareSet'][{}]['segFare']['depTime']",
        'arr': "['replyAvailFare']['availFareSet'][{}]['segFare']['arrCity']",
        'arr_time': "['replyAvailFare']['availFareSet'][{}]['segFare']['arrTime']",
        'fare1': "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['fare']",
        'fare2': "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['fare']",
        'tax': "['replyAvailFare']['availFareSet'][{}]['segFare']['airTax']",
        'fuel': "['replyAvailFare']['availFareSet'][{}]['segFare']['fuelChg']",
        'seat1': "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['noOfAvailSeat']",
        'seat2': "['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['noOfAvailSeat']",
    }
    parsed_list = parsing_json_data_to_dict(raw_json, loop_field, parse_info)
    if len(parsed_list) == 0:
        log('scraping data not found!', logging.WARNING)
        return None
    # Emit one row per fare class: class 0 (fare1/seat1) then class 1 (fare2/seat2).
    scraped_list = [[d['airline'], d['flt'], d['date'], d['dpt'], d['arr'], d['dpt_time'], d['arr_time'],
                     d['fare1'], d['fuel'], d['tax'], d['seat1']] for d in parsed_list] + \
                   [[d['airline'], d['flt'], d['date'], d['dpt'], d['arr'], d['dpt_time'], d['arr_time'],
                     d['fare2'], d['fuel'], d['tax'], d['seat2']] for d in parsed_list]
    log('end scraping Interpark crawled data')
    # fix: 'return' and 'scraped_list' were split across two lines, so the
    # function fell through and the value was never returned.
    return scraped_list
def move_scraped_file(file, fold):
    """Move a scraped file into *fold*, replacing any same-named file already there.

    Returns the result of move_file, or None when either argument is missing
    or does not exist on disk.
    """
    if file is None or fold is None:
        log('check your file or fold![{},{}]'.format(file, fold), level=logging.WARNING)
        return None
    # Verify the source file and the destination folder both exist.
    if not os.path.isfile(file):
        log('file not found![{}]'.format(file), level=logging.WARNING)
        return None
    if not os.path.isdir(fold):
        log('fold not found![{}]'.format(fold), level=logging.WARNING)
        return None
    # Overwrite semantics: delete a pre-existing copy in the destination first.
    target = os.path.join(fold, os.path.split(file)[-1])
    if os.path.isfile(target):
        os.remove(target)
    return move_file(file, fold)
def email_multi_sender(to_list, subject, html, attachs=None):
    """Send the same email (and attachments) to every address in *to_list*."""
    log('start multi email sender')
    for recipient in to_list:
        email_sender(recipient, subject, html, attachs=attachs)
    log('end multi email sender')
def get_crawled_file_list(crawl_dir):
    """Return the crawled files awaiting scraping in *crawl_dir*.

    Files are matched by the FILE_NAME_HEAD prefix; returns None (after
    logging a warning) when nothing is found.
    """
    file_list = get_files(crawl_dir, check=FILE_NAME_HEAD)
    if file_list is None:
        # fix: report the directory actually searched (crawl_dir), not the
        # module-global CRAWL_DIR, which may differ from the argument.
        log('there is no files for scraping in the {} fold!'.format(crawl_dir), level=logging.WARNING)
    return file_list
def json_loads(json_str):
    """json.loads wrapper: return the parsed object, or None on invalid JSON.

    Decode failures are logged at ERROR level instead of propagating.
    """
    try:
        parsed = json.loads(json_str)
    except JSONDecodeError as err:
        log(err, logging.ERROR)
        return None
    return parsed