def pusanUrl(name, url): subMenuUrls = clsSubMenuUrls() subMenuUrls.name = name subMenuUrls.url = url print subMenuUrls.name.decode('utf-8') + ' : ' + subMenuUrls.url #print >> exceptFile, subMenuUrls + ' : ' + subMenuUrls.url detailProductPusanHtml = savefilegethtml.getHtml(subMenuUrls.url, 'class="container', '<!-- end .ot_tab_style1 -->', 'onlinetourSubPagePusan.txt') for subMenu in detailProductPusanHtml: #if subMenu.find('<li class="">') > -1 and subMenu.find('전체') < 0: if subMenu.find('<li') > -1 and subMenu.find('<a') > -1 and (subMenu.find('전체') < 0 or subMenuUrls.url.find('D50') > -1 or subMenuUrls.url.find('D60') > -1 or subMenuUrls.url.find('D70') > -1): detailRegionUrls = clsDetailRegionUrls() detailRegionUrls.name = tourUtil.getRemovedHtmlTag(subMenu).strip() detailRegionUrls.url = mainUrl + tourUtil.getTagAttr(subMenu, 'a', 'href') subMenuUrls.detailRegionList.append(detailRegionUrls) print detailRegionUrls.name.decode('utf-8') + ' : ' + detailRegionUrls.url #print >> exceptFile, detailRegionUrls.name + ' : ' + detailRegionUrls.url return subMenuUrls
print >> exceptFile, "Start : %s" % time.ctime() print menulist con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g") # 메뉴에 다 잘들어 갔나 확인.. for level1 in menulist: for level2 in level1.tourkindgroup: for level3 in level2.regionUrlGroup: print 'Depart City : ' + level1.departCity + ', TourKind:' + level2.tourkind + ', Region : ' + level3.region + '(' + level3.url + ')' try: print >> exceptFile, level3.url regionHtml = savefilegethtml.getHtml( level3.url, '<div class="leftArea">', '</nav><!-- //lnb -->', 'tourbaksaRegionHtml.txt', '', '') for each_line in regionHtml: if each_line.find('<li class="') > -1 and each_line.find( 'M1=') > -1: #print each_line cityClass = clsCityUrlGroup() cityClass.city = each_line.split('</a>')[0].split( ">")[2] cityClass.url = homepageUrl + each_line.split( "href='")[1].split("'")[0] print 'Depart Url : ' + cityClass.url try: print >> exceptFile, cityClass.url
self.url = '' self.code = '' self.productCode = '' self.airchk = '' self.city = '' def toString(self): val = 'name:' + self.productname + ',price:' + self.price + ',dDay:' + self.dDay + ',dTime:' + self.dTime + ',aDay:' + self.aDay + ',aTime:' + self.aTime + ',night:' + self.night + ',city:' + self.city val += ',period:' + self.period + ',airCode:' + self.airCode + ',status:' + self.status + ',url:' + self.url + ',code:' + self.code + ',productCode:' + self.productCode + ',airchk:' + self.airchk return val tourkind = 'W' period = '' detailHtml = savefilegethtml.getHtml( 'http://www.naeiltour.co.kr/jagiya/honeymoon/program_include.asp?good_cd=550201054&sel_ym=201407', '', '', 'naeiltourDetailHtml.txt') departDayList = list() for detail_each_line in detailHtml: if detail_each_line.find("fn_goodDetail('") > -1: departDayList.append( detail_each_line.split("fn_goodDetail('")[1].split("'")[0]) # 출발 가능 날짜에 항공사 찾아오는 부분 productCls = clsProduct() for dayInfo in departDayList: productListUrl = 'http://www.naeiltour.co.kr/jagiya/honeymoon/program_include.asp?good_cd=550201054&sel_day=20140712' print 'ProductListUrl : ' + productListUrl productListHtml = savefilegethtml.getHtml(productListUrl, '', '', 'naeiltourproductListHtml.txt')
self.period = '' self.code = '' self.status = '' self.name = '' self.price = '' self.booked = '' self.url = '' def toString(self): return 'Code:' + self.code + ',sDay:' + self.sDay + ',sTime:' + self.sTime + ',aDay:' + self.aDay + ',aTime:' + self.aTime + ',aCode:' + self.aCode + ',Period:' + self.period + ',status:' + self.status + ',name:' + self.name + ',price:' + self.price + ',booked:' + self.booked print '==============================================================================================================' print 'PackageList Url : http://www.verygoodtour.com/Product/Package/PackageList?MenuCode=1010103&PageSize=200' regionHtml = savefilegethtml.getHtml( 'http://www.verygoodtour.com/Product/Package/PackageList?MenuCode=1010103&PageSize=200', '<div id="list_proviewM">', 'function BingPaging()', 'regionHtml.txt') #regionHtml = urllib2.urlopen(menu.url).read() #regionHtml = regionHtml[regionHtml.find('<div id="list_proviewM">'):regionHtml.find('function BingPaging()')] #regionHtmlFile = open('regionHtml.txt', 'w') #print >> regionHtmlFile, regionHtml #regionHtmlFile.close() #regionHtml = open('regionHtml.txt') mastercode = '' for each_line in regionHtml: if each_line.find('img_ov_text2') > -1: #Detail Product List 가져오는 URL... mastercode = each_line.split("('")[1].split("')")[0] elif each_line.find('class="title"') > -1:
self.aCode = '' self.period = '' self.code = '' self.status = '' self.name = '' self.price = '' self.booked = '' self.url = '' def toString(self): return 'Code:'+self.code+',sDay:'+self.sDay+',sTime:'+self.sTime+',aDay:'+self.aDay+',aTime:'+self.aTime+',aCode:'+self.aCode+',Period:'+self.period+',status:'+self.status+',name:'+self.name+',price:'+self.price+',booked:'+self.booked print '==============================================================================================================' print 'PackageList Url : http://www.verygoodtour.com/Product/Package/PackageList?MenuCode=101092103&PageSize=200' regionHtml = savefilegethtml.getHtml('http://www.verygoodtour.com/Product/Package/PackageList?MenuCode=101092103&PageSize=200', '<div id="list_proviewM">', 'function BingPaging()', 'regionHtml.txt') #regionHtml = urllib2.urlopen(menu.url).read() #regionHtml = regionHtml[regionHtml.find('<div id="list_proviewM">'):regionHtml.find('function BingPaging()')] #regionHtmlFile = open('regionHtml.txt', 'w') #print >> regionHtmlFile, regionHtml #regionHtmlFile.close() #regionHtml = open('regionHtml.txt') mastercode = '' con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g") for each_line in regionHtml: if each_line.find('img_ov_text2') > -1: #Detail Product List 가져오는 URL... mastercode = each_line.split("('")[1].split("')")[0] elif each_line.find('class="title"') > -1:
return 'No' # 시간 변수들.. tourAgency = 'vgtour' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFile = open('verygoodtourException' + scrappingStartTime + '.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime() sitemapUrl = 'http://www.verygoodtour.com/Content/SiteMap.html' sitemapHtml = savefilegethtml.getHtml(sitemapUrl, '', '', 'sitemapHtml.txt') #sitemapHtml = urllib2.urlopen(sitemapUrl).read() #sitemapHtmlFile = open('sitemapHtml.txt', 'w') #print >> sitemapHtmlFile, sitemapHtml #sitemapHtmlFile.close() #sitemapHtml = open('sitemapHtml.txt') #menulist = list() # 메뉴 Url 들을 담고 있을 clsProduct들의 List tourType = '' departCity = '' region = '' depthIdx = 0 idx = 0 productList = list() # 중복으로 같은 상품 안가져 오도록 List에 넣고.. 없는 것들만 들고오도록.. productList.append('START') con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g") try:
self.subMenuList = list() # 시간 변수들.. tourAgency = 'lottetour' mainUrl = 'http://www.lottetour.com' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFileName = 'lottetourException' + scrappingStartTime + '.txt' exceptFile = open(exceptFileName, 'w') print >> exceptFile, "Start : %s" % time.ctime() mainpageHtml = savefilegethtml.getHtml('http://www.lottetour.com/welcome', '<nav>', '</nav>', 'onlinetourMainPage.txt') urlDict = dict() urlDict['package'] = 'package' urlDict['free'] = 'fit' urlDict['honeymoon'] = 'honeymoon' urlDict['golf'] = 'golf' urlDict['cruise'] = 'cruise' startComment = False firstOversea = True subMenu = False mainList = list() clsMain = mainCls() clsSubMenu = subMenuCls()
targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") homepageUrl = 'http://www.naeiltour.co.kr' exceptFile = open('naeiltourException' + scrappingStartTime + '.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime() #배낭여행 시작======================================== print '@@@@@@@@@@@@@ backpack start @@@@@@@@@@@@@@@@@@@@' print >> exceptFile, '@@@@@@@@@@@@@ backpack start @@@@@@@@@@@@@@@@@@@@' backpackUrl = 'http://www.naeiltour.co.kr/backpack/eu_main.asp?area=40' mainHtml = savefilegethtml.getHtml(backpackUrl, '<div id="left_mn">', '<div id="left_mn2">', 'naeiltourbackpackHtml.txt') comment = False backpackMenuList = list() #menu들 List country = '' try: for each_line in mainHtml: if each_line.find('<!--') > -1: comment = True if comment == False and each_line.find('/backpack/list.asp?') > -1: backpackRegionClass = clsRegionUrl() backpackRegionClass.country = country backpackRegionClass.url = homepageUrl + each_line.split( 'href="')[1].split('"')[0]
): # 출발일정 눌렀을때 List가 펼쳐지는 경우랑, 페이지가 이동하는 경우 나눔.. detailProductUrl = '' #if package.menuCode == 'A01': detailProductUrl = 'http://www.ybtour.co.kr/Goods/' + urlMap[ package. menuCode] + '/inc_evList_ajax.asp?goodCD=' + detailProduct + '&startDT=' + targetYear + targetMonth #detailProductUrl = 'http://www.ybtour.co.kr/Goods/overseas/inc_evList_ajax.asp?goodCD=150201119&startDT=201408' #if detailProductUrl == 'http://www.ybtour.co.kr/Goods/Overseas/inc_evList_ajax.asp?goodCD=JAA2013113&startDT=201407': #print '' print 'Detail Product URL : ' + detailProductUrl print >> exceptFile, 'Detail Product URL : ', detailProductUrl detailProductList = savefilegethtml.getHtml( detailProductUrl, '', '', 'ybtourTempFile.txt') try: # 2014. 06. 29. 여행상품명에서 국가, 도시코드 가져오는 부분으로 적용.. #codeLists = codes.getCityCode(productNameList[codeIdx], sub2package.menuName, productCommentList[codeIdx], subpackage.menuName) codeLists = codes.getCityCode( productNameList[codeIdx], productCommentList[codeIdx]) cityList = codeLists[0] nationList = codeLists[1] continentList = codeLists[2] siteList = codeList[ 3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(
detailProduct = pcode.split('s')[1] detailProductUrl = '' if not (package.menuCode == 'A03' or package.menuCode == 'A06'): # 출발일정 눌렀을때 List가 펼쳐지는 경우랑, 페이지가 이동하는 경우 나눔.. detailProductUrl = '' #if package.menuCode == 'A01': detailProductUrl = 'http://www.ybtour.co.kr/Goods/' + urlMap[package.menuCode] + '/inc_evList_ajax.asp?goodCD=' + detailProduct + '&startDT=' + targetYear + targetMonth #detailProductUrl = 'http://www.ybtour.co.kr/Goods/overseas/inc_evList_ajax.asp?goodCD=150201119&startDT=201408' #if detailProductUrl == 'http://www.ybtour.co.kr/Goods/Overseas/inc_evList_ajax.asp?goodCD=JAA2013113&startDT=201407': #print '' print 'Detail Product URL : ' + detailProductUrl print >> exceptFile, 'Detail Product URL : ', detailProductUrl detailProductList = savefilegethtml.getHtml(detailProductUrl, '', '', 'ybtourTempFile.txt') try: # 2014. 06. 29. 여행상품명에서 국가, 도시코드 가져오는 부분으로 적용.. #codeLists = codes.getCityCode(productNameList[codeIdx], sub2package.menuName, productCommentList[codeIdx], subpackage.menuName) codeLists = codes.getCityCode(productNameList[codeIdx], productCommentList[codeIdx]) cityList = codeLists[0] nationList = codeLists[1] continentList = codeLists[2] siteList = codeList[3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(nationList) == 0 and len(continentList) == 0: codeList = codes.getCityCode(sub2package.menuName) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2]
def insertData(productCls, detailUrl, regionUrl, tourAgency, kind, dmst_div): print 'Product Url : ', productCls.url print >> exceptFile, 'Product Url : ', productCls.url # 2014. 7. 23. 카테고리의 국가는 넣지 않기로 함... #codeList = codes.getCityCode(productCls.name.decode('utf-8'), detailUrl.name.decode('utf-8'), regionUrl.name.decode('utf-8')) codeList = codes.getCityCode(productCls.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(nationList) == 0 and len(continentList) == 0: codeList = codes.getCityCode(detailUrl.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 # Master 상품 입력 query = tourQuery.getMasterMergeQuery(tourAgency, productCls.code, productCls.name.decode('utf-8'), menu.kind, dmst_div, '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() # Region Data 삭제 codes.insertRegionData(tourAgency, productCls.code, cityList, nationList, continentList, siteList) detailProductHtml = savefilegethtml.getHtml(productCls.url, '', '', 'tour2000DetailHtml'+targetMonth+'.txt') pl10Idx = 0 for detailProduct in detailProductHtml: try: if detailProduct.find('<span class="text_pink">') > -1 and detailProduct.find('<a href=') < 0: detailCls = clsProductDetail() numArray = tourUtil.getNumArray(detailProduct) if len(numArray) > 7: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = numArray[2] + numArray[3] detailCls.aDay = targetYear + numArray[4] + numArray[5] detailCls.aTime = numArray[6] + numArray[7] elif len(numArray) == 4: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = '' detailCls.aDay = targetYear + numArray[2] + numArray[3] detailCls.aTime = '' elif detailProduct.find('onError') > -1: detailCls.airCode = detailProduct[detailProduct.find('.gif') - 4:detailProduct.find('.gif') - 2] elif 
detailProduct.find('text_redB') > -1: numArray = tourUtil.getNumArray(tourUtil.getRemovedHtmlTag(detailProduct)) for num in numArray: detailCls.price += num elif detailProduct.find('</a></td>') > -1: if detailProduct.find('text_pink') > -1: detailCls.status = codes.getStatus('tour2000', '예약가능') elif detailProduct.find('text_blau') > -1: detailCls.status = codes.getStatus('tour2000', '출발가능') elif detailProduct.find('text_green') > -1: detailCls.status = codes.getStatus('tour2000', '대기예약') elif detailProduct.find('text_grayLightSmall') > -1: detailCls.status = codes.getStatus('tour2000', '예약마감') detailCls.remainSeat = tourUtil.getRemovedHtmlTag(detailProduct).replace("'", "").strip() elif detailProduct.find('<p class="pl10">') > -1: if pl10Idx == 0: pl10Idx = 1 detailCls.productName = tourUtil.getRemovedHtmlTag(detailProduct).replace("'", "").strip() detailCls.url = mainUrl + tourUtil.getTagAttr(detailProduct, 'a', 'href') detailCls.productSeq = detailProduct.split('ev_ym=')[1].split('&')[0] + detailProduct.split('ev_seq=')[1].split('&')[0] else: pl10Idx = 0 if detailCls.productName.find('부산출발') > -1: departCity = 'PUS' else: departCity = 'ICN' query = tourQuery.getDetailMergeQuery(tourAgency, productCls.code, detailCls.productSeq, detailCls.productName.decode('utf-8'), detailCls.dDay+detailCls.dTime, detailCls.aDay+detailCls.aTime, productCls.period, departCity, '', detailCls.airCode, detailCls.status, detailCls.url, detailCls.price, '0', '0', '0', '', productCls.night) #print >> exceptFile, query #print query cursor = con.cursor() cursor.execute(query) con.commit() #break except: print >> exceptFile, 'detail parcing Error : ', sys.exc_info()[0] pass
# 시간 변수들.. tourAgency = 'tour2000' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFileName = 'tour2000Exception' + scrappingStartTime + '.txt' exceptFile = open(exceptFileName, 'w') print >> exceptFile, "Start : %s" % time.ctime() mainUrl = 'http://www.tour2000.co.kr' mainHtml = savefilegethtml.getHtml('http://www.tour2000.co.kr/index.asp', '<div class="navi_wholeMenu_box">', '<!-- navi_wholeMenu_wrapper// -->', 'tour2000mainHtml.txt') startMainUrl = False menuList = list() MenuUrlCls = clsMenuUrls() for each_line in mainHtml: if each_line.find('text_pinkB14') > -1: MenuUrlCls = clsMenuUrls() MenuUrlCls.kind = codes.getTourKind(tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip()) startMainUrl = True # 해외여행(패키지), 허니문, 골프, 국내(제주) 제외하고는 일단 패스 if MenuUrlCls.kind == 'A' or MenuUrlCls.kind == 'F' or MenuUrlCls.kind == 'H' or MenuUrlCls.kind == 'No': continue if startMainUrl and each_line.find('<li>') > -1:
def insertData(productCls, detailUrl, regionUrl, tourAgency, kind, dmst_div): print 'Product Url : ', productCls.url print >> exceptFile, 'Product Url : ', productCls.url # 2014. 7. 23. 카테고리의 국가는 넣지 않기로 함... #codeList = codes.getCityCode(productCls.name.decode('utf-8'), detailUrl.name.decode('utf-8'), regionUrl.name.decode('utf-8')) codeList = codes.getCityCode(productCls.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 if len(cityList) == 0 and len(nationList) == 0 and len(continentList) == 0: codeList = codes.getCityCode(detailUrl.name.decode('utf-8')) cityList = codeList[0] nationList = codeList[1] continentList = codeList[2] siteList = codeList[3] # 2014. 8. 3. site 추가 # Master 상품 입력 query = tourQuery.getMasterMergeQuery(tourAgency, productCls.code, productCls.name.decode('utf-8'), menu.kind, dmst_div, '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() # Region Data 삭제 codes.insertRegionData(tourAgency, productCls.code, cityList, nationList, continentList, siteList) detailProductHtml = savefilegethtml.getHtml( productCls.url, '', '', 'tour2000DetailHtml' + targetMonth + '.txt') pl10Idx = 0 for detailProduct in detailProductHtml: try: if detailProduct.find( '<span class="text_pink">') > -1 and detailProduct.find( '<a href=') < 0: detailCls = clsProductDetail() numArray = tourUtil.getNumArray(detailProduct) if len(numArray) > 7: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = numArray[2] + numArray[3] detailCls.aDay = targetYear + numArray[4] + numArray[5] detailCls.aTime = numArray[6] + numArray[7] elif len(numArray) == 4: detailCls.dDay = targetYear + numArray[0] + numArray[1] detailCls.dTime = '' detailCls.aDay = targetYear + numArray[2] + numArray[3] detailCls.aTime = '' elif detailProduct.find('onError') > -1: detailCls.airCode = detailProduct[detailProduct.find('.gif') - 4:detailProduct.find('.gif' ) - 2] elif 
detailProduct.find('text_redB') > -1: numArray = tourUtil.getNumArray( tourUtil.getRemovedHtmlTag(detailProduct)) for num in numArray: detailCls.price += num elif detailProduct.find('</a></td>') > -1: if detailProduct.find('text_pink') > -1: detailCls.status = codes.getStatus('tour2000', '예약가능') elif detailProduct.find('text_blau') > -1: detailCls.status = codes.getStatus('tour2000', '출발가능') elif detailProduct.find('text_green') > -1: detailCls.status = codes.getStatus('tour2000', '대기예약') elif detailProduct.find('text_grayLightSmall') > -1: detailCls.status = codes.getStatus('tour2000', '예약마감') detailCls.remainSeat = tourUtil.getRemovedHtmlTag( detailProduct).replace("'", "").strip() elif detailProduct.find('<p class="pl10">') > -1: if pl10Idx == 0: pl10Idx = 1 detailCls.productName = tourUtil.getRemovedHtmlTag( detailProduct).replace("'", "").strip() detailCls.url = mainUrl + tourUtil.getTagAttr( detailProduct, 'a', 'href') detailCls.productSeq = detailProduct.split( 'ev_ym=')[1].split('&')[0] + detailProduct.split( 'ev_seq=')[1].split('&')[0] else: pl10Idx = 0 if detailCls.productName.find('부산출발') > -1: departCity = 'PUS' else: departCity = 'ICN' query = tourQuery.getDetailMergeQuery( tourAgency, productCls.code, detailCls.productSeq, detailCls.productName.decode('utf-8'), detailCls.dDay + detailCls.dTime, detailCls.aDay + detailCls.aTime, productCls.period, departCity, '', detailCls.airCode, detailCls.status, detailCls.url, detailCls.price, '0', '0', '0', '', productCls.night) #print >> exceptFile, query #print query cursor = con.cursor() cursor.execute(query) con.commit() #break except: print >> exceptFile, 'detail parcing Error : ', sys.exc_info()[0] pass
tourAgency = 'tour2000' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFileName = 'tour2000Exception' + scrappingStartTime + '.txt' exceptFile = open(exceptFileName, 'w') print >> exceptFile, "Start : %s" % time.ctime() mainUrl = 'http://www.tour2000.co.kr' mainHtml = savefilegethtml.getHtml('http://www.tour2000.co.kr/index.asp', '<div class="navi_wholeMenu_box">', '<!-- navi_wholeMenu_wrapper// -->', 'tour2000mainHtml.txt') startMainUrl = False menuList = list() MenuUrlCls = clsMenuUrls() for each_line in mainHtml: if each_line.find('text_pinkB14') > -1: MenuUrlCls = clsMenuUrls() MenuUrlCls.kind = codes.getTourKind( tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip()) startMainUrl = True # 해외여행(패키지), 허니문, 골프, 국내(제주) 제외하고는 일단 패스 if MenuUrlCls.kind == 'A' or MenuUrlCls.kind == 'F' or MenuUrlCls.kind == 'H' or MenuUrlCls.kind == 'No':
self.period = '' self.airCode = '' self.status = '' self.url = '' self.code = '' self.productCode = '' self.airchk = '' self.city = '' def toString(self): val = 'name:'+self.productname+',price:'+self.price+',dDay:'+self.dDay+',dTime:'+self.dTime+',aDay:'+self.aDay+',aTime:'+self.aTime + ',night:'+self.night+',city:'+self.city val += ',period:'+self.period+',airCode:'+self.airCode+',status:'+self.status+',url:'+self.url+',code:'+self.code+',productCode:'+self.productCode+',airchk:'+self.airchk return val tourkind = 'F' period = '' detailHtml = savefilegethtml.getHtml('http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=2302009532&sel_ym=201407', '', '', 'naeiltourDetailHtml.txt') print 'http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=23020145&sel_ym=201407' departDayList = list() for detail_each_line in detailHtml: if detail_each_line.find("fn_goodDetail('") > -1: departDayList.append(detail_each_line.split("fn_goodDetail('")[1].split("'")[0]) # 출발 가능 날짜에 항공사 찾아오는 부분 productCls = clsProduct() for dayInfo in departDayList: productListUrl = 'http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=2302009532&sel_day=20140708' print 'ProductListUrl : ' + productListUrl productListHtml = savefilegethtml.getHtml(productListUrl, '', '', 'naeiltourproductListHtml.txt') print 'ProductListUrl : ' + productListUrl for product in productListHtml:
# 시간 변수들.. tourAgency = 'onlinetour' mainUrl = 'http://www.onlinetour.co.kr/web/tour' mainUrl2 = 'http://www.onlinetour.co.kr' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFileName = 'onlinetourException' + scrappingStartTime + '.txt' exceptFile = open(exceptFileName, 'w') print >> exceptFile, "Start : %s" % time.ctime() mainpageHtml = savefilegethtml.getHtml('http://www.onlinetour.co.kr/web/home', '<li id="n_pack">', '<!--}} ot_navi-->', 'onlinetourMainPage.txt') # URL 쑤셔넣는 부분... mainMenuList = list() mainMenuUrls = clsMenuUrls() subMenuUrls = clsSubMenuUrls() detailRegionUrls = clsDetailRegionUrls() chkFree = False chkDomestic = False for menuList in mainpageHtml: try: #print menuList if menuList.find('<a href=') > -1 and menuList.find('<li>') < 0: mainMenuUrls = clsMenuUrls() mainMenuUrls.name = tourUtil.getRemovedHtmlTag(menuList).strip() mainMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href')
self.url = '' self.code = '' self.productCode = '' self.airchk = '' self.city = '' def toString(self): val = 'name:' + self.productname + ',price:' + self.price + ',dDay:' + self.dDay + ',dTime:' + self.dTime + ',aDay:' + self.aDay + ',aTime:' + self.aTime + ',night:' + self.night + ',city:' + self.city val += ',period:' + self.period + ',airCode:' + self.airCode + ',status:' + self.status + ',url:' + self.url + ',code:' + self.code + ',productCode:' + self.productCode + ',airchk:' + self.airchk return val tourkind = 'F' period = '' detailHtml = savefilegethtml.getHtml( 'http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=2302009532&sel_ym=201407', '', '', 'naeiltourDetailHtml.txt') print 'http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=23020145&sel_ym=201407' departDayList = list() for detail_each_line in detailHtml: if detail_each_line.find("fn_goodDetail('") > -1: departDayList.append( detail_each_line.split("fn_goodDetail('")[1].split("'")[0]) # 출발 가능 날짜에 항공사 찾아오는 부분 productCls = clsProduct() for dayInfo in departDayList: productListUrl = 'http://www.naeiltour.co.kr/friday/program/program_include.asp?good_cd=2302009532&sel_day=20140708' print 'ProductListUrl : ' + productListUrl productListHtml = savefilegethtml.getHtml(productListUrl, '', '',
targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") homepageUrl = 'http://www.naeiltour.co.kr' exceptFile = open('naeiltourException' + scrappingStartTime + '.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime() #배낭여행 시작======================================== print '@@@@@@@@@@@@@ backpack start @@@@@@@@@@@@@@@@@@@@' print >> exceptFile, '@@@@@@@@@@@@@ backpack start @@@@@@@@@@@@@@@@@@@@' backpackUrl = 'http://www.naeiltour.co.kr/backpack/eu_main.asp?area=40' mainHtml = savefilegethtml.getHtml(backpackUrl, '<div id="left_mn">', '<div id="left_mn2">', 'naeiltourbackpackHtml.txt') comment = False backpackMenuList = list() #menu들 List country = '' try: for each_line in mainHtml: if each_line.find('<!--') > -1: comment = True if comment == False and each_line.find('/backpack/list.asp?') > -1: backpackRegionClass = clsRegionUrl() backpackRegionClass.country = country backpackRegionClass.url = homepageUrl + each_line.split('href="')[1].split('"')[0] if each_line.find('">-') > -1: backpackRegionClass.region = each_line.split('">-')[1].split('<')[0].strip()
def searchProduct(filename, productcode, productName, period, targetUrl, listUrl, productDetailUrl, departCity, tourkind, dmst_div, country='', city='', comment=''): detailHtml = savefilegethtml.getHtml(targetUrl, '', '', 'naeiltourDetailHtml.txt') print >> filename, 'TargetUrl : ', targetUrl departDayList = list() for detail_each_line in detailHtml: if detail_each_line.find("fn_goodDetail('") > -1: departDayList.append( detail_each_line.split("fn_goodDetail('")[1].split("'")[0]) # 출발 가능 날짜에 항공사 찾아오는 부분 try: con = cx_Oracle.connect( "bigtour/[email protected]:1521/ora11g") codeList = codes.getCityCode(productName, city, comment, country) cityList = codeList[0] nationList = codeList[1] #print nationList #print cityList #print nationList #print cityList query = savefilegethtml.getMasterMergeQueryTest1( 'naeiltour', productcode, '', country, city, productName, tourkind, dmst_div, comment, '', nationList, cityList) # A : 해외(Abroad) #query = savefilegethtml.getMasterMergeQuery('naeiltour', productcode, '', country, city, productName, tourkind, dmst_div, comment, '', nationList, cityList) # A : 해외(Abroad) #print query cursor = con.cursor() cursor.execute(query) con.commit() productCls = clsProduct() for dayInfo in departDayList: try: productListUrl = listUrl + productcode + '&sel_day=' + dayInfo print 'ProductListUrl : ' + productListUrl productListHtml = savefilegethtml.getHtml( productListUrl, '', '', 'naeiltourproductListHtml.txt') print >> filename, 'ProductListUrl : ' + productListUrl for product in productListHtml: try: if product.find("fn_price('") > -1: productCls = clsProduct() productSplit = product.split('fn_price')[1].split( "'") productCls.productCode = productSplit[1] productCls.dDay = productSplit[3] productCls.code = productSplit[5] if tourkind == 'W' or tourkind == 'G': productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] else: productCls.airCode = productSplit[ 7] # 한글 공항코드... but 우리는 영문2자리 공항코드가 필요하다... 
productCls.price = productSplit[9].replace(',', '') #print productSplit[11] productCls.status = codes.getStatus( 'naeiltour', productSplit[11] ) # 공백 : 예약가능, 03 : 마감임박, 05 : 마감 #if tourkind == 'W': #productCls.city = productSplit[13] productCls.url = productDetailUrl + productcode + '&sel_day=' + productCls.dDay productCls.productname = productName productCls.dTime = '' productCls.aDay = '' productCls.aTime = '' if period != '' and tourkind == 'F': if product.find('<td width="134">') > -1: productCls.period = period #print productCls.toString() productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', '') #query = savefilegethtml.getDetailMergeQuery('naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', '') #print query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'F': if product.find('<td class="FRIDAYSPACING" >' ) > -1 and product.find( '.gif') > -1: productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] if product.find('idth="220">') > -1: """ # 날짜 가져오는 부분... 
종류가 너무 많아서 좀 수정 if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() if period == '' and tourkind == 'W': if product.find('valign="middle"') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = 
'0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'G': if product.find('valign="middle"') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('(')[1])[0] productCls.period = re.findall(r"\d", product.split('(')[1])[1] elif product.find('[') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('[')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break if period == '' and tourkind == 'D': if product.find('<td class="FRIDAYSPACING" >' ) > -1 and product.find( '.gif') > -1: productCls.airCode = product[ product.find('.gif') - 2:product.find('.gif')] if 
product.find('idth="220">') > -1: """ if product.find('(') > -1: productCls.night = re.findall(r"\d", product.split('[')[1])[0] productCls.period = re.findall(r"\d", product.split('')[1])[1] else: productCls.night = re.findall(r"\d", product.split('COLOR=#FF7A73>')[1])[0] productCls.period = re.findall(r"\d", product.split('COLOR=#FF7A73>')[1])[1] """ splitText = product.decode('cp949').split(u'박') if len(splitText) > 1: tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[0])) productCls.night = tmpText[ len(tmpText) - 1].encode('utf-8') tmpText = re.findall( u'[\^0-9]+', tourUtil.getRemovedHtmlTag( splitText[1])) productCls.period = tmpText[0].encode( 'utf-8') else: productCls.night = '0' productCls.period = '0' ############################################################################################# if product.find('COLOR=BLUE>') > -1: departCity = 'PUS' else: departCity = 'ICN' #print productCls.toString() query = savefilegethtml.getDetailMergeQueryTest1( 'naeiltour', productcode, productCls.code, productCls.productname, '20' + productCls.dDay, '', productCls.period, departCity, '', productCls.airCode, productCls.status, productCls.url, productCls.price, '0', '0', '0', '', productCls.night) #print 'Query : ' + query cursor = con.cursor() cursor.execute(query) con.commit() #break except cx_Oracle.DatabaseError as err1: print >> filename, err1 pass except: print >> filename, "Depth3 Error:", sys.exc_info()[0] pass #break except: print >> filename, "Depth2 Error:", sys.exc_info()[0] pass except: print >> filename, "Depth1 Error:", sys.exc_info()[0] pass finally: con.close()
def _airCodeFromGif(product, default=''):
    """Return the two characters that precede '.gif' in *product*.

    Falls back to *default* when the line has no '.gif' (or fewer than two
    characters before it).  The unguarded slice used previously turned a
    failed ``find`` (-1) into ``product[-3:-1]``, i.e. two arbitrary
    characters from the end of the line.
    """
    gifPos = product.find('.gif')
    if gifPos < 2:
        return default
    return product[gifPos - 2:gifPos]


def _parseNightAndPeriod(product):
    """Return ``(night, period)`` parsed around the u'박' marker of *product*.

    *product* is a cp949-encoded byte string.  ``night`` is the last number
    found before the first u'박', ``period`` the first number found after
    it; both are returned utf-8 encoded.  Returns ``('0', '0')`` when the
    marker is absent.  Raises IndexError when the marker is present but a
    number is missing on either side (handled by the caller's per-line
    error handler).
    """
    # Same pattern value as the original u'[\^0-9]+' literal, spelled with an
    # explicit backslash so it is not an invalid escape sequence.
    # NOTE(review): the class also matches a literal '^'; '[0-9]+' was
    # probably intended -- confirm before changing.
    numberPattern = u'[\\^0-9]+'
    splitText = product.decode('cp949').split(u'박')
    if len(splitText) <= 1:
        return '0', '0'
    beforeMarker = re.findall(numberPattern, tourUtil.getRemovedHtmlTag(splitText[0]))
    night = beforeMarker[-1].encode('utf-8')
    afterMarker = re.findall(numberPattern, tourUtil.getRemovedHtmlTag(splitText[1]))
    period = afterMarker[0].encode('utf-8')
    return night, period


def _executeAndCommit(con, query):
    """Run *query* on a fresh cursor, commit, and always close the cursor."""
    cursor = con.cursor()
    try:
        cursor.execute(query)
        con.commit()
    finally:
        cursor.close()


def _mergeDetail(con, productcode, productCls, departCity, night):
    """Build the detail merge query for *productCls* and execute it."""
    query = savefilegethtml.getDetailMergeQueryTest1(
        'naeiltour', productcode, productCls.code, productCls.productname,
        '20' + productCls.dDay, '', productCls.period, departCity, '',
        productCls.airCode, productCls.status, productCls.url,
        productCls.price, '0', '0', '0', '', night)
    _executeAndCommit(con, query)


def _fillFromPriceLine(productCls, product, productcode, productName, productDetailUrl, tourkind):
    """Populate *productCls* in place from a line containing ``fn_price('...')``.

    The quoted arguments of the ``fn_price`` call are read positionally.
    Raises IndexError when the call has fewer arguments than expected.
    """
    productSplit = product.split('fn_price')[1].split("'")
    productCls.productCode = productSplit[1]
    productCls.dDay = productSplit[3]
    productCls.code = productSplit[5]
    if tourkind == 'W' or tourkind == 'G':
        productCls.airCode = _airCodeFromGif(product, productCls.airCode)
    else:
        # Original author's note: this argument is a Korean code, but the
        # two-letter English code is what is needed; for these tour kinds it
        # is replaced later from a '.gif' name on a following line.
        productCls.airCode = productSplit[7]
    productCls.price = productSplit[9].replace(',', '')
    # Raw status per the original note -- blank: bookable, 03: closing soon,
    # 05: closed.
    productCls.status = codes.getStatus('naeiltour', productSplit[11])
    productCls.url = productDetailUrl + productcode + '&sel_day=' + productCls.dDay
    productCls.productname = productName
    productCls.dTime = ''
    productCls.aDay = ''
    productCls.aTime = ''


def searchProduct(filename, productcode, productName, period, targetUrl, listUrl, productDetailUrl, departCity, tourkind, dmst_div, country='', city='', comment=''):
    """Scrape one naeiltour product and merge its master/detail rows into Oracle.

    Fetches *targetUrl*, collects every departure day passed to
    ``fn_goodDetail('...')``, merges the master row, then for each departure
    day fetches ``listUrl + productcode + '&sel_day=' + day`` and merges one
    detail row per product found in that listing.

    Parameters:
        filename: despite the name, an open writable file object; it is the
            target of every ``print >>`` log statement.
        productcode, productName: identify the product; passed to the merge
            query builders and used to build URLs.
        period: when non-empty (and tourkind is 'F') it is used as the
            product period as-is; when empty, night/period are parsed from
            the listing HTML.
        targetUrl, listUrl, productDetailUrl: page URL / URL prefixes.
        departCity: departure city code; for tourkind 'D' it is overridden
            per row ('PUS' when the line contains 'COLOR=BLUE>', else 'ICN').
        tourkind: one of 'F', 'W', 'G', 'D'; selects the HTML markers used.
            Any other combination merges the master row only.
        dmst_div, country, city, comment: forwarded to the master merge
            query / city-code lookup.

    Returns None.  Errors are logged to *filename* at three depths (whole
    run, per departure day, per listing line) and do not propagate, except
    for KeyboardInterrupt/SystemExit which are no longer swallowed.
    """
    detailHtml = savefilegethtml.getHtml(targetUrl, '', '', 'naeiltourDetailHtml.txt')
    print >> filename, 'TargetUrl : ', targetUrl
    departDayList = [
        detailLine.split("fn_goodDetail('")[1].split("'")[0]
        for detailLine in detailHtml
        if detailLine.find("fn_goodDetail('") > -1
    ]

    # Bound before the try so the finally clause never hits an unbound name
    # when the connect call itself fails.
    con = None
    try:
        # NOTE(review): credentials are hard-coded in the connect string.
        con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g")
        codeList = codes.getCityCode(productName, city, comment, country)
        cityList = codeList[0]
        nationList = codeList[1]
        # The "Test1" query builders are used here; the original kept calls to
        # getMasterMergeQuery/getDetailMergeQuery commented out alongside.
        query = savefilegethtml.getMasterMergeQueryTest1(
            'naeiltour', productcode, '', country, city, productName,
            tourkind, dmst_div, comment, '', nationList, cityList)
        _executeAndCommit(con, query)

        # Look up the products (and their airline) for each departure day.
        # productCls persists across listing lines: the fn_price line creates
        # it, and marker lines that follow complete it and trigger the merge.
        productCls = clsProduct()
        for dayInfo in departDayList:
            try:
                productListUrl = listUrl + productcode + '&sel_day=' + dayInfo
                print('ProductListUrl : ' + productListUrl)
                productListHtml = savefilegethtml.getHtml(productListUrl, '', '', 'naeiltourproductListHtml.txt')
                print >> filename, 'ProductListUrl : ' + productListUrl
                for product in productListHtml:
                    try:
                        if product.find("fn_price('") > -1:
                            productCls = clsProduct()
                            _fillFromPriceLine(productCls, product, productcode,
                                               productName, productDetailUrl, tourkind)

                        if period != '':
                            if tourkind == 'F' and product.find('<td width="134">') > -1:
                                productCls.period = period
                                productCls.airCode = _airCodeFromGif(product, productCls.airCode)
                                _mergeDetail(con, productcode, productCls, departCity, '')
                        elif tourkind == 'F' or tourkind == 'D':
                            if product.find('<td class="FRIDAYSPACING" >') > -1:
                                productCls.airCode = _airCodeFromGif(product, productCls.airCode)
                            if product.find('idth="220">') > -1:
                                productCls.night, productCls.period = _parseNightAndPeriod(product)
                                if tourkind == 'D':
                                    if product.find('COLOR=BLUE>') > -1:
                                        departCity = 'PUS'
                                    else:
                                        departCity = 'ICN'
                                _mergeDetail(con, productcode, productCls, departCity, productCls.night)
                        elif tourkind == 'W' or tourkind == 'G':
                            if product.find('valign="middle"') > -1:
                                productCls.night, productCls.period = _parseNightAndPeriod(product)
                                _mergeDetail(con, productcode, productCls, departCity, productCls.night)
                    except cx_Oracle.DatabaseError as err1:
                        print >> filename, err1
                    except Exception:
                        print >> filename, "Depth3 Error:", sys.exc_info()[0]
            except Exception:
                print >> filename, "Depth2 Error:", sys.exc_info()[0]
    except Exception:
        print >> filename, "Depth1 Error:", sys.exc_info()[0]
    finally:
        if con is not None:
            con.close()
exceptFile = open('tourbaksaException'+scrappingStartTime+'.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime() print menulist con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g") # 메뉴에 다 잘들어 갔나 확인.. for level1 in menulist: for level2 in level1.tourkindgroup: for level3 in level2.regionUrlGroup: print 'Depart City : ' + level1.departCity + ', TourKind:' + level2.tourkind + ', Region : ' + level3.region + '(' + level3.url + ')' try: print >> exceptFile, level3.url regionHtml = savefilegethtml.getHtml(level3.url, '<div class="leftArea">', '</nav><!-- //lnb -->', 'tourbaksaRegionHtml.txt', '', '') for each_line in regionHtml: if each_line.find('<li class="') > -1 and each_line.find('M1=') > -1: #print each_line cityClass = clsCityUrlGroup() cityClass.city = each_line.split('</a>')[0].split(">")[2] cityClass.url = homepageUrl + each_line.split("href='")[1].split("'")[0] print 'Depart Url : ' + cityClass.url try: print >> exceptFile, cityClass.url departListHtml = savefilegethtml.getHtml(cityClass.url, '<div class="list" id="itemList" >', '', 'tourbaksaDepartListHtml.txt') try: productList = clsProductList()
else: return 'No' # 시간 변수들.. tourAgency = 'vgtour' targetYear = sys.argv[1] targetMonth = sys.argv[2] #targetYear = '2014' #targetMonth = '07' scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") exceptFile = open('verygoodtourException' + scrappingStartTime + '.txt', 'w') print >> exceptFile, "Start : %s" % time.ctime() sitemapUrl = 'http://www.verygoodtour.com/Content/SiteMap.html' sitemapHtml = savefilegethtml.getHtml(sitemapUrl, '', '', 'sitemapHtml.txt') #sitemapHtml = urllib2.urlopen(sitemapUrl).read() #sitemapHtmlFile = open('sitemapHtml.txt', 'w') #print >> sitemapHtmlFile, sitemapHtml #sitemapHtmlFile.close() #sitemapHtml = open('sitemapHtml.txt') #menulist = list() # 메뉴 Url 들을 담고 있을 clsProduct들의 List tourType = '' departCity = '' region = '' depthIdx = 0 idx = 0 productList = list() # 중복으로 같은 상품 안가져 오도록 List에 넣고.. 없는 것들만 들고오도록.. productList.append('START') con = cx_Oracle.connect("bigtour/[email protected]:1521/ora11g") try: