def getParam(self, line):
    """Pull the link parameters and the tour kind out of one HTML line.

    Sets ``self.startLocation``, ``self.id`` and ``self.type`` from the
    ``key=value`` pairs that are separated by ``amp;`` in *line*, sets
    ``self.MLoc`` from the ``MLoc=`` value that runs up to the first space,
    and finally sets ``self.tourType`` by passing the text of the first
    ``<span>`` element to ``codes.getTourKind('modetour', ...)``.
    """
    separator = 'amp;'

    def value_of(text, key, stop):
        # The value starts right after `key`; the end index is one less than
        # the position of `stop`, which drops the single character in front
        # of it (the '&' of '&amp;' in the usual case).
        begin = text.find(key) + len(key)
        end = text.find(stop) - 1
        return text[begin:end]

    leading_fields = (
        ('startLocation', 'startLocation='),
        ('id', 'id='),
        ('type', 'type='),
    )
    for attribute, key in leading_fields:
        setattr(self, attribute, value_of(line, key, separator))
        # Continue with the text that follows the separator just consumed.
        line = line[line.find(separator) + len(separator):]

    self.MLoc = value_of(line, 'MLoc=', ' ')
    self.tourType = codes.getTourKind(
        'modetour', line.split('<span>')[1].split('<')[0])
示例#2
0
def getTourType(idx):
    """Translate a menu position into a 'verygoodtour' tour-kind code.

    Positions 0 through 9 are looked up with ``codes.getTourKind``; every
    other value gives the string ``'No'``.
    """
    # Kind keys in menu order: the tuple index is the position it stands for.
    kind_keys = ('P', 'F', 'D', 'PUS', 'W', 'G',
                 'Luxury', 'Air', 'Hotel', 'Company')
    for position, kind_key in enumerate(kind_keys):
        if idx == position:
            return codes.getTourKind('verygoodtour', kind_key)
    return 'No'
def getTourType(idx):
    """Return the 'verygoodtour' tour-kind code for menu position *idx*.

    ``codes.getTourKind`` is consulted for positions 0-9; anything else
    falls through to the string ``'No'``.
    """
    # The key at index i is the one used for menu position i.
    keys_by_position = (
        'P', 'F', 'D', 'PUS', 'W',
        'G', 'Luxury', 'Air', 'Hotel', 'Company',
    )
    for slot, key in enumerate(keys_by_position):
        if idx == slot:
            return codes.getTourKind('verygoodtour', key)
    return 'No'
 def getParam(self, line):
     """Pull the link parameters and the tour kind out of one HTML line.

     Sets ``self.startLocation``, ``self.id`` and ``self.type`` from the
     ``key=value`` pairs that are separated by ``amp;`` in *line*, sets
     ``self.MLoc`` from the ``MLoc=`` value that runs up to the first space,
     and finally sets ``self.tourType`` by passing the text of the first
     ``<span>`` element to ``codes.getTourKind('modetour', ...)``.
     """
     separator = 'amp;'

     def value_of(text, key, stop):
         # The value starts right after `key`; the end index is one less
         # than the position of `stop`, which drops the single character in
         # front of it (the '&' of '&amp;' in the usual case).
         begin = text.find(key) + len(key)
         end = text.find(stop) - 1
         return text[begin:end]

     leading_fields = (
         ('startLocation', 'startLocation='),
         ('id', 'id='),
         ('type', 'type='),
     )
     for attribute, key in leading_fields:
         setattr(self, attribute, value_of(line, key, separator))
         # Continue with the text that follows the separator just consumed.
         line = line[line.find(separator) + len(separator):]

     self.MLoc = value_of(line, 'MLoc=', ' ')
     self.tourType = codes.getTourKind(
         'modetour', line.split('<span>')[1].split('<')[0])
# Walk the homepage HTML line by line and collect the menu tree into menulist.
# NOTE(review): productGroupCls and tourkindGroupCls are read inside this loop
# before the loop itself assigns them (first '<ul id="city', '<li>' or '</ul>'
# line) - confirm they are initialised earlier in the file.
for each_line in homepageHtml:
    # A '<ul id="city...' line starts a new departure-city group.
    if each_line.find('<ul id="city') > -1:
        # Keep the previous group only if it collected at least one tour kind.
        if len(productGroupCls.tourkindgroup) > 0:
            menulist.append(productGroupCls)
        productGroupCls = clsTotalGroup()
        # city1 -> ICN, city2 -> PUS, anything else -> TAE.
        if each_line.find('city1') > -1:
            productGroupCls.departCity = 'ICN'
        elif each_line.find('city2') > -1:
            productGroupCls.departCity = 'PUS'
        else:
            productGroupCls.departCity = 'TAE'
    # Tour-kind link: a submain URL, or an Areaindex link on a line without '<li>'.
    elif each_line.find('href="/submain/?') > -1 or each_line.find('href="/SubMain/index.asp?') > -1 or (each_line.find('<li>') < 0 and (each_line.find('Areaindex.asp') > -1 or each_line.find('areaindex.asp') > -1)):
        tourkindGroupCls = clsTourKindGroup()
        tourkindGroupCls.url = each_line.split('href="')[1].split('">')[0]
        #tourkindGroupCls.tourkind = each_line.split('>')[1].split('<')[0]  # Let's unify the code names..
        # The link text is stripped, decoded from cp949 and mapped to a tour-kind code.
        tourkindGroupCls.tourkind = codes.getTourKind('tourbaksa', each_line.split('>')[1].split('<')[0].strip().decode('cp949'))
    # Region link: a '<li>' line with no comment markers that mentions Areaindex or 'M1='.
    elif each_line.find('<li>') > -1 and each_line.find('<!--') < 0 and each_line.find('-->') < 0 and (each_line.find('Areaindex') > -1 or each_line.find('areaindex') > -1 or each_line.find('M1=') > -1):
        regionUrlGroupCls = clsRegionUrlGroup()
        regionUrlGroupCls.region = each_line.split('</a>')[0].split('">')[1]
        regionUrlGroupCls.url = homepageUrl + each_line.split('href="')[1].split('"')[0]
        tourkindGroupCls.regionUrlGroup.append(regionUrlGroupCls)
    # End of a list: attach the current tour-kind group unless it is already attached.
    elif each_line.find('</ul>') > -1:
        if productGroupCls.tourkindgroup.count(tourkindGroupCls) < 1:
            productGroupCls.tourkindgroup.append(tourkindGroupCls)
    # The 'etcMenu' marker: store the current departure-city group.
    elif each_line.find('class="etcMenu"') > -1:
        menulist.append(productGroupCls)
        
#homepageHtml.close()

# Open the exception log for this run and record the start time in it.
exceptFile = open('tourbaksaException'+scrappingStartTime+'.txt', 'w')
print >> exceptFile, "Start : %s" % time.ctime()
# startComment: set on a line containing '<!--', cleared on a line containing '-->'.
startComment = False
# firstOversea: True until the first line containing '해외패키지' has been handled.
firstOversea = True
# subMenu: True while a menu link without 'title' waits for its name on a later line.
subMenu = False
mainList = list()
clsMain = mainCls()
clsSubMenu = subMenuCls()
# Scan the main page line by line and fill clsMain.subMenuList.
for each_line in mainpageHtml:
    #print main
    # NOTE(review): because of the elif, a line holding both '<!--' and '-->'
    # leaves startComment True - confirm this is intended.
    if each_line.find('<!--') > -1 :
        startComment = True
    elif each_line.find('-->') > -1:
        startComment = False
        
    # First occurrence of '해외패키지': start a fresh main menu entry.
    if firstOversea and each_line.find('해외패키지') > -1:
        clsMain = mainCls()
        clsMain.name = codes.getTourKind('lottetour', 'package')
        firstOversea = False
    # A menu link ('<li' plus '<a href=') outside of a comment.
    elif not startComment and each_line.find('<li') > -1 and each_line.find('<a href=') > -1:
        clsSubMenu = subMenuCls()
        clsSubMenu.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        # If the line mentions 'title', the name is taken from this line;
        # otherwise it is picked up from a following 'title=' line.
        if each_line.find('title') > -1:
            clsSubMenu.name = tourUtil.getRemovedHtmlTag(each_line).strip()
            clsMain.subMenuList.append(clsSubMenu)
        else:
            subMenu = True
    # The deferred name line for the link seen just before.
    elif not startComment and subMenu and each_line.find('title=') > -1:
        clsSubMenu.name = each_line.split('>')[1].split('<')[0]
        clsMain.subMenuList.append(clsSubMenu)
        subMenu = False
    elif each_line.find('sub_depth0') > -1:
        if len(clsMain.subMenuList) > 0:
示例#7
0
     # NOTE(review): the header of the branch these statements belong to is not
     # part of this excerpt. menuList is used as a single line of HTML text
     # here (it is searched with .find) - confirm against the enclosing loop.
     mainMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href')
     mainMenuUrls.dmst_div = 'A'
     # Lines mentioning '부산출발' get departCity PUS, all others ICN.
     if menuList.find('부산출발') > -1:
         mainMenuUrls.departCity = 'PUS'
         # For Busan the per-region URLs are not exposed directly on the page, so they are added here by hand.
         mainMenuUrls.subMenuList.append(pusanUrl('동남아', 'http://www.onlinetour.co.kr/web/tour?region_cd=D10'))
         mainMenuUrls.subMenuList.append(pusanUrl('일본', 'http://www.onlinetour.co.kr/web/tour?region_cd=D20'))
         mainMenuUrls.subMenuList.append(pusanUrl('중국', 'http://www.onlinetour.co.kr/web/tour?region_cd=D30'))
         mainMenuUrls.subMenuList.append(pusanUrl('괌/사이판', 'http://www.onlinetour.co.kr/web/tour?region_cd=D40'))
         mainMenuUrls.subMenuList.append(pusanUrl('남태평양', 'http://www.onlinetour.co.kr/web/tour?region_cd=D50'))
         mainMenuUrls.subMenuList.append(pusanUrl('유럽/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D60'))
         mainMenuUrls.subMenuList.append(pusanUrl('미주/특수', 'http://www.onlinetour.co.kr/web/tour?region_cd=D70'))
     else:
         mainMenuUrls.departCity = 'ICN'
     
     # The tour-kind code is looked up from the menu name.
     mainMenuUrls.tourType = codes.getTourKind(tourAgency, mainMenuUrls.name)
     
     print mainMenuUrls.name.decode('utf-8') + ' : ' + mainMenuUrls.url + ' : ' + mainMenuUrls.tourType
     #print >> exceptFile, mainMenuUrls.name + ' : ' + mainMenuUrls.url + ' : ' + mainMenuUrls.tourType
     
     # Once a line mentioning '국내여행' is seen, chkDomestic disables the sub-menu branch below.
     if menuList.find('국내여행') > -1:
         chkDomestic = True
 # Sub-menu link: an '<li><a href=' line with 'region_cd=' that does not mention '전체'.
 elif not chkDomestic and menuList.find('<li><a href=') > -1 and menuList.find('region_cd=') > -1 and menuList.find('전체') < 0:
     subMenuUrls = clsSubMenuUrls()
     subMenuUrls.name = tourUtil.getRemovedHtmlTag(menuList).strip()
     subMenuUrls.url = tourUtil.getTagAttr(menuList, 'a', 'href')
     
     print subMenuUrls.name.decode('utf-8') + ' : ' + subMenuUrls.url
     #print >> exceptFile, 'subMenuUrls : ' + subMenuUrls.url
     
     # Fetch the sub-menu page; the arguments look like (url, start marker,
     # end marker, cache file name) - TODO confirm against savefilegethtml.getHtml.
     detailProductHtml = savefilegethtml.getHtml(subMenuUrls.url, 'class="container', '<!-- end .ot_tab_style1 -->', 'onlinetourSubPage.txt')
示例#8
0
     # NOTE(review): the enclosing loop and the first branch header are not part
     # of this excerpt.
     # city1 -> ICN, city2 -> PUS, anything else -> TAE.
     if each_line.find('city1') > -1:
         productGroupCls.departCity = 'ICN'
     elif each_line.find('city2') > -1:
         productGroupCls.departCity = 'PUS'
     else:
         productGroupCls.departCity = 'TAE'
 # Tour-kind link: a submain URL, or an Areaindex link on a line without '<li>'.
 elif each_line.find('href="/submain/?') > -1 or each_line.find(
         'href="/SubMain/index.asp?') > -1 or (
             each_line.find('<li>') < 0 and
             (each_line.find('Areaindex.asp') > -1
              or each_line.find('areaindex.asp') > -1)):
     tourkindGroupCls = clsTourKindGroup()
     tourkindGroupCls.url = each_line.split('href="')[1].split('">')[0]
     #tourkindGroupCls.tourkind = each_line.split('>')[1].split('<')[0]  # Let's unify the code names..
     # The link text is stripped, decoded from cp949 and mapped to a tour-kind code.
     tourkindGroupCls.tourkind = codes.getTourKind(
         'tourbaksa',
         each_line.split('>')[1].split('<')[0].strip().decode('cp949'))
 # Region link: a '<li>' line with no comment markers that mentions Areaindex or 'M1='.
 elif each_line.find('<li>') > -1 and each_line.find(
         '<!--') < 0 and each_line.find('-->') < 0 and (
             each_line.find('Areaindex') > -1
             or each_line.find('areaindex') > -1
             or each_line.find('M1=') > -1):
     regionUrlGroupCls = clsRegionUrlGroup()
     regionUrlGroupCls.region = each_line.split('</a>')[0].split('">')[1]
     regionUrlGroupCls.url = homepageUrl + each_line.split(
         'href="')[1].split('"')[0]
     tourkindGroupCls.regionUrlGroup.append(regionUrlGroupCls)
 # End of a list: attach the current tour-kind group unless it is already attached.
 elif each_line.find('</ul>') > -1:
     if productGroupCls.tourkindgroup.count(tourkindGroupCls) < 1:
         productGroupCls.tourkindgroup.append(tourkindGroupCls)
 elif each_line.find('class="etcMenu"') > -1:
    # Download the package list XML from mainUrl and parse it into a dict.
    print 'Main URL : ' + mainUrl
    print >> exceptFile, mainUrl
    packageListXml = urllib2.urlopen(mainUrl).read()
    packageListDict = xmltodict.parse(packageListXml)

    # Menu code -> URL path segment.
    urlMap = dict()
    urlMap['A01'] = 'overseas'  # overseas
    urlMap['A03'] = 'airtel'  # airtel
    urlMap['A06'] = 'Honeymoon'  # Honeymoon
    urlMap['A09'] = 'Overseas'  # Golf
    urlMap['A12'] = 'Overseas'  # domestic travel... but the URL uses Overseas
    urlMap['A15'] = 'Overseas'  # regional departure... but the URL uses Overseas
    urlMap['A18'] = 'Overseas'  # Cruise, but the URL uses Overseas

    # Menu code -> tour-kind code as returned by codes.getTourKind for 'ybtour'.
    packageMap = dict()
    packageMap['A01'] = codes.getTourKind('ybtour', 'P')
    packageMap['A03'] = codes.getTourKind('ybtour', 'F')
    packageMap['A06'] = codes.getTourKind('ybtour', 'W')
    packageMap['A09'] = codes.getTourKind('ybtour', 'G')
    packageMap['A12'] = codes.getTourKind('ybtour', 'D')
    packageMap['A15'] = codes.getTourKind('ybtour', 'PUS')
    packageMap['A18'] = codes.getTourKind('ybtour', 'C')

    # Copy each ROOT/List entry into a classPackage and collect it in ml1List.
    # NOTE(review): the except clause of this try is not part of this excerpt.
    for pack in packageListDict['ROOT']['List']:
        try:
            package = classPackage()
            package.menuCode = pack['MenuCD']
            package.menuName = pack['MenuNM']
            package.goodTypeCode = pack['GoodTypeCD']
            package.sbar = pack['SBAR']
            ml1List.append(package)
 # Download the package list XML from mainUrl and parse it into a dict.
 print 'Main URL : ' + mainUrl
 print >> exceptFile, mainUrl
 packageListXml = urllib2.urlopen(mainUrl).read()
 packageListDict = xmltodict.parse(packageListXml)
 
 # Menu code -> URL path segment.
 urlMap = dict()
 urlMap['A01'] = 'overseas'  # overseas
 urlMap['A03'] = 'airtel'    # airtel
 urlMap['A06'] = 'Honeymoon' # Honeymoon
 urlMap['A09'] = 'Overseas'  # Golf
 urlMap['A12'] = 'Overseas'  # domestic travel... but the URL uses Overseas
 urlMap['A15'] = 'Overseas'  # regional departure... but the URL uses Overseas
 urlMap['A18'] = 'Overseas'    # Cruise, but the URL uses Overseas
 
 # Menu code -> tour-kind code as returned by codes.getTourKind for 'ybtour'.
 packageMap = dict()
 packageMap['A01'] = codes.getTourKind('ybtour', 'P')
 packageMap['A03'] = codes.getTourKind('ybtour', 'F')
 packageMap['A06'] = codes.getTourKind('ybtour', 'W')
 packageMap['A09'] = codes.getTourKind('ybtour', 'G')
 packageMap['A12'] = codes.getTourKind('ybtour', 'D')
 packageMap['A15'] = codes.getTourKind('ybtour', 'PUS')
 packageMap['A18'] = codes.getTourKind('ybtour', 'C')
 
 # Copy each ROOT/List entry into a classPackage and collect it in ml1List.
 # NOTE(review): the except clause of this try is not part of this excerpt.
 for pack in packageListDict['ROOT']['List']:
     try:
         package = classPackage()
         package.menuCode = pack['MenuCD']
         package.menuName = pack['MenuNM']
         package.goodTypeCode = pack['GoodTypeCD']
         package.sbar = pack['SBAR']
         ml1List.append(package)
        print >> exceptFile, 'Start City : ', html.split('province_')[1].split('_')[0]
        return html.split('province_')[1].split('_')[0]
    else:
        print >> exceptFile, 'Start City : ICN'
        return 'ICN'

# Run parameters.
tourAgency = 'hanatour'
# Target year and month are taken from the command line
# (for a manual run they would look like '2014' and '07').
targetYear = sys.argv[1]
targetMonth = sys.argv[2]
scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

# One clsMenuUrls per top-level hanatour.com menu: (tour-kind code, page URL).
packageCls = clsMenuUrls(  # Package
    codes.getTourKind('hanatour', 'P'),
    'http://www.hanatour.com/asp/booking/oversea/oversea-main.asp?hanacode=overseas_M_bi')
honeymonCls = clsMenuUrls(  # Honeymoon
    codes.getTourKind('hanatour', 'W'),
    'http://www.hanatour.com/asp/booking/honeymoon/hr-main.asp?hanacode=main_q_pack_honey')
golfCls = clsMenuUrls(  # Golf
    codes.getTourKind('hanatour', 'G'),
    'http://www.hanatour.com/asp/booking/golf/golf-main.asp?hanacode=main_q_pack_golf')
cruiseUrl = clsMenuUrls(  # Cruise
    codes.getTourKind('hanatour', 'C'),
    'http://www.hanatour.com/asp/booking/cruise/cruise-main.asp?hanacode=main_q_pack_cruise')
jejuUrl = clsMenuUrls(  # Jeju
    codes.getTourKind('hanatour', 'D'),
    'http://www.hanatour.com/asp/booking/local/local-cheju.asp?hanacode=main_q_dom_jeju')

# The entries are kept in this order: package, honeymoon, golf, cruise, Jeju.
mainUrls = [packageCls, honeymonCls, golfCls, cruiseUrl, jejuUrl]

# A URL that contains productPackage/pk- and has etc_code=P is a package product.
# When pkg_mst_code is present, the page is already the detail view (the date-selection page); this case does occur occasionally.
# etc_code=W/P/A/B/K/Y/J/C  'W' : honeymoon, 'A': free, 'P' : package, 'B' : AirTel, 'K' : Tracking, 'Y' : Leports, 'J' : pilgrimage, 'C' : Cruise
#</form><span class="free_go">
# Exception log for this run, named after the scraping start time.
exceptFileName = 'tour2000Exception' + scrappingStartTime + '.txt'
exceptFile = open(exceptFileName, 'w')
print >> exceptFile, "Start : %s" % time.ctime()

mainUrl = 'http://www.tour2000.co.kr'

# The arguments look like (url, start marker, end marker, cache file name)
# - TODO confirm against savefilegethtml.getHtml.
mainHtml = savefilegethtml.getHtml(
    'http://www.tour2000.co.kr/index.asp',
    '<div class="navi_wholeMenu_box">',
    '<!-- navi_wholeMenu_wrapper// -->',
    'tour2000mainHtml.txt')

# Build menuList: one clsMenuUrls per menu heading, each with its sub-menu links.
startMainUrl = False
menuList = []
MenuUrlCls = clsMenuUrls()
for each_line in mainHtml:
    # A 'text_pinkB14' line is a menu heading: start a new menu entry.
    if 'text_pinkB14' in each_line:
        MenuUrlCls = clsMenuUrls()
        MenuUrlCls.kind = codes.getTourKind(
            tourAgency, tourUtil.getRemovedHtmlTag(each_line).strip())
        startMainUrl = True

    # For now, skip everything except overseas travel (package), honeymoon,
    # golf and domestic (Jeju).
    if MenuUrlCls.kind in ('A', 'F', 'H', 'No'):
        continue

    # Nothing else to do until a heading has opened a menu section.
    if not startMainUrl:
        continue

    # Each '<li>' line inside the section is one sub-menu link.
    if '<li>' in each_line:
        SubMenuCls = clsSubMenu()
        SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip()
        SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        MenuUrlCls.subMenuList.append(SubMenuCls)

    # A '</div>' line closes the section: store the finished menu entry.
    if '</div>' in each_line:
        startMainUrl = False
        menuList.append(MenuUrlCls)
mainUrl = 'http://www.tour2000.co.kr'

# The arguments look like (url, start marker, end marker, cache file name)
# - TODO confirm against savefilegethtml.getHtml.
mainHtml = savefilegethtml.getHtml('http://www.tour2000.co.kr/index.asp',
                                   '<div class="navi_wholeMenu_box">',
                                   '<!-- navi_wholeMenu_wrapper// -->',
                                   'tour2000mainHtml.txt')

# startMainUrl is True between a menu heading line and the next '</div>' line.
startMainUrl = False
menuList = list()
MenuUrlCls = clsMenuUrls()
for each_line in mainHtml:
    # A 'text_pinkB14' line is a menu heading: start a new menu entry.
    if each_line.find('text_pinkB14') > -1:
        MenuUrlCls = clsMenuUrls()
        MenuUrlCls.kind = codes.getTourKind(
            tourAgency,
            tourUtil.getRemovedHtmlTag(each_line).strip())
        startMainUrl = True

    # For now, skip everything except overseas travel (package), honeymoon, golf and domestic (Jeju).
    if MenuUrlCls.kind == 'A' or MenuUrlCls.kind == 'F' or MenuUrlCls.kind == 'H' or MenuUrlCls.kind == 'No':
        continue

    # Each '<li>' line inside the section is one sub-menu link.
    if startMainUrl and each_line.find('<li>') > -1:
        SubMenuCls = clsSubMenu()
        SubMenuCls.name = tourUtil.getRemovedHtmlTag(each_line).strip()
        SubMenuCls.url = mainUrl + tourUtil.getTagAttr(each_line, 'a', 'href')
        MenuUrlCls.subMenuList.append(SubMenuCls)

    # A '</div>' line closes the section.
    # NOTE(review): nothing in this excerpt appends MenuUrlCls to menuList
    # after the section closes - confirm whether that statement was lost.
    if startMainUrl and each_line.find('</div>') > -1:
        startMainUrl = False
    else:
        print >> exceptFile, 'Start City : ICN'
        return 'ICN'


# Time-related variables.
tourAgency = 'hanatour'
targetYear = sys.argv[1]
targetMonth = sys.argv[2]
#targetYear = '2014'
#targetMonth = '07'
scrappingStartTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

mainUrls = list()
packageCls = clsMenuUrls(codes.getTourKind(
    'hanatour', 'P'
), 'http://www.hanatour.com/asp/booking/oversea/oversea-main.asp?hanacode=overseas_M_bi'
                         )  # Package
honeymonCls = clsMenuUrls(codes.getTourKind(
    'hanatour', 'W'
), 'http://www.hanatour.com/asp/booking/honeymoon/hr-main.asp?hanacode=main_q_pack_honey'
                          )  # Honeymoon
golfCls = clsMenuUrls(codes.getTourKind(
    'hanatour', 'G'
), 'http://www.hanatour.com/asp/booking/golf/golf-main.asp?hanacode=main_q_pack_golf'
                      )  # Golf
cruiseUrl = clsMenuUrls(codes.getTourKind(
    'hanatour', 'C'
), 'http://www.hanatour.com/asp/booking/cruise/cruise-main.asp?hanacode=main_q_pack_cruise'
                        )  # Cruise
jejuUrl = clsMenuUrls(codes.getTourKind(