Example #1
def __init__(self):
         self.categoryDict={'音乐':'music','戏剧':"drama",'讲座':"salon",'聚会':"party",'电影':"film",'展览':"exhibition",'运动':"sports",'公益':"commonweal",'旅行':"travel","其他":"others"}
         self.cityList = ["beijing","shanghai","guangzhou"]
         self.timeType = "week"
         self.start_urls = []
         self.init_start_url()
         self.geoCoder = GeoCoder()
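
Note: this fragment's init_start_url is shown in full in Example #4; it expands cityList x categoryDict into per-city, per-category event-list URLs. A minimal standalone sketch of that expansion, using the values from the __init__ above:

# Sketch only: reproduces the start-url expansion outside the spider class.
categoryDict = {'音乐': 'music', '戏剧': 'drama'}  # abbreviated from the dict above
cityList = ['beijing', 'shanghai', 'guangzhou']
timeType = 'week'
start_urls = []
for city in cityList:
    for categoryCn, categoryEn in categoryDict.iteritems():
        start_urls.append('http://%s.douban.com/events/%s-%s' % (city, timeType, categoryEn))
# e.g. http://beijing.douban.com/events/week-music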
Example #2
def __init__(self):
     self.mcid={u'演唱会':1,u'音乐会':2,u'话剧歌剧':3,u'舞蹈芭蕾':4,u'曲苑杂坛':5,u'体育比赛':6,u'度假休闲':7}
     self.ccid={'流行':9,'摇滚':10,'民族':11,'音乐节':12,'其他演唱会':13,
               '管弦乐':14, '独奏':15,'室内乐及古乐':16, '声乐及合唱':17, '其他音乐会':18,
               '话剧 ':19,'歌剧 ':20,'歌舞剧 ':21,'音乐剧 ':22,'儿童剧 ':23,
               '舞蹈 ':24,'芭蕾 ':25,'舞剧 ':26,
               '相声 ':27,'魔术 ':28,'马戏 ':29,'杂技 ':30,'戏曲 ':31,'其他曲苑杂坛 ':32,
               '球类运动':33,'搏击运动':34,'其它竞技':35,
               '主题公园':36, '风景区':37, '展会':38, '特色体验':39, '温泉':40, '滑雪':41, '游览线路':42, '度假村':43, '代金券':44, '酒店住宿':45
               }
     self.mcidDict=~bidict(self.mcid)
     self.ccidDict=~bidict(self.ccid)
     self.geoCodingDict = {}
     self.geoCodingDictFile = "./geoCodingDict.txt"
     #self.readGeoCodingDict(self.geoCodingDictFile)
     self.geoCoder = GeoCoder()
     self.start_urls = []
     self.init_start_url()
     self.host = 'http://www.damai.cn'
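
The two ~bidict(...) lines build reverse lookups from numeric category ids back to their Chinese names. Older releases of the bidict package expose the inverse mapping through the ~ operator as used here (an assumption about the version in use; current releases spell it .inverse). A minimal sketch of the lookup the spider relies on:

# Sketch only: id -> name reverse lookup, as built in __init__ above.
from bidict import bidict

mcid = bidict({u'演唱会': 1, u'音乐会': 2})  # abbreviated
mcidDict = ~mcid              # inverse view: id -> name (old-bidict operator)
print mcidDict[1]             # -> 演唱会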
Example #3
class DamaiSpider(CrawlSpider):
    def __init__(self):
        self.mcid={u'演唱会':1,u'音乐会':2,u'话剧歌剧':3,u'舞蹈芭蕾':4,u'曲苑杂坛':5,u'体育比赛':6,u'度假休闲':7}
        self.ccid={'流行':9,'摇滚':10,'民族':11,'音乐节':12,'其他演唱会':13,
                  '管弦乐':14, '独奏':15,'室内乐及古乐':16, '声乐及合唱':17, '其他音乐会':18,
                  '话剧 ':19,'歌剧 ':20,'歌舞剧 ':21,'音乐剧 ':22,'儿童剧 ':23,
                  '舞蹈 ':24,'芭蕾 ':25,'舞剧 ':26,
                  '相声 ':27,'魔术 ':28,'马戏 ':29,'杂技 ':30,'戏曲 ':31,'其他曲苑杂坛 ':32,
                  '球类运动':33,'搏击运动':34,'其它竞技':35,
                  '主题公园':36, '风景区':37, '展会':38, '特色体验':39, '温泉':40, '滑雪':41, '游览线路':42, '度假村':43, '代金券':44, '酒店住宿':45
                  }
        self.mcidDict=~bidict(self.mcid)
        self.ccidDict=~bidict(self.ccid)
        self.geoCodingDict = {}
        self.geoCodingDictFile = "./geoCodingDict.txt"
        #self.readGeoCodingDict(self.geoCodingDictFile)
        self.geoCoder = GeoCoder()
        self.start_urls = []
        self.init_start_url()
        self.host = 'http://www.damai.cn'

    name = 'damai'
    # def readGeoCodingDict(self,filePath):
    #             with open(filePath) as fileInput:
    #                     self.geoCodingDict = json.loads(fileInput.read())
    #
    # def updateGeoCodingDict(self,filePath):
    #         with open(filePath,"w") as fileInput:
    #                 geoCodingDictJson = json.dumps(self.geoCodingDict,ensure_ascii=False)
    #                 fileInput.write(geoCodingDictJson)

    def init_start_url(self):
        # get_source('http://www.damai.cn/projectlist.do?mcid=1&ccid=9')
        # print get_source('http://item.damai.cn/66780.html')
        # the ccid ranges of the seven mcids are contiguous (9-13, 14-18, ...),
        # so ccid is intentionally never reset between mcids
        ccidThresh = {1: 13, 2: 18, 3: 23, 4: 26, 5: 32, 6: 35, 7: 45}
        startMcid = 3
        startCcid = 21
        mcid = startMcid
        ccid = startCcid

        while mcid <= 7:
            mcidName = self.mcidDict[mcid]
            while ccid <= ccidThresh[mcid]:
                pageIndex = 1
                # only the first page is seeded here; later pages of each list
                # are followed via the "next" link in parse()
                performListPage = 'http://www.damai.cn/projectlist.do?mcid=%s&ccid=%s&pageIndex=%s' % (mcid, ccid, pageIndex)
                print 'init page %s ' % (performListPage)
                self.start_urls.append(performListPage)
                ccid += 1
            mcid += 1


    def parse(self, response):
        # decide whether this response is a list page or a detail page
        listPage = response.body
        soup = BeautifulSoup(listPage)
        performList = soup.find(attrs={'id': 'performList'})
        if performList is None:  # no perform list: this is a show detail page
            timeInfo = []
            price = []
            # defaults so the item fields below exist even if parsing fails part-way
            date_time = ''
            start_time = ''
            currentPerformPriceInfo = ''
            lng, lat = 0.0, 0.0
            showpage = response.body
            # showpage=get_source('http://item.damai.cn/70686.html')
            #soup=BeautifulSoup(showpage,"html.parser")
            soup = BeautifulSoup(showpage)

            try:
                title = soup.find(attrs={'class': 'title'}).get_text().strip().encode('utf-8')  # get the title
            except:
                title = '待定'  # fall back to '待定' ("TBD")
            #print title
            try:
                location = soup.find(attrs={'itemprop': 'location'}).get_text().strip().encode('utf-8')  # get the location
            except:
                location = '待定'
            #print location

            try:
                # geocoding: check the cache first, then fall back to the geocoder
                lng = 0.0
                lat = 0.0
                if location in self.geoCodingDict:
                    lng, lat = self.geoCodingDict[location]
                else:
                    locationList = location.split("-")
                    region = locationList[0].strip()
                    # strip spaces plus ASCII and full-width parentheses
                    normRegion = region.replace(" ", "").replace("(", "").replace(")", "").replace("（", "").replace("）", "")
                    if normRegion in self.geoCodingDict:
                        lng, lat = self.geoCodingDict[normRegion]
                    else:
                        lng, lat = self.geoCoder.geoCoding(region)
                        self.geoCodingDict[region] = (lng, lat)
                        if lng == 0.0 and lat == 0.0 and len(locationList) > 1:
                            # the venue part failed to geocode; try the city part instead
                            city = locationList[1].strip()
                            if city in self.geoCodingDict:
                                lng, lat = self.geoCodingDict[city]
                            else:
                                lng, lat = self.geoCoder.geoCoding(city)
                                self.geoCodingDict[city] = (lng, lat)
                    self.geoCodingDict[location] = (lng, lat)

                #print location


                pidList = []
                timeList = soup.find(attrs={'id': 'perform'}).find_all('a')  # the list of show times
                for index, eachtime in enumerate(timeList):  # get the price for each show time
                    pid = eachtime['pid']
                    currentPerformTime = eachtime['time'].encode('utf-8')
                    timeInfo.append(currentPerformTime)

                    # print eachtime['class'],type(eachtime['class'])
                    if eachtime['class'] == [u'grey']:  # greyed-out entries carry no price
                        price.append('暂无')
                        continue

                    if index > 0:
                        # prices for later show times are fetched through an ajax call
                        data = {'type': '33',
                                'performID': pid,
                                'business': '1',
                                'IsBuyFlow': 'False',
                                'sitestaus': '3'}
                        post_data = urllib.urlencode(data)
                        url = 'http://item.damai.cn/ajax.aspx?' + post_data
                        newpage = get_source(url)
                        ajaxSoup = BeautifulSoup(newpage, "html.parser")  # keep the detail-page soup intact
                        priceLinkList = ajaxSoup.find_all('a', attrs={'class': True, 'price': True})
                    else:
                        priceLinkList = soup.find(attrs={'id': 'price'}).find_all('a')
                    priceList = []
                    for eachlink in priceLinkList:
                        normalizedPrice = eachlink.get_text()
                        normalizedPrice = normalizedPrice.replace(u'暂时无货,登记试试运气~', u' ( 无货 )').replace(u'点击进行预定登记', u' ( 可预定 )')
                        priceList.append(normalizedPrice.encode('utf-8'))
                    price.append(priceList)
                    currentPerformPriceInfo = ",".join(priceList)
                    # no end time is available on the page
                    date_time, start_time = getAvosTimeInfo(currentPerformTime)
                #print date_time,start_time,currentPerformPriceInfo
            except:
                print 'parse some error'


            item = DoubanItem()
            item['name'] = title
            item['date'] = date_time
            item['start_time'] = start_time
            item['end_time'] = start_time
            item['ticket']= currentPerformPriceInfo
            item['region'] = location
            item['location'] = gps2GeoPoint(lat,lng)
            item['category'] = self.transferDict(response.meta['CateName'])
            item['source'] = DamaiSpider.__name__
            yield item

        else:
            titleList = performList.find_all('h2')
            nextInfo = soup.find(attrs={'class': "next"})
            if nextInfo is not None:
                next = nextInfo.get("href", "")
                next_page = self.host + next
                print 'list page %s ' % (next_page)
                yield Request(next_page, callback=self.parse)
            else:
                next_page = ''

            # recover the main category (mcid) from the list-page url
            category_re = re.compile(r'http://www\.damai\.cn/projectlist\.do\?mcid=(\d+)&ccid=.*')
            category_list = re.findall(category_re, next_page)
            if len(category_list) > 0:
                CateName = self.mcidDict[int(category_list[0])]
            else:
                CateName = u'演唱会'
            for each in titleList:
                a = each.find('a')
                print 'detail page %s ' % a['href']
                request = Request(a['href'], callback=self.parse)
                request.meta['CateName'] = CateName
                yield request





    def transferDict(self, CateName):
        # map damai main categories onto the shared category scheme
        dataDict = {
            u"演唱会": "音乐",
            u"音乐会": "音乐",
            u"话剧歌剧": "戏剧",
            u"舞蹈芭蕾": "音乐",
            u"曲苑杂坛": "戏剧",
            u"体育比赛": "运动",
            u"度假休闲": "旅行",
            u"儿童亲子": "其他",
        }
        if CateName in dataDict:
            return dataDict[CateName]
        else:
            return u"其他"
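
The helpers gps2GeoPoint and getAvosTimeInfo used above are imported from elsewhere in the project and are not shown. Judging by the "Avos" naming, the target store is likely AVOS Cloud (LeanCloud), whose Parse-compatible REST API represents coordinates as GeoPoint objects; a hypothetical sketch of gps2GeoPoint under that assumption:

# Hypothetical sketch, not the project's actual implementation:
# builds the Parse/AVOS Cloud REST representation of a geographic point.
def gps2GeoPoint(lat, lng):
    return {'__type': 'GeoPoint',
            'latitude': float(lat),
            'longitude': float(lng)}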
Example #4
class DoubanSpider(CrawlSpider):
        def __init__(self):
            self.categoryDict={'音乐':'music','戏剧':"drama",'讲座':"salon",'聚会':"party",'电影':"film",'展览':"exhibition",'运动':"sports",'公益':"commonweal",'旅行':"travel","其他":"others"}
            self.cityList = ["beijing","shanghai","guangzhou"]
            self.timeType = "week"
            self.start_urls = []
            self.init_start_url()
            self.geoCoder = GeoCoder()

        # initialize start_urls
        def init_start_url(self):
            for city in self.cityList:
                for categoryCn,categoryEn in self.categoryDict.iteritems():
                    categoryUrl = "http://%s.douban.com/events/%s-%s" % (city,self.timeType,categoryEn)
                    self.start_urls.append(categoryUrl)
        name = "douban"
        #download_delay = 1

        #parse the response with beautifulsoup
        def parse(self, response):
            #print response.body.decode(response.encoding)
            soup=BeautifulSoup(response.body)
            allEventsInfo=soup.find(attrs={'class':'events-list events-list-pic100 events-list-psmall'}).find_all("li")

            # map the English category slug in the url back to its Chinese name
            category_re = re.compile(r'http://\w+\.douban\.com/events/week-(\w+)')
            category_list = re.findall(category_re, response.url)
            categoryCn = ''
            if len(category_list) > 0:
                for (k, v) in self.categoryDict.items():
                    if v == category_list[0]:
                        categoryCn = k

            for eventInfo in allEventsInfo:
                try:
                    eventTitle = eventInfo.find(attrs={'class':'title'}).get_text().strip().encode('utf-8')
                    eventTimeInfoList = eventInfo.find(attrs={'class':'event-time'}).find_all("time")
                    eventTimeDict = {"startDate":"","endDate":""}
                    for eventTimeInfo in eventTimeInfoList:
                        itemprop = eventTimeInfo["itemprop"]
                        DateTimeInfo = eventTimeInfo["datetime"]
                        if itemprop not in eventTimeDict.keys():
                            continue
                        eventTimeDict[itemprop] = DateTimeInfo.replace("T"," ")
                    eventLocation = eventInfo.find(attrs={'itemprop':'location'}).get("content","")
                except:
                    continue

                try:
                    longitude = float(eventInfo.find(attrs={'itemprop': 'longitude'}).get("content", ""))
                    latitude = float(eventInfo.find(attrs={'itemprop': 'latitude'}).get("content", ""))
                except:
                    # no coordinates in the markup; geocode the venue string (returns lng, lat)
                    longitude, latitude = self.geoCoder.geoCoding(eventLocation)
                ticketFee = eventInfo.find(attrs={'class':'fee'}).get_text().strip().encode('utf-8')

                date_time, start_time, end_time = getAvosTimeInfo(eventTimeDict["startDate"], eventTimeDict["endDate"])
                item = DoubanItem()
                item['name'] = eventTitle
                item['date'] = date_time
                item['start_time'] = start_time
                item['end_time'] = end_time
                item['ticket']= ticketFee
                item['region'] = eventLocation
                item['location'] = gps2GeoPoint(latitude,longitude)
                item['category'] = categoryCn
                item['source'] = DoubanSpider.__name__
                yield item



            #nextpage
            nextInfo = soup.find(attrs={'rel': "next"})
            if nextInfo is not None:
                next = nextInfo.get("href", "")
            else:
                next = ""
            #http://guangzhou.douban.com/events/week-party
            #http://shanghai.douban.com/events/week-exhibition
            category_re = re.compile(r'(http://\w+\.douban\.com/events/week-\w+)')
            category_list = re.findall(category_re, response.url)
            # only follow the next page when the url matched a known category list,
            # so categoryUrl is always defined before it is used
            if len(category_list) > 0:
                categoryUrl = category_list[0]
                if next != "":
                    nextPageUrl = categoryUrl + next
                    print nextPageUrl
                    yield Request(nextPageUrl, callback=self.parse)
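
The GeoCoder class shared by both spiders is also not shown. Since all venues are Chinese addresses, a Baidu-style geocoder is plausible; the sketch below is a hypothetical stand-in that targets the Baidu Geocoding v2 REST API (the ak key is a placeholder you must supply) and mirrors the conventions the spiders depend on: a (lng, lat) tuple, with (0.0, 0.0) signalling failure.

# Hypothetical sketch, not the project's actual GeoCoder.
import json
import urllib
import urllib2

class GeoCoder(object):
    def __init__(self, ak='YOUR_BAIDU_AK'):  # placeholder API key
        self.ak = ak

    def geoCoding(self, address):
        # address is expected as a utf-8 byte string, as the spiders pass it
        query = urllib.urlencode({'address': address, 'output': 'json', 'ak': self.ak})
        url = 'http://api.map.baidu.com/geocoder/v2/?' + query
        try:
            result = json.loads(urllib2.urlopen(url).read())
            location = result['result']['location']
            return location['lng'], location['lat']
        except Exception:
            return 0.0, 0.0  # the spiders treat (0.0, 0.0) as "geocoding failed"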