Example #1
 def post_spider(self):
     logger.info('Operations after the spider finishes')
     data_operator = DataOperator()
     data_operator.delete_data_dated()
     data_operator.data_unique()
     data_operator.deduplication()
     data_operator.mysql_close()
Example #2
def get_items(sid, current_date):
    items = []
    url = 'http://cgi.yanchu.qq.com/cgi-bin/yanchu/mb_api/jsondata.fcg?g_tk=4d3754f563ad04a56fece81bbcc83302&cbk=callback&sCmd=citytype&IDS=0%2C26&page=0&_=1446602940456'
    data = get_data(url, str(1))
    if data and data['data']['page_data']:
        pages = int(data['data']['page_tol'])
        print '%s pages in total' % pages
        for page in range(pages):
            print page
            url = 'http://cgi.yanchu.qq.com/cgi-bin/yanchu/mb_api/jsondata.fcg?g_tk=4d3754f563ad04a56fece81bbcc83302&cbk=callback&sCmd=citytype&IDS=0%2C26&page='+str(page)+'&_=1446602940456'
            data = get_data(url, str(page+1))
            if not data or not data['data']['page_data']:
                print('No data found for page %s' % page)
                continue
            print data
            for i in data['data']['page_data']:
                item = {'sid':sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':' ', 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'}
                item['city'] = i['city']
                item['id'] = i['show_id']
                item['title'] = i['show_name']
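                # a single timestamp 'YYYY-MM-DD HH:MM:SS' is 19 characters;
                # anything longer is assumed to be a comma-separated list of show times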
                if len(i['show_time']) > 19:
                    begin_time = i['show_time'].split(',')[0].split(' ')[0]
                    end_time = i['show_time'].split(',')[-1].split(' ')[0]
                else:
                    begin_time = i['show_time'][:10]
                    end_time = begin_time
                item['begin_date'] = begin_time
                item['end_date'] = end_time
                item['venue'] = i['hall_name']
                items.append(item)
        # insert once after the page loop: items accumulates across pages,
        # so inserting inside the loop would write duplicate rows
        opera = DataOperator()
        opera.item_insert(data=items)
        return items
Example #3
 def get_items(self, page=1):
     data = self.get_html(page=page)
     numbers = re.findall('<div class="newFind">.*?(\d+).*?</div>', data)[0]
     # pages = math.ceil(float(numbers)/20)
     pages = 150  # crawl only the first 150 pages; later ones have mostly expired
     for page in range(1, int(pages) + 1):
         print '%s pages with %s records in total; now fetching page %s' % (pages, numbers, page)
         data = self.get_html(page=page)
         soup = BeautifulSoup(data)
         events = soup.findAll('div', {'class': 'mlm1r'})
         items = []
         for event in events:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             href = event.a['href']
             id = href.split('/')[-1].strip()
             title = event.select('li')[0].text.strip()
             venue = event.select('li')[1].select('span')[1].text.strip()
             date = event.select('li')[1].select('span')[0].text.strip()
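             # the date text appears to be 'YYYY.MM.DD-YYYY.MM.DD'; convert dots to dashes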
             begin_date = date.split('-')[0].replace('.', '-')
             end_date = date.split('-')[1].replace('.', '-')
             if ''.join(begin_date.split('-')[:2]) > self.current_date:
                 '''try:
                     city = ''.join(jieba.analyse.extract_tags(venue, allowPOS=['ns']))
                 except Exception as e:
                     print 'error:', e'''
                 eachData = self.get_html(id=id)
                 try:
                     city = re.findall(u'地址.*?<span>(.*?)</span>',
                                       eachData)[0].split()[-1].rstrip(u'市')
                 except IndexError as e:
                     print 'error:', e
                     city = ''
                 item['city'] = city
                 item['begin_date'] = begin_date
                 item['end_date'] = end_date
                 item['id'] = id
                 item['title'] = title
                 item['venue'] = venue
                 items.append(item)
                 print id, title, city, begin_date, end_date, venue
             else:
                 print 'Data with ID %s has expired' % id
         print 'Writing data for page %s' % page
         opera = DataOperator()
         opera.item_insert(data=items)
Example #4
 def get_data(self, page, city_name):
     items = []
     print('Fetching %s, page %s' % (city_name, page))
     url = 'http://www.cnena.com/showroom/search.php?mid=1&fid=0&keyword=%s&action=search&type=title&page=%s'%(city_name, str(page))
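     # re-encode the query string as GBK, which this site appears to expect for the Chinese city name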
     url = url.decode('utf-8').encode('GBK')
     print(url)
     html = self.get_html(url)
     if html:
         pattern = re.compile('<tr>.*?<td.*?>(\d+)</td>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?</tr>', re.S)
         meetings = re.findall(pattern, html)
         if meetings:
             for meeting in meetings:
                 item = {'sid':self.sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':city_name, 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'}
                 fid_id_tmp = re.findall('\d+', meeting[1])
                 fid_id = str(fid_id_tmp[0]) + '-' + str(fid_id_tmp[1])
                 item['id'] = fid_id
                 title = meeting[2]
                 item['title'] = title
                 industry = meeting[3]
                 item['industry'] = industry
                 h = 'http://www.cnena.com/showroom/'+meeting[1]
                 print(h)
                 html2 = self.get_html(h)
                 if html2:
                     pattern2=re.compile(u'展会概况.*?开幕日期:(.*?)<br>.*?结束日期:(.*?)<br>.*?展会地点.*?<a.*?>(.*?)</a>',re.S)
                     meetings2 = re.search(pattern2, html2)
                     if meetings2:
                         print(meetings2.group(1))
                         begin_date = re.sub('\D+','-',meetings2.group(1)).strip('-')
                         end_date = re.sub('\D+','-',meetings2.group(2)).strip('-')
                         year = int(begin_date.split('-')[0])
                         month = int(begin_date.split('-')[1])
                         if year >= int(self.current_date[:4])+1 or year == int(self.current_date[:4]) and month >= int(self.current_date[-2:]):
                             item['begin_date'] = begin_date
                             item['end_date'] = end_date
                             venue = meetings2.group(3)
                             item['venue'] = venue
                             print(item)
                             items.append(item)
             print('About to write page %s data' % page)
             opera = DataOperator()
             opera.item_insert(data=items)
     else:
         print('Data for page %s is empty' % page)
     return
Example #5
 def get_items(self, city, page='1'):
     url = 'http://www.zhankoo.com/Search/SearchExhibitionList?city=%s&classifyId=0&ratingOverAll=0&rankType=5&isExhibitionEnd=0&_=1452759283208&pagenumber=%s'%(city, page)
     print 'Fetching %s, page %s' % (city, page)
     data = self.get_data(url)
     soup = BeautifulSoup(data)
     meetings = soup.findAll('h3', {'class':'deal-tile__title'})
     items = []
     if meetings:
         for meeting in meetings:
             item = {'sid':self.sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':' ', 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'}
             title = meeting.select('span[class="xtitle"]')[0].a['title']
             href = meeting.select('span[class="xtitle"]')[0].a['href']
             itemid = href.split('_')[-1].split('.')[0]
             date_venue_tmp = meeting.select('span[class="short-title"]')[0].text
             date_venue = ''.join(date_venue_tmp.split()[:-1])
             venue = date_venue.split(':')[-1]
             date_tmp = date_venue.split(':')[1]
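             # split on runs of CJK characters (such as the u'至' separator) to isolate the two dates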
             date = re.split(u"[\u4e00-\u9fa5]+",date_tmp)
             begin_date = date[0]
             end_date = date[1]
             print city, title, itemid, begin_date, end_date, venue
             item['id'] = itemid
             item['title'] = title
             item['city'] = city
             item['venue'] = venue
             item['begin_date'] = begin_date
             item['end_date'] = end_date
             items.append(item)
         opera = DataOperator()
         opera.item_insert(data=items)
         pattern_next_page = re.compile(u'<a\s+class="next-page".*?href=".*?pagenumber=(\d+)">下一页</a>')
         try:
             next_page = re.findall(pattern_next_page, data)[0]
         except IndexError:
             print 'All data for %s has been fetched' % city
             return
         else:
             print 'Found the next page for %s; about to fetch it' % city
             self.get_items(city,  next_page)
     else:
         print 'No data for %s' % city
Example #6
 def get_items(self, data, month):
     print('Parsing data for month %s' % month)
     soup = BeautifulSoup(data)
     meetings = soup.findAll('table', {'id': 'tbl_%s' % month})[0].findAll(
         'tr', {'class': 'blue_bg'})
     items = []
     if meetings:
         for meeting in meetings:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': 'm190003',
                 'title': ' ',
                 'industry': '财经',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             url = meeting.select('td')[0].a['href']
             if url:
                 item['site'] = url
             title = meeting.select('td')[0].a.string
             if title:
                 item['title'] = title
             organizer = meeting.select('td')[1].string
             if organizer:
                 item['organizer'] = organizer
             date = meeting.select('td')[2].string
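             # a single Chinese date like u'2016年8月15日' stays under 12 characters;
             # longer text is treated as a start-end range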
             if len(date) > 12:
                 begin_date_tmp = date.split('-')[0]
                 begin_date = re.sub('[^\d]', '-',
                                     begin_date_tmp).rstrip('-')
                 end_date_tmp = date.split('-')[1]
                 end_date_tmp2 = re.sub('[^\d]', '-',
                                        end_date_tmp).rstrip('-')
                 if len(end_date_tmp) <= 3:
                     # the end date holds only the day (e.g. u'15日'):
                     # reuse the begin date's year and month (note that
                     # str.replace swaps the first match)
                     end_date = begin_date.replace(
                         begin_date.split('-')[-1], end_date_tmp2)
                 elif len(end_date_tmp) <= 6:
                     # the end date holds month and day but no year:
                     # prepend the begin date's year
                     end_date = begin_date.split(
                         '-')[0] + '-' + end_date_tmp2
                 else:
                     end_date = end_date_tmp2
             else:
                 begin_date = end_date = re.sub('[^\d]', '-',
                                                date).rstrip('-')
             item['begin_date'] = begin_date
             item['end_date'] = end_date
             city = meeting.select('td')[3].string
             if city:
                 item['city'] = city
                 item['venue'] = city
             print(item)
             items.append(item)
         opera = DataOperator()
         opera.item_insert(data=items)
     else:
         print 'No data for %s-%s!' % (self.current_date[:4], month)
     return items
Example #7
    def get_data(self, data):
        pattern_items = re.compile(
            '<div.*?class="sslist">.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<p>(.*?)</p>.*?<p.*?class="cg">(.*?)</p>',
            re.S)
        meetings = re.findall(pattern_items, data)
        items = []
        items_history = []
        for meeting in meetings:
            item = {
                'sid': self.sid,
                'begin_date': ' ',
                'end_date': ' ',
                'id': ' ',
                'title': ' ',
                'industry': ' ',
                'city': ' ',
                'venue': ' ',
                'organizer': ' ',
                'site': ' ',
                'visitor': ' ',
                'area': ' ',
                'history_info_tag': '0'
            }
            print meeting[0]
            id = meeting[0].split('/')[-1].split('_')[0]
            print id
            item['id'] = id
            title = meeting[1].strip()
            item['title'] = title
            city = meeting[2].strip()
            item['city'] = city
            try:
                venue = meeting[3].split('>')[1].split('<')[0]
            except IndexError:
                venue = meeting[3].strip()
            item['venue'] = venue
            h = 'http://www.eshow365.com' + meeting[0]
            print h
            html2 = self.get_html(h)
            pattern_time = re.compile('举办时间:(.*?)---(.*?)</p>', re.S)
            pattern_organizer = re.compile('主办单位:(.*?)</p>')
            pattern_industry = re.compile('所属行业:(.*?)</a>')
            pattern_area = re.compile('展会面积:(\d+).*?</p>')
            time_tmp = re.findall(pattern_time, html2)
            begin_time_tmp = time_tmp[0][0].replace('/', '-')
            begin_date = datetime.datetime.strptime(
                begin_time_tmp, '%Y-%m-%d').strftime('%Y-%m-%d')
            end_time_tmp = time_tmp[0][1].replace('/', '-')
            end_date = datetime.datetime.strptime(
                end_time_tmp, '%Y-%m-%d').strftime('%Y-%m-%d')
            item['begin_date'] = begin_date
            item['end_date'] = end_date
            try:
                org = re.findall(pattern_organizer, html2)[0].split(' ')[0]
            except IndexError:
                org = ' '
            item['organizer'] = org
            industry_tmp = re.findall(pattern_industry, html2)
            if industry_tmp:
                try:
                    indus = industry_tmp[0].split('>')[1].split('<')[0]
                except IndexError:
                    indus = industry_tmp[0].strip()
            else:
                indus = ' '
            print indus
            item['industry'] = indus
            try:
                area = re.findall(pattern_area, html2)[0]
            except IndexError:
                area = ' '
            item['area'] = area

            soup = BeautifulSoup(html2)
            try:
                history_exhibitions = soup.findAll(
                    'div', {'class': 'ljzh'})[0].select('tr')[1:]
            except IndexError:
                print 'No information on past exhibitions found'
                history_info_tag = '0'
            else:
                print 'Found information on past exhibitions'
                history_info_tag = '1'
                for history_exhibition in history_exhibitions:
                    item_history = {}
                    history_exhibition_info = history_exhibition.select('td')
                    history_exhibition_title = history_exhibition_info[0].a[
                        'title'].strip()
                    print history_exhibition_title
                    history_exhibition_url = history_exhibition_info[0].a[
                        'href']
                    history_exhibition_id = history_exhibition_url.split(
                        '/')[-1].split('_')[0]
                    print history_exhibition_id
                    try:
                        history_exhibition_venue = history_exhibition_info[
                            1].stripped_strings.next()
                    except StopIteration:
                        history_exhibition_venue = ' '
                    print history_exhibition_venue
                    history_exhibition_date = history_exhibition_info[
                        2].string.strip().replace('/', '-')
                    print history_exhibition_date
                    history_exhibition_area_tmp = history_exhibition_info[
                        3].span.string.strip()
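                    # Python 2: filter() over a string returns a string, here just the digit characters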
                    history_exhibition_area = filter(
                        lambda x: x.isdigit(), history_exhibition_area_tmp)
                    print history_exhibition_area
                    item_history['sid'] = self.sid
                    item_history['itemid'] = id
                    item_history['history_itemid'] = history_exhibition_id
                    item_history['title'] = history_exhibition_title
                    item_history['venue'] = history_exhibition_venue
                    date_tmp = history_exhibition_date
                    date = datetime.datetime.strptime(
                        date_tmp, '%Y-%m-%d').strftime('%Y-%m-%d')
                    item_history['date'] = date
                    item_history['area'] = history_exhibition_area
                    items_history.append(item_history)
            item['history_info_tag'] = history_info_tag
            items.append(item)
        opera = DataOperator()
        opera.item_insert(data=items, data_history=items_history)
        return
Example #8
 def get_items(self, start_date, page='1'):
     url = 'http://www.foodmate.net/exhibit/search.php?kw=&fields=0&fromdate=%s&todate=&catid=0&process=0&order=0&x=59&y=12&page=%s' % (
         start_date, page)
     data = self.get_data(url)
     soup = BeautifulSoup(data)
     meetings = soup.findAll('div', {'class': 'list'})
     print meetings
     items = []
     if meetings:
         for meeting in meetings:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             meeting = meeting.ul
             href = meeting.select('li')[0].a['href']
             itemid = href.split('-')[-1].split('.')[0]
             title = meeting.select('li')[0].a.string
             venue_tmp = meeting.select('li')[1].string
             venue = venue_tmp.split(':')[-1]
             organizer_tmp = meeting.select('li')[2].string
             organizer = organizer_tmp.split(':')[-1]
             date_tmp = meeting.select('li')[3].string
             begin_date = date_tmp.split('~')[0].strip()
             end_date = date_tmp.split('~')[1].strip()
             each_meeting_url = href
             each_meeting_data = self.get_data(each_meeting_url)
             pattern_city = re.compile(u'展出城市.*?<a.*?>(.*?)</a>', re.S)
             pattern_site = re.compile(
                 u'<div.*?id="content">.*?网址.*?<a.*?>(.*?)</a>.*?<br>',
                 re.S)
             try:
                 city = re.findall(pattern_city, each_meeting_data)[0]
             except IndexError:
                 city = ' '
             try:
                 site = re.findall(pattern_site, each_meeting_data)[0]
             except IndexError:
                 site = ' '
             print itemid, title, city, begin_date, end_date, organizer, venue, site
             item['id'] = itemid
             item['title'] = title
             item['city'] = city
             item['venue'] = venue
             item['begin_date'] = begin_date
             item['end_date'] = end_date
             item['organizer'] = organizer
             item['site'] = site
             items.append(item)
         opera = DataOperator()
         opera.item_insert(data=items)
         pattern_next_page = re.compile(
             u'<a.*?href=".*?page=(\d+)"\s+title="下一页">')
         try:
             next_page = re.findall(pattern_next_page, data)[0]
         except IndexError:
             print 'Finished fetching everything'
             return
         else:
             print 'Found the next page; about to fetch it'
             self.get_items(start_date, next_page)
Example #9
 def get_items(self, year, month):
     url = 'http://www.chemsoc.org.cn/Meeting/Home/search.asp?mingcheng=&province=&y=%s&m=%s' % (
         year, month)
     print url
     try:
         data = self.get_data(url)
         pattern_pages = re.compile(u'第\d+页.*?共(\d+)页')
         pages = re.findall(pattern_pages, data)[0]
     except Exception as e:
         print('No data found for %s-%s, error: %s' % (year, month, e))
         return
     if not int(pages):
         print 'No data for %s-%s' % (year, month)
     else:
         print '%s-%s has %s pages of data' % (year, month, pages)
         for page in range(1, int(pages) + 1):
             items = []
             url = 'http://www.chemsoc.org.cn/Meeting/Home/search.asp?page=%s&mingcheng=&province=&y=%s&m=%s' % (
                 page, year, month)
             print url
             try:
                 data = self.get_data(url)
                 soup = BeautifulSoup(data)
                 meetings = soup.findAll(
                     'table', {'class': 'meetings'})[0].findAll('tr')[1:]
             except Exception as e:
                 print('No data found for %s-%s page %s, error: %s' % (year, month, page, e))
                 continue
             for meeting in meetings:
                 item = {
                     'sid': self.sid,
                     'begin_date': ' ',
                     'end_date': ' ',
                     'id': ' ',
                     'title': ' ',
                     'industry': ' ',
                     'city': ' ',
                     'venue': ' ',
                     'organizer': ' ',
                     'site': ' ',
                     'visitor': ' ',
                     'area': ' ',
                     'history_info_tag': '0'
                 }
                 href = meeting.select('td')[0].a['href']
                 itemid = href.split('=')[-1]
                 title = meeting.select('td')[0].a['title']
                 city = meeting.select('td')[1].input['value']
                 meeting_time = meeting.select('td')[2].input['value']
                 begin_time_tmp = re.split(u'-|至', meeting_time)[0]
                 meeting_begin_time = re.sub(u'[年月日]', '-',
                                             begin_time_tmp).rstrip('-')
                 try:
                     end_time_tmp1 = re.split(u'-|至', meeting_time)[1]
                 except IndexError:
                     meeting_end_time = meeting_begin_time
                 else:
                     end_time_tmp2 = re.sub(u'[年月日]', '-',
                                            end_time_tmp1).rstrip('-')
                     if len(end_time_tmp2) <= 2:
                         # the end date holds only the day
                         if int(end_time_tmp2) < int(
                                 meeting_begin_time.split('-')[-1]):
                             # end day precedes the begin day, so the
                             # meeting runs into the next month
                             meeting_end_time = (
                                 meeting_begin_time.split('-')[0] + '-' +
                                 str(int(meeting_begin_time.split('-')[1]) +
                                     1) + '-' + end_time_tmp2)
                         else:
                             meeting_end_time = meeting_begin_time.replace(
                                 meeting_begin_time.split('-')[-1],
                                 end_time_tmp2)
                     elif len(end_time_tmp2) <= 5:
                         # month and day without a year: prepend the
                         # begin date's year
                         meeting_end_time = meeting_begin_time.split(
                             '-')[0] + '-' + end_time_tmp2
                     else:
                         meeting_end_time = end_time_tmp2
                 print itemid, city, title, meeting_begin_time, meeting_end_time
                 item['id'] = itemid
                 item['title'] = title
                 item['city'] = city
                 try:
                     begin_date = datetime.datetime.strptime(
                         meeting_begin_time,
                         '%Y-%m-%d').strftime('%Y-%m-%d')
                 except ValueError:
                     begin_date = meeting_begin_time
                 item['begin_date'] = begin_date
                 try:
                     end_date = datetime.datetime.strptime(
                         meeting_end_time, '%Y-%m-%d').strftime('%Y-%m-%d')
                 except ValueError:
                     end_date = meeting_end_time
                 item['end_date'] = end_date
                 each_meeting_url = 'http://www.chemsoc.org.cn/Meeting/Home/' + href
                 print each_meeting_url
                 each_meeting_data = self.get_data(each_meeting_url)
                 if each_meeting_data:
                     pattern_organizer = re.compile(u'<p>主办单位:(.*?)</p>')
                     pattern_visitor = re.compile(u'<p>预计人数:(.*?)</p>')
                     pattern_venue = re.compile(u'<p>地.*?址:(.*?)</p>')
                     try:
                         organizer = re.findall(pattern_organizer,
                                                each_meeting_data)[0]
                     except IndexError:
                         organizer = ' '
                     try:
                         visitor = re.findall(pattern_visitor,
                                              each_meeting_data)[0]
                     except IndexError:
                         visitor = ' '
                     try:
                         venue = re.findall(pattern_venue,
                                            each_meeting_data)[0]
                     except IndexError:
                         venue = city
                     item['organizer'] = organizer
                     item['visitor'] = visitor
                     item['venue'] = venue
                 items.append(item)
             opera = DataOperator()
             opera.item_insert(data=items)
     return
Example #10
 def get_items(self, page):
     print 'Fetching data for page %s' % page
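     # the response appears to contain two HTML documents back to back; keep what follows the first </html>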
     data = self.get_data(page).split('</html>')[1]
     if data:
         soup = BeautifulSoup(data)
         items = []
         meetings = soup.select('table[class="block1"] tr td tr')[1:]
         for meeting in meetings:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             date_tmp1 = meeting.select('td')[3].string
             date_tmp2 = re.sub(u'[年月日]', '-', date_tmp1).rstrip('-')
             print date_tmp2
             try:
                 date = datetime.datetime.strptime(
                     date_tmp2, '%Y-%m-%d').strftime('%Y-%m-%d')
             except ValueError:
                 date = date_tmp2
             year = int(date.split('-')[0])
             month = int(date.split('-')[1])
             if year >= int(self.current_date[:4]) + 1 or year == int(
                     self.current_date[:4]) and month >= int(
                         self.current_date[-2:]):
                 id = meeting.select('td')[0].string
                 item['id'] = id
                 title = meeting.select('td')[1].a['title']
                 item['title'] = title
                 url = meeting.select('td')[1].a['href']
                 item['site'] = url
                 city = meeting.select('td')[2].string
                 item['city'] = city
                 data2 = urllib2.urlopen(url).read().decode('gbk')
                 pattern_date_loc = re.compile(
                     u'召开时间.*?</span>(.*?)<br>.*?结束时间.*?</span>(.*?)<br>.*?地点.*?</span>(.*?)<br>',
                     re.S)
                 date_loc = re.search(pattern_date_loc, data2)
                 if date_loc:
                     begin_date_tmp = date_loc.group(1).replace('.', '-')
                     try:
                         begin_date = datetime.datetime.strptime(
                             begin_date_tmp,
                             '%Y-%m-%d').strftime('%Y-%m-%d')
                     except ValueError:
                         begin_date = begin_date_tmp
                     item['begin_date'] = begin_date
                     end_date_tmp = date_loc.group(2).replace('.', '-')
                     try:
                         end_date = datetime.datetime.strptime(
                             end_date_tmp, '%Y-%m-%d').strftime('%Y-%m-%d')
                     except ValueError:
                         end_date = end_date_tmp
                     item['end_date'] = end_date
                     loc = ''.join(date_loc.group(3).split())  # the venue text contains stray whitespace
                     item['venue'] = loc
                 else:
                     item['begin_date'] = item['end_date'] = date
                     item['venue'] = city
                 items.append(item)
         opera = DataOperator()
         opera.item_insert(data=items)
     else:
         print('No data found for page %s' % page)
Example #11
                current_date_format = self.current_date[:4]+'-'+self.current_date[4:]
                if begin_date >= current_date_format:
                    end_date_temp = date[1]
                    end_date = datetime.datetime.strptime(end_date_temp,'%Y-%m-%d').strftime('%Y-%m-%d')
                    venue = eachSoup.select('span[class="dico2"]')[1].text
                    item['city'] = city
                    item['begin_date'] = begin_date
                    item['end_date'] = end_date
                    item['id'] = id
                    item['title'] = title
                    item['venue'] = venue
                    items.append(item)
                    print id,title,city,begin_date,end_date,venue
                else:
                    print 'Data with id %s has expired' % id
        opera = DataOperator()
        opera.item_insert(data=items)
        pattern_nextPage = re.compile(u'<a href="/zhanhui/class_\d+_(\d+).html">下一页')
        try:
            next_page = re.findall(pattern_nextPage, data)[0]
        except IndexError as e:
            print 'Finished searching'
        else:
            print 'Found page %s' % next_page
            self.get_items(cate=cate, page=next_page)


if __name__ == '__main__':
    current_date = '201608'
    sid = '36'
    expowindow = ExpoWindow(sid, current_date)
Example #12
 def getItems(self, page):
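     # the endpoint appears to page by record offset, 20 records per page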
     page = page * 20
     data = self.getData(page)
     if data:
         print 'Successfully fetched data for page %s' % page
         soup = BeautifulSoup(data, 'lxml')
         meetings = soup.findAll('ul', {'class': 'mod-meet-lt'})[0].findAll(
             'li', recursive=False)
         items = []
         for meeting in meetings:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             meeting_url = meeting.a['href']
             meeting_id = meeting_url.split('/')[-1].split('.')[0]
             meeting_title = meeting.find('div', {
                 'class': 'mt-title'
             }).string.strip()
             meeting_city = meeting.find('span', {
                 'class': 'info-city'
             }).string.strip()
             meeting_time = meeting.find('span', {
                 'class': 'info-time'
             }).string.split()[0]
             print meeting_title, meeting_url, meeting_id, meeting_city
             each_meeting_data = self.getData(id=meeting_id)
             if each_meeting_data:
                 each_meeting_soup = BeautifulSoup(each_meeting_data,
                                                   'lxml')
                 try:
                     meeting_date = each_meeting_soup.find(
                         'li', {
                             'title': u'活动时间'
                         }).text.strip()
                 except AttributeError:
                     meeting_begin_date = meeting_end_date = meeting_time
                     #continue
                 else:
                     print meeting_date
                     meeting_begin_date = meeting_date.split('~')[0].split(
                         ' ')[0]
                     meeting_end_date = meeting_date.split(
                         '~')[-1].strip().split(' ')[0]
                     year = int(meeting_begin_date.split('-')[0])
                     month = int(meeting_begin_date.split('-')[1])
                     if year >= int(
                             self.current_date[:4]) + 1 or year == int(
                                 self.current_date[:4]) and month >= int(
                                     self.current_date[-2:]):
                         try:
                             meeting_venue = each_meeting_soup.find(
                                 'li', {
                                     'title': u'活动地点'
                                 }).text.strip().split()[0]
                         except AttributeError:
                             meeting_venue = ''
                         try:
                             meeting_visitors = each_meeting_soup.find(
                                 'li', {
                                     'title': u'活动人数'
                                 }).text.strip().rstrip(u'人')
                         except AttributeError:
                             meeting_visitors = ''
                         try:
                             meeting_organizer = each_meeting_soup.find(
                                 'li', {
                                     'title': u'主办单位'
                                 }).text.strip()
                         except AttributeError:
                             meeting_organizer = ''
                         print meeting_id, meeting_title, meeting_city, meeting_begin_date, meeting_end_date, meeting_venue, meeting_visitors, meeting_organizer
                         item['id'] = meeting_id
                         item['title'] = meeting_title
                         item['city'] = meeting_city
                         item['begin_date'] = meeting_begin_date
                         item['end_date'] = meeting_end_date
                         item['venue'] = meeting_venue
                         item['organizer'] = meeting_organizer
                         item['visitor'] = meeting_visitors
                         items.append(item)
                     else:
                         print 'Data with id %s has expired' % meeting_id
             else:
                 print 'No additional data found for exhibition id %s' % meeting_id
         opera = DataOperator()
         opera.item_insert(data=items)
     else:
         print 'No data found for page %s' % page
Example #13
 def get_items(self, city_name, city_id, begin_date, end_date):
     print 'About to crawl data for %s' % city_name
     try:
         html = self.get_html(city_id, '1', begin_date, end_date)
         pages = re.findall(u'共(\d+)页', html)[0]
     except Exception as e:
         print('No data found for %s, error: %s' % (city_name, e))
         return
     print '%s has %s pages' % (city_name, pages)
     for page in range(1, int(pages) + 1):
         items = []
         print 'Crawling %s, page %s' % (city_name, page)
         try:
             html = self.get_html(city_id, str(page), begin_date, end_date)
             pattern = re.compile(
                 u'div.*?class=\"info.*?<strong>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?<em.*?class="cgree1">.*?展会时间:(.*?)展馆:(.*?)</a>',
                 re.S)
             meetings = re.findall(pattern, html)[0:]
         except Exception as e:
             print('No data found for %s page %s, error: %s' % (city_name, page, e))
             continue
         for meeting in meetings:
             item = {
                 'sid': self.sid,
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': city_name,
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             id = meeting[0].split('_')[-1].split('.')[0]
             item['id'] = id
             title = meeting[1]
             item['title'] = title
             time_tmp = meeting[2]
             begin_time_tmp1 = time_tmp.split('---')[0]
             begin_time_tmp2 = re.sub(u'[年月日]', '-',
                                      begin_time_tmp1).rstrip('-')
             try:
                 begin_time = datetime.datetime.strptime(
                     begin_time_tmp2, '%Y-%m-%d').strftime('%Y-%m-%d')
             except TypeError:
                 begin_time = begin_time_tmp2
             item['begin_date'] = begin_time
             end_time_tmp1 = time_tmp.split('---')[1].split('日')[0]
             end_time_tmp2 = re.sub(u'[年月日]', '-', end_time_tmp1)
             if len(end_time_tmp2) < 3:
                 # the end date holds only the day: reuse the begin date's
                 # year and month (note that str.replace swaps the first match)
                 end_time_tmp3 = begin_time.replace(
                     begin_time.split('-')[-1], end_time_tmp2)
             elif len(end_time_tmp2) < 6:
                 # month and day but no year: prepend the begin date's year
                 end_time_tmp3 = begin_time.split(
                     '-')[0] + '-' + end_time_tmp2
             else:
                 end_time_tmp3 = end_time_tmp2
             try:
                 end_time = datetime.datetime.strptime(
                     end_time_tmp3, '%Y-%m-%d').strftime('%Y-%m-%d')
             except TypeError:
                 end_time = end_time_tmp3
             item['end_date'] = end_time
             try:
                 venue = meeting[3].split('>')[-1]
             except IndexError:
                 venue = meeting[3]
             item['venue'] = venue
             time.sleep(2)
             # default so the lookups below degrade gracefully if the fetch fails
             each_meeting_html = ''
             try:
                 each_meeting_html = urllib2.urlopen(
                     'http://www.onezh.com' +
                     meeting[0]).read().decode('utf-8')
                 pattern_area = re.compile(
                     u'<div.*?class="title-detail">.*?<b>面积</b>.*?(\d+).*?</div>',
                     re.S)
                 area = re.findall(pattern_area, each_meeting_html)[0]
             except Exception:
                 print('No area data found')
                 area = ' '
             item['area'] = area
             pattern_industry = re.compile(
                 u'<div.*?class="title-detail">.*?所属行业(.*?)</div>', re.S)
             try:
                 industry = re.findall(pattern_industry,
                                       each_meeting_html)[0].split('>')[-1]
             except IndexError:
                 industry = ' '
             item['industry'] = industry
             pattern_organizer = re.compile(
                 u'<div.*?class="title-detail">.*?主办单位(.*?)</div>', re.S)
             try:
                 organizer = re.findall(pattern_organizer,
                                        each_meeting_html)[0].split('>')[-1]
             except IndexError:
                 organizer = ' '
             item['organizer'] = organizer
             pattern_site = re.compile(u'<li>.*?<b>网址(.*?)</li>', re.S)
             try:
                 site = re.findall(pattern_site,
                                   each_meeting_html)[0].split('>')[-1]
             except IndexError:
                 site = ' '
             item['site'] = site
             print id, title, begin_time, end_time, venue, industry, organizer
             items.append(item)
         opera = DataOperator()
         opera.item_insert(data=items)
     return
Example #14
    def get_items(self, data):
        soup = BeautifulSoup(data)
        meetings_tmp = soup.findAll('ul', {'class': 'trade-news haiwai'})[:2]
        for tmp in meetings_tmp:
            items = []
            items_history = []
            meetings = tmp.findAll('li')
            for meeting in meetings:
                item = {
                    'sid': self.sid,
                    'begin_date': ' ',
                    'end_date': ' ',
                    'id': ' ',
                    'title': ' ',
                    'industry': ' ',
                    'city': ' ',
                    'venue': ' ',
                    'organizer': ' ',
                    'site': ' ',
                    'visitor': ' ',
                    'area': ' ',
                    'history_info_tag': ' '
                }
                base_info = meeting.text.split()
                url = meeting.a['href']
                item['url'] = url
                id = url.split('_')[-1].split('.')[0]
                item['id'] = id
                begin_date = base_info[0]
                item['begin_date'] = begin_date
                item['end_date'] = begin_date
                industry = base_info[1].strip('】').strip('【')
                item['industry'] = industry
                city = base_info[2].strip('】').strip('【')
                item['city'] = city
                title = meeting.a['title']
                item['title'] = title
                print url, begin_date, industry, city, title
                each_meeting_url = url
                each_meeting_data = self.get_data(each_meeting_url)
                if each_meeting_data:
                    pattern_venue = re.compile(
                        u'<ul>.*?展会场馆.*?<a.*?>(.*?)</a>', re.S)
                    pattern_organizer = re.compile(
                        u'<ul>.*?组织单位.*?<a.*?>(.*?)</a>', re.S)
                    pattern_site = re.compile(
                        u'<ul>.*?官方网站.*?</strong>(.*?)</li>', re.S)
                    pattern_area = re.compile(u'<ul>.*?约.*?(\d+).*?平米.*?</li>',
                                              re.S)
                    try:
                        venue = re.findall(pattern_venue, each_meeting_data)[0]
                    except IndexError:
                        venue = ' '
                    item['venue'] = venue
                    try:
                        organizer = re.findall(pattern_organizer,
                                               each_meeting_data)[0]
                    except IndexError:
                        organizer = ' '
                    item['organizer'] = organizer
                    try:
                        site = re.findall(pattern_site, each_meeting_data)[0]
                    except IndexError:
                        site = ' '
                    item['site'] = site
                    try:
                        area = re.findall(pattern_area, each_meeting_data)[0]
                    except IndexError:
                        area = ' '
                    item['area'] = area
                    print venue, organizer, site, area

                    soup = BeautifulSoup(each_meeting_data)
                    try:
                        history_exhibitions = soup.findAll(
                            'table',
                            {'class': 'tbsty exhtbl'})[0].select('tr')[1:]
                    except IndexError:
                        print 'No information on past exhibitions found'
                        history_info_tag = '0'
                    else:
                        print 'Found information on past exhibitions'
                        history_info_tag = '1'
                        for history_exhibition in history_exhibitions:
                            item_history = {}
                            history_exhibition_info = history_exhibition.select(
                                'td')[1:4]
                            history_exhibition_title = history_exhibition_info[
                                0].a['title'].strip()
                            history_exhibition_url = history_exhibition_info[
                                0].a['href']
                            history_exhibition_id = history_exhibition_url.split(
                                '_')[-1].split('.')[0]
                            history_exhibition_date = history_exhibition_info[
                                0].a.string
                            print history_exhibition_title, history_exhibition_date, history_exhibition_url
                            history_exhibition_venue = history_exhibition_info[
                                1].a['title'].strip()
                            print history_exhibition_venue
                            history_exhibition_area_tmp = history_exhibition_info[
                                2].string.strip()
                            history_exhibition_area = filter(
                                lambda x: x.isdigit(),
                                history_exhibition_area_tmp)
                            print history_exhibition_area
                            item_history['sid'] = self.sid
                            item_history['itemid'] = id
                            item_history[
                                'history_itemid'] = history_exhibition_id
                            item_history['title'] = history_exhibition_title
                            item_history['venue'] = history_exhibition_venue
                            item_history['date'] = history_exhibition_date
                            item_history['area'] = history_exhibition_area
                            items_history.append(item_history)
                    item['history_info_tag'] = history_info_tag
                items.append(item)
            opera = DataOperator()
            opera.item_insert(data=items, data_history=items_history)
Example #15
 def get_items(self, city_name, city_code,  page='1' ):
     url = 'http://www.expo-china.com/web/exhi/exhi_search.aspx?City=%s&Industry=-1&Start=%sT%s&page=%s'%(city_code, self.start_date, self.end_date, page)
     print 'Fetching %s, page %s' % (city_name, page), url
     data = self.get_data(url)
     if data:
         soup = BeautifulSoup(data)
         items = []
         try:
             meetings=soup.findAll('div',{'class':'Resueltlist'})[0].findAll('li')
         except Exception as e:
             print('No exhibition data found, error:', e)
             return
         for meeting in meetings:
             item = {'sid':self.sid, 'begin_date':' ', 'end_date':' ', 'id':' ' , 'title':' ', 'industry':' ', 'city':city_name, 'venue':' ', 'organizer':' ', 'site':' ', 'visitor':' ', 'area':' ', 'history_info_tag':'0'}
             title = meeting.select('div')[0].h3.a.string.strip()
             href = meeting.select('div')[0].h3.a['href']
             id = href.split('-')[-1].split('.')[0]
             begin_date = meeting.select('div')[1].span.string.strip()
             print title, href, id, begin_date
             item['title'] = title
             item['id'] = id
             item['href'] = href
             item['begin_date'] = begin_date
             each_meeting_url = href
             each_meeting_data = self.get_data(each_meeting_url)
             if each_meeting_data:
                 each_meeting_soup = BeautifulSoup(each_meeting_data)
                 try:
                     each_meeting_info = each_meeting_soup.findAll('div', {'class': 'zhanhuijieshao_c'})[0]
                 except IndexError:
                     print 'No detailed info found for %s' % title
                     item['end_date'] = begin_date
                 else:
                     print 'Found detailed info for %s' % title
                     try:
                         end_date = each_meeting_info.select('ul')[0].select('li')[0].text.split(u'至')[-1]
                     except IndexError:
                         end_date = begin_date
                     item['end_date'] = end_date
                     try:
                         venue = each_meeting_info.select('ul')[0].select('li')[1].text.split(u':')[-1]
                     except IndexError:
                         venue = ' '
                     item['venue'] = venue
                     try:
                         organizer = each_meeting_info.select('div[class*="zhuban_danwei_big"]')[0].div.text.split(u':')[-1].strip()
                     except IndexError:
                         organizer = ' '
                     item['organizer'] = organizer
                     print end_date, venue, organizer
             else:
                 print 'No detail data found for %s' % title
                 item['end_date'] = begin_date
                 #item['venue'] = ' '
                 #item['organizer'] = ' '
             items.append(item)
         print '%s page %s fetched; about to write it' % (city_name, page)
         opera = DataOperator()
         opera.item_insert(data=items)
         try:
             next_url = soup.select('div[id="ctl00_MainPageHolder_webPage"]')[0].select('a')[-2]['href']
         except KeyError:
             print 'Finished fetching everything!'
             return
         else:
             next_page = next_url.split('=')[-1]
             print 'Found %s page %s' % (city_name, next_page)
             self.get_items(city_name, city_code, next_page)
     else:
         print('No data found for page {}'.format(page))
Example #16
 def get_items(self, begin_time):
     category = {
         '行业交流': 'conferlist',   # industry exchange
         '商业展会': 'exhibition',   # trade exhibitions
         '文艺赛事': 'literature',   # arts and competitions
         '活动聚会': 'event'         # events and meetups
     }
     for key, value in category.iteritems():
         print 'Crawling data for %s' % key
         first_url = 'http://www.77huiyi.com/meet/%s/?mc=&msi=%s&msa=&page=%s' % (
             value, begin_time, '1')
         print first_url
         try:
             data = self.get_data(first_url)
             pattern_numbers = re.compile(u'共(\d+)条')
             numbers = int(re.findall(pattern_numbers, data)[0])
             pages = int(math.ceil(numbers / 20.0))
         except Exception as e:
             print('error:', e)
             continue
         if pages == 0:
             print('No data found under %s' % key)
             continue
         else:
             print('%s has %s records across %s pages' % (key, numbers, pages))
         for page in range(1, int(pages) + 1):
             items = []
             url = 'http://www.77huiyi.com/meet/%s/?mc=&msi=%s&msa=&page=%s' % (
                 value, begin_time, page)
             print 'Crawling data for page %s' % page
             print url
             data = self.get_data(url)
             if not data:
                 print('No data found for page %s' % page)
                 continue
             soup = BeautifulSoup(data)  # data prints fine, but soup comes out garbled; not yet resolved
             meetings = soup.findAll('ul',
                                     {'class': 'clearfix'})[1].findAll('li')
             for meeting in meetings:
                 item = {
                     'sid': self.sid,
                     'begin_date': ' ',
                     'end_date': ' ',
                     'id': ' ',
                     'title': ' ',
                     'industry': ' ',
                     'city': ' ',
                     'venue': ' ',
                     'organizer': ' ',
                     'site': ' ',
                     'visitor': ' ',
                     'area': ' ',
                     'history_info_tag': '0'
                 }
                 meeting_info = meeting.p
                 url = meeting_info.a['href']
                 id = url.split('/')[-2]
                 item['id'] = id
                 item['url'] = url
                 title = meeting_info.a.string
                 item['title'] = title
                 begin_date = meeting_info.select('span')[1].select(
                     'i')[0].string
                 item['begin_date'] = begin_date
                 city_tmp = meeting_info.select('span')[1].select(
                     'i')[1].string
                 try:
                     city = city_tmp.split()[-1]
                 except IndexError:
                     city = city_tmp
                 item['city'] = city
                 print url, title, begin_date, city
                 each_meeting_url = url
                 # each_meeting_data = urllib2.urlopen(each_meeting_url).read().decode('utf-8', 'ignore')
                 try:
                     each_meeting_data = requests.get(each_meeting_url).text
                     each_meeting_soup = BeautifulSoup(each_meeting_data)
                     each_meeting_info = each_meeting_soup.select(
                         'div[class*="conference-info"]')[0]
                 except Exception as e:
                     print('No detailed info found for %s: %s' % (id, e))
                     item['end_date'] = begin_date
                 else:
                     end_date = each_meeting_info.select(
                         'span')[1].text.split('~')[-1].split()[0]
                     item['end_date'] = end_date
                     loc = each_meeting_info.select(
                         'span')[2].text.split()[-1]
                     item['venue'] = loc
                     print end_date, loc
                 items.append(item)
             opera = DataOperator()
             opera.item_insert(data=items)
     return
Example #17
 def get_items(self):
     try:
         data = self.get_data(1)
         soup = BeautifulSoup(data)
         pages = int(soup.pagecount.string)
     except Exception as e:
         print('error:', e)
         return None
     print '%s pages in total' % pages
     for page in range(1, pages + 1):
         items = []
         print 'Fetching data for page %s' % page
         data = self.get_data(page)
         if data:
             soup = BeautifulSoup(data)
             meetings = soup.findAll('meeting')
             print 'Currently on page %s' % soup.pageno
             for meeting in meetings:
                 item = {
                     'sid': self.sid,
                     'begin_date': ' ',
                     'end_date': ' ',
                     'id': ' ',
                     'title': ' ',
                     'industry': ' ',
                     'city': ' ',
                     'venue': ' ',
                     'organizer': ' ',
                     'site': ' ',
                     'visitor': ' ',
                     'area': ' ',
                     'history_info_tag': '0'
                 }
                 id = meeting.meetingid.string
                 item['id'] = id
                 item['title'] = meeting.meetingtitle.string
                 begin_date = meeting.meetingtime.string
                 item['begin_date'] = begin_date
                 city = meeting.meetingaddress.string
                 item['city'] = city
                 if meeting.subject.string:
                     item['industry'] = meeting.subject.string
                 each_meeting_url = 'http://www.meeting.edu.cn/meeting/meeting/notice/meetingAction-%s!detail.action' % id
                 print each_meeting_url
                 try:
                     each_meeting_data = urllib2.urlopen(
                         each_meeting_url).read()
                 except Exception as e:
                     print 'error:', e
                 else:
                     pattern = re.compile(
                         '开始日期.*?<td.*?>(.*?)</td>.*?结束日期.*?<td.*?>(.*?)</td>.*?具体地点.*?<td.*?>(.*?)</td>.*?主办单位.*?<td.*?>(.*?)</td>.*?会议网站.*?<td.*?>.*?<a.*?>(.*?)</a>',
                         re.S)
                     each_meeting_item = re.findall(pattern,
                                                    each_meeting_data)[0]
                     print each_meeting_item
                     end_date = each_meeting_item[1].strip()
                     print end_date
                     if not end_date:
                         end_date = begin_date
                     item['end_date'] = end_date
                     location = each_meeting_item[2].strip()
                     print location
                     if not location:
                         location = city
                     item['venue'] = location
                     organizer = each_meeting_item[3].strip()
                     print organizer
                     item['organizer'] = organizer
                     site = each_meeting_item[4].strip()
                     item['site'] = site
                     print site
                 items.append(item)
                 print item
             print 'Page %s fetched' % page
             print 'About to write page %s data' % page
             opera = DataOperator()
             opera.item_insert(data=items)
         else:
             print 'No data found for page %s' % page
Example #18
 def pre_spider(self):
     logger.info('Operations before the spider starts')
     data_operator = DataOperator()
     data_operator.truncate_table('eventlist_official_temp')
     data_operator.from_official_to_temp_official()
     data_operator.truncate_table('eventlist_last')
     data_operator.from_current_to_last()
     data_operator.truncate_table('eventlist_current')
     data_operator.truncate_table('eventlist_unique')
     data_operator.mysql_close()
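
pre_spider above only orchestrates DataOperator calls; the SQL behind them is not shown anywhere in these examples. Here is a plausible sketch of two of the table-rotation helpers, assuming a plain MySQL connection; the statement bodies are inferred from the method names, not taken from the source:

class DataOperatorSketch(object):
    # Hypothetical sketch; the SQL is guessed from the method names.
    def __init__(self, conn):
        self.conn = conn  # an already-open MySQL connection object

    def truncate_table(self, table):
        cur = self.conn.cursor()
        cur.execute('TRUNCATE TABLE %s' % table)  # table names come from code, never user input
        self.conn.commit()

    def from_current_to_last(self):
        cur = self.conn.cursor()
        cur.execute('INSERT INTO eventlist_last SELECT * FROM eventlist_current')
        self.conn.commit()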
Example #19
0
 def get_data(self, page, city_name):
     pattern = re.compile(
         '<dd>.*?<a.*?(\d+).html".*?>(.*?)</a>.*?<p>.*?<a.*?>(.*?)</a>.*?<i.*?>(.*?)</i>.*?<p>.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?</dd>',
         re.S)
     items = []
     url = 'http://zhanhui.3158.cn/zhxx/all/trade/%s/%s/' % (city_name,
                                                             str(page))
     print('Crawling page %s' % page)
     print(url)
     html = self.get_html(url)
     if html:
         data = re.findall(pattern, html)
         for i in data:
             item = {
                 'begin_date': ' ',
                 'end_date': ' ',
                 'id': ' ',
                 'title': ' ',
                 'industry': ' ',
                 'city': ' ',
                 'venue': ' ',
                 'organizer': ' ',
                 'site': ' ',
                 'visitor': ' ',
                 'area': ' ',
                 'history_info_tag': '0'
             }
             begin_date = i[3].split(' ')[0]
             end_date = i[3].split(' ')[-1]
             year = int(begin_date.split('-')[0])
             month = int(begin_date.split('-')[1])
             if year >= int(self.current_date[:4]) + 1 or year == int(
                     self.current_date[:4]) and month >= int(
                         self.current_date[-2:]):
                 item['sid'] = self.sid
                 item['begin_date'] = begin_date
                 item['end_date'] = end_date
                 id = i[0]
                 item['id'] = id
                 title = i[1]
                 print(title)
                 item['title'] = title
                 industry = i[2]
                 item['industry'] = industry
                 city = i[4]
                 item['city'] = city
                 venue = i[5]
                 item['venue'] = venue
                 h2 = 'http://zhanhui.3158.cn/zhxx/n%s.html' % i[0]
                 print(h2)
                 html2 = self.get_html(h2)
                 pattern_organizer = re.compile('主办单位:(.*?)</span>', re.S)
                 organizer_tmp1 = re.findall(pattern_organizer, html2)
                 if organizer_tmp1:
                     try:
                         organizer_tmp2 = organizer_tmp1[0].split(
                             '>')[1].split('<')[0]
                     except IndexError:
                         org = re.split('、|\s',
                                        organizer_tmp1[0].strip())[0]
                     else:
                         org = re.split('、|\s', organizer_tmp2)[0]
                 else:
                     org = ' '
                 print(org)
                 item['organizer'] = org
                 items.append(item)
         if items:
             print('Preparing to write page %s data' % page)
             opera = DataOperator()
             opera.item_insert(data=items)
         else:
             print('All data on page %s is expired; skipping the write!' % page)
     else:
         print('No data found for %s page %s' % (city_name, page))
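
The freshness test in get_data chains or/and without parentheses and is correct only because and binds tighter than or. An equivalent, easier-to-read form as a standalone helper (the name is_current_or_future is illustrative); it assumes current_date is a 'YYYYMM' string, as the slicing in the original suggests:

def is_current_or_future(begin_date, current_date):
    # begin_date: 'YYYY-MM-DD'; current_date: 'YYYYMM'.
    year, month = int(begin_date[:4]), int(begin_date[5:7])
    cur_year, cur_month = int(current_date[:4]), int(current_date[4:6])
    # Tuple comparison gives the same result as the original or/and chain.
    return (year, month) >= (cur_year, cur_month)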
Example #20
0
 def getItems(self, page):
     data = self.get_data(page=page)
     data = json.loads(data)
     if data:
         print 'Found data for page %s' % page
         meetings = data['events']
         items = []
         if meetings:
             for meeting in meetings:
                 if not meeting:
                     print 'All pages crawled'
                     break
                 item = {
                     'sid': self.sid,
                     'begin_date': ' ',
                     'end_date': ' ',
                     'id': ' ',
                     'title': ' ',
                     'industry': ' ',
                     'city': ' ',
                     'venue': ' ',
                     'organizer': ' ',
                     'site': ' ',
                     'visitor': ' ',
                     'area': ' ',
                     'history_info_tag': '0'
                 }
                 itemid = meeting['event_id']
                 title = meeting['event_name']
                 begin_date = meeting['event_begin_time'][:10]
                 print itemid, title
                 if ''.join(begin_date.split('-')[:2]) >= self.current_date:
                     end_date = meeting['event_end_time'][:10]
                     try:
                         city = meeting['event_city_info'][0][
                             'district_name']
                     except IndexError:
                         city = ''
                     try:
                         venue = meeting['event_venue_info'][0].get('title')
                     except IndexError:
                         venue = city
                     visitor = meeting.get('event_scale', ' ')
                     if not visitor:
                         visitor = ''
                     try:
                         organizer = meeting['event_sponsor'][0].get(
                             'ns_name', '')
                     except IndexError:
                         organizer = ''
                     print itemid, title, city, venue, begin_date, end_date, visitor, organizer
                     item['id'] = itemid
                     item['title'] = title
                     item['city'] = city
                     item['venue'] = venue
                     item['begin_date'] = begin_date
                     item['end_date'] = end_date
                     item['visitor'] = visitor
                     items.append(item)
                 else:
                     print 'Page %s: item with ID %s is expired' % (page, itemid)
             opera = DataOperator()
             opera.item_insert(data=items)
             return
         else:
             print 'No event data on page %s' % page
             return
     else:
         print 'Blocked at page %s; pausing 6 minutes before retrying!' % page
         for i in tqdm(range(3600)):
             time.sleep(.1)
         self.getItems(page)
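
The blocked-page handling above (retry moved out of the sleep loop, where the original re-invoked getItems 3600 times) still recurses with no depth limit, so a persistent ban would eventually overflow the stack. A bounded, iterative retry sketch; fetch, max_retries and pause_seconds are illustrative names, not from the original:

import time

def fetch_with_retry(fetch, page, max_retries=3, pause_seconds=360):
    # fetch(page) should return parsed data, or a falsy value when blocked.
    for attempt in range(1, max_retries + 1):
        data = fetch(page)
        if data:
            return data
        print('Blocked at page %s; pausing %ss (attempt %s)' % (page, pause_seconds, attempt))
        time.sleep(pause_seconds)
    return None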
Example #21
0
class HuoDongShu:
    def __init__(self, sid, current_date):
        self.url = 'http://www.huodongshu.com'
        self.current_date = current_date
        self.sid = sid

    def get_html(self, page, month):
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'www.huodongshu.com',
            'Origin': 'http://www.huodongshu.com',
            'Referer': 'http://www.huodongshu.com/html/find.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        params = {
            'count': '10',
            'type': '1',
            'category_one': '222',
            'category_two': 'all',
            'city_name': '222',
            'time_can': month + 3,
            'page': page
        }
        url = self.url + '/event/getComEventListPcData.do'
        print url
        response = requests.post(url, headers=headers, params=params)
        data = response.json()
        if data['msg'] == 'ok':
            return data['data']

    def get_items(self, month, page=1):
        data = self.get_html(page=page, month=month)
        pages, numbers = data['pageCount'], data['total']
        for page in range(1, int(pages) + 1):
            print 'Month %s: %s pages, %s records in total; crawling page %s' % (month, pages, numbers, page)
            data = self.get_html(page=page, month=month)
            events = data['list']
            items = []
            for event in events:
                item = {
                    'sid': self.sid,
                    'begin_date': ' ',
                    'end_date': ' ',
                    'id': ' ',
                    'title': ' ',
                    'industry': ' ',
                    'city': ' ',
                    'venue': ' ',
                    'organizer': ' ',
                    'site': ' ',
                    'visitor': ' ',
                    'area': ' ',
                    'history_info_tag': '0'
                }
                id = event.get('id')
                title = event.get('name')
                venue = event.get('place')
                try:
                    city = event.get('city_name').rstrip('市')
                except Exception as e:
                    print 'error:', e
                    address = jieba.analyse.extract_tags(
                        venue, allowPOS=['ns'])  # segment the venue name, keeping place-name (ns) tokens
                    if address:
                        city = address[0]
                    else:
                        city = ''

                begin_date = time.strftime(
                    '%Y-%m-%d', time.localtime(float(event.get('start_time'))))
                end_date = time.strftime(
                    '%Y-%m-%d', time.localtime(float(event.get('end_time'))))
                eachUrl = event.get('long_url')
                visitor = ''  # default so visitor is always defined, even without a detail URL
                if eachUrl:
                    eachData = requests.get(eachUrl).content
                    try:
                        visitor = BeautifulSoup(eachData).find(
                            'span', {'data-id': 'dimensions'}).text.rstrip('人')
                    except Exception as e:
                        print 'error:', e
                        visitor = ''
                item['city'] = city
                item['begin_date'] = begin_date
                item['end_date'] = end_date
                item['id'] = id
                item['title'] = title
                item['venue'] = venue
                item['visitor'] = visitor
                items.append(item)
                print id, title, city, begin_date, end_date, venue, visitor
            print 'Writing month %s page %s data' % (month, page)
            opera = DataOperator()
            opera.item_insert(data=items)
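
Example #21 converts epoch seconds to dates via time.strftime over time.localtime. The same conversion reads more directly with datetime; this is a stylistic alternative, not what the source uses, and the result depends on the local timezone:

from datetime import datetime

def epoch_to_date(ts):
    # Convert a Unix timestamp in seconds (possibly a string) to 'YYYY-MM-DD'.
    return datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d')

print(epoch_to_date('1446602940'))  # '2015-11-04' on a UTC+8 machine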