Example #1
File: otofun_net.py Project: nvn6w/Crawler
    def getThreadsInTopic(self, topicUrl):
        '''
        Get all threads in a topic
        '''

        res = []
        try:
            html = Request.get_page_content(topicUrl)
            soup = BeautifulSoup(html)
            
            threads = soup.find('ol', {'id' : 'threads'})        
            if threads:
                #find thread
                for thread in threads.findAll('h3', {'class' : 'threadtitle'}):
                    #print thread
                    tLink = thread.find('a', {'class' : 'title'})
                    if tLink:
                        tUrl  = tLink['href']
                        pos = tUrl.find('?s=')
                        if pos != -1:  # find() returns -1 when '?s=' is absent
                            tUrl = tUrl[0:pos]
                        tUrl = 'http://www.otofun.net/forums/' + tUrl
                        if tUrl not in res:
                            res.append(tUrl)
        except:
            print 'Error when getting threads in topic'
        return res
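
Note: every example in this listing calls a Request.get_page_content helper that is never shown. A minimal sketch of such a helper, assuming it is just a thin wrapper around an HTTP GET (the class below is a guess for illustration, not the actual nvn6w/Crawler code):

import urllib2

class Request:
    @staticmethod
    def get_page_content(url, timeout=30):
        '''Fetch a URL and return the raw HTML body (hypothetical implementation).'''
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        return urllib2.urlopen(req, timeout=timeout).read()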
Example #2
File: otofun_net.py Project: nvn6w/Crawler
 def getTotalPageInTopic(self, topicUrl):
     '''
         Get the total number of pages in a topic
     '''
     total = 0
     try:
         html = Request.get_page_content(topicUrl)
         soup = BeautifulSoup(html)
         nav = soup.find('div', {'class' : 'threadpagenav'})
         if nav:
             lastPage = nav.find('span', {'class' : 'first_last'})
             if lastPage:
                 aLink = lastPage.find('a')
                 if aLink:
                     url = aLink['href']
                     pos1 = url.find('/page')
                     pos2 = url.find('?s=')
                     if pos1 != -1:
                         if pos2 != -1: # the '?s=' session suffix is present
                             page = url[pos1+5:pos2]
                         else:
                             page = url[pos1+5:]
                         total = int(page)
     except:
         print 'Error when getting total page in topic'
     return total
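
As a quick check of the slicing above, with a made-up URL of the shape the offsets imply ('/page' is 5 characters long, hence pos1+5):

url = 'threads/12345-some-title/page7?s=abcdef'  # hypothetical URL shape
pos1 = url.find('/page')
pos2 = url.find('?s=')
print url[pos1+5:pos2]  # -> '7'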
Example #3
 def getThreadsInTopic(self, topicUrl):
     '''
     Get all root threads in a topic
     '''
     res = []
     try:
         html = Request.get_page_content(topicUrl)
         soup = BeautifulSoup(html)
         ulThread = soup.find('ul', {'id' : 'threads'})
         #print ulThread
         if ulThread:
             threads = ulThread.findAll('li', {'class' : 'threadbit_nam_fix_select'})    
             if threads:                 
                 for thread in threads:
                     try:
                         tLink = thread.find('a', {'class' : 'title'})
                        if tLink:
                             tUrl = tLink['href']
                             res.append(tUrl)
                             #print tUrl
                             #print '--------------------'
                    except Exception, e:
                        print e.message
                        tb = traceback.format_exc()
                        print tb
     except Exception, e:
         print e.message
         tb = traceback.format_exc()
         print tb
     return res
Example #4
 def getThreadsInTopic(self, topicUrl):
     '''
     Get all threads in a topic
     '''
     
     res = []
     #html = urllib.urlopen(topicUrl).read()
     html = Request.get_page_content(topicUrl)
     soup = BeautifulSoup(html)
     
     threads = soup.find('ol', {'id' : 'threads'})        
     if threads:                    
         for thread in threads.findAll('li', {'class' : 'threadbit '}):                            
             tLink = thread.find('a', {'class' : 'title'})
             if tLink:
                 tUrl  = tLink['href']
                 if tUrl:
                     pos = tUrl.find('?s=') # strip the '?s=...' session suffix
                     if pos != -1:
                         tUrl = tUrl[0:pos]
                     # prepend the base URL
                     tUrl = 'http://www.lamchame.com/forum/' + tUrl
                     
                     res.append(tUrl.encode('utf-8'))
                     #print tUrl 
                     #print '-----------'
     return res
Example #5
 def getAllTopics(self):
     '''
     Get the list of topics in the forum
     '''
     res = []
     try:
         baseUrl = 'http://www.webtretho.com/forum/f'
         
         url = 'http://www.webtretho.com/forum/search.php?search_type=1&contenttype=vBForum_Post'
         html = Request.get_page_content(url)
         soup = BeautifulSoup(html)
         
         soupCates = soup.find('select', {'id' : 'forumchoice'})
         cates = soupCates.findAll('option')
         for cate in cates:
             topicNumber = cate['value']
             
             if topicNumber.isdigit():                
                 topicUrl = baseUrl + topicNumber + '/'
                 res.append(topicUrl)
     except Exception, e:
         print e.message
         tb = traceback.format_exc()
         print tb
     return res
Example #6
    def getThreadDetail(self, url, fileContent = '', page = 1):

        res = ()
        html = ''
        try:
            html = Request.get_page_content(url)
        except Exception, e:
            return None
Example #7
File: otofun_net.py Project: nvn6w/Crawler
    def getThreadDetail(self, url):
        res = {}
        
        try :
            html = Request.get_page_content(url)
            soup = BeautifulSoup(html)
            #print soup
            postContainer = soup.find('ol', {'id' : 'posts'})
            
            title = soup.find('span', {'class' : 'threadtitle'}).get_text().strip()
            
            posts = postContainer.findAll('li', {'class' : 'postbit postbitim postcontainer'})
            
            count = 0
            comments = []
            for post in posts:
                
                #print post
                count += 1 
                                   
                postContent = post.find('blockquote', {'class' : 'postcontent restore'})
                postContent = postContent.get_text()
                #postContent = re.sub('<br/>+', '_NEW_LINE_', postContent)
                postContent = re.sub('[\t]+', ' ', postContent)
                postContent = re.sub('[ ]+', ' ', postContent)
                postContent = re.sub('[\\r\\n]+', '\n', postContent)
                postContent = postContent.strip()            
                
                # date information
                dateInfo = post.find('span', {'class' : 'postdate'}).get_text().strip()
                dateInfo = re.sub('\s+', ' ', dateInfo)

                # user information
                userInfo = post.find('div', {'class' : 'username_container'}).find('strong').get_text().strip()
                
                info = {'user' : userInfo, 'date': dateInfo, 'content' : postContent}
                
                if count == 1: # first post in the thread
                    postInfo = info
                    postInfo['title'] = title
                    res['post'] = postInfo
                else:  # subsequent posts are comments
                    comments.append(info)
                    
            res['comments'] = comments
            
        except:
            #print 'ERROR when crawling URL : ', url
            print 'ERROR when getting thread detail'
        
        return res    
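
A hypothetical usage of getThreadDetail (the crawler instance and thread URL are placeholders; on success the method returns a dict with 'post' and 'comments' keys):

detail = crawler.getThreadDetail('http://www.otofun.net/forums/threads/12345')  # placeholder URL
post = detail.get('post')
if post:
    print post['title'], '-', post['user'], '-', post['date']
    print len(detail.get('comments', [])), 'comments'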
Example #8
 def getAllTopics(self):
     res = []
     
     baseUrl = 'http://www.lamchame.com/forum/forumdisplay.php/'
     
     searchUrl = 'http://www.lamchame.com/forum/search.php?search_type=1&contenttype=vBForum_Post'
     html = Request.get_page_content(searchUrl)
     soup = BeautifulSoup(html)
     
     soupCates = soup.find('select', {'id' : 'forumchoice'})
     cates = soupCates.findAll('option')
     for cate in cates:
         topicNumber = cate['value']            
         if topicNumber.isdigit():                
             title = cate.string.strip()
             topicUrl = baseUrl + topicNumber + '-' + title.replace(' ', '-')
             res.append(topicUrl.encode('utf-8'))
     return res
Example #9
 def getAllTopics(self):
     res = []
     
     baseUrl = 'http://www.otofun.net/'
     url = 'http://www.otofun.net/forum.php'
     
     html = Request.get_page_content(url)
     soup = BeautifulSoup(html)
     links = soup.findAll('a')
     for link in links:
         u = link.get('href', '')  # some anchors lack an href attribute
         if u.startswith('forums/'):
             pos = u.find('?s=')
             if pos != -1:
                 u = u[0:pos]
             u = baseUrl + u
             
             res.append(u)
     return res  
Example #10
 def getTotalPageInTopic(self, topicUrl):
     '''
     Get total page in a topic
     '''
     total = 0
     try:
         html = Request.get_page_content(topicUrl)
         soup = BeautifulSoup(html)
         pageNav = soup.find('div', {'class' : 'threadpagenav'})
         if pageNav:
             lastPage = pageNav.find('span', {'class' : 'first_last1'})
             if lastPage:
                 #print lastPage.string
                 total = int(lastPage.string)
     except:
         print 'Error when getting total page'
     
     return total
Example #11
File: otofun_net.py Project: nvn6w/Crawler
 def getAllTopics(self):
     res = []
     try:
         baseUrl = 'http://www.otofun.net/'
         url = 'http://www.otofun.net/forum.php'
         
         html = Request.get_page_content(url)
         soup = BeautifulSoup(html)
         links = soup.findAll('a')
         for link in links:
             u = link.get('href', '')  # some anchors lack an href attribute
             if u.startswith('forums/'):
                 pos = u.find('?s=')
                 if pos != -1:
                     u = u[0:pos]
                 u = baseUrl + u                
                 res.append(u.encode('utf-8'))
     except:
         print 'Error when getting all topics'
     return res
Example #12
 def getTotalPageInThread(self, url):
     '''
         Get the total number of pages in a thread
     '''
     total = 1
     html = Request.get_page_content(url)
     soup = BeautifulSoup(html)
     pageInfo = soup.find('div', {'class' : 'pageRefix'})
     if pageInfo:
         lastPage = pageInfo.find('a', {'class' : 'arrowLstPage'})
         if lastPage:
             link = lastPage['href']
             # the last-page href ends in 'index<N>.html'; extract N
             m = re.search(r'index(\d+)\.html', link)
             if m:
                 total = int(m.group(1))
     return total
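
A quick sanity check of the 'index<N>.html' extraction, using a made-up href:

import re
link = 'some-thread/index12.html'  # hypothetical last-page href
m = re.search(r'index(\d+)\.html', link)
if m:
    print int(m.group(1))  # -> 12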
Example #13
    def getThreadsInTopic(self, topicUrl, page = 1):
        '''
        Get all threads in a topic
        '''

        res = {}
        html = Request.get_page_content(topicUrl)
        soup = BeautifulSoup(html)
        
        threads = soup.find('ol', {'id' : 'threads'})        
        if threads:
            #find thread
            for thread in threads.findAll('h3', {'class' : 'threadtitle'}):
                #print thread
                tLink = thread.find('a', {'class' : 'title'})
                if tLink:
                    tUrl  = tLink['href']
                    pos = tUrl.find('?s=')
                    if pos != -1:  # strip the '?s=...' session suffix
                        tUrl = tUrl[0:pos]
                    tUrl = 'http://www.otofun.net/forums/' + tUrl
                    if tUrl not in self.listUrl:
                        self.listUrl.append(tUrl)
                        print 'Thread ',tUrl
                        print tLink.string
                        print '==========================='

                        query = u"INSERT INTO url SET url = '%s', domain = '%s', created_date = UNIX_TIMESTAMP(NOW()), modified_date = UNIX_TIMESTAMP(NOW()), done = 0, running = 0" %(tUrl, 'otofun.net')
                        self.dbCursor.execute(query)
                        self.connection.commit()

                        # if self.COUNT_TOPIC % 100 == 0:
                        #     import os;
                        #     if not os.path.exists(self.dataDir + str(self.COUNT_TOPIC) + '-' + str(self.COUNT_TOPIC + 100)):
                        #         os.makedirs(self.dataDir + str(self.COUNT_TOPIC) + '-' + str(self.COUNT_TOPIC + 100))
                        #     self.dataFolder = self.dataDir + str(self.COUNT_TOPIC) + '-' + str(self.COUNT_TOPIC + 100)
                        #
                        # self.getThreadDetail(tUrl)
                        # self.COUNT_TOPIC += 1
                        #
                        # print 'Topic number: ', self.COUNT_TOPIC
                        # print 'Post number: ', self.COUNT_POST
                    time.sleep(1)
                print '-----------'

        #find sub-topic
        topics = soup.find('ol', {'class' : 'subforumlist'})
        if topics:
            for topic in topics.findAll('li', {'class' : 'subforum'}):
                tLink = topic.find('a')
                if tLink:
                    tUrl  = tLink['href']
                    if tUrl not in self.listUrl:
                        self.listUrl.append(tUrl)
                        if 'http://' not in tUrl:
                            tUrl = self.baseUrl + tUrl
                        print 'Topic Level- ', page, tUrl
                        print tLink.string
                        self.getThreadsInTopic(tUrl)
                print '-----------'
        # move on to the next page of this topic
        page = page + 1
        if '?' in topicUrl:
            topicUrl = topicUrl + '&page=' + str(page)
        else:
            topicUrl = topicUrl + '?page=' + str(page)
        if threads:  # only recurse while the page still returned a thread list
            self.getThreadsInTopic(topicUrl, page)
        return res
Example #14
    def getThreadDetail(self, url):
        '''
            Get the detailed information of a thread
        '''
        #res = { 'post' : {'user' : '', 'title': '', 'date' : '', 'content' : ''}, 'comments' : [{'user' : '', 'date': '', 'content' : ''}] }
        res = {}
        try:
            html = Request.get_page_content(url)
            soup = BeautifulSoup(html)        
            soupPost = soup.find('ol', {'id' : 'posts'})
            
            title = soup.find('div', {'id' : 'widgetRefix'}).find('h1').get_text().strip()
            #print title
                
            # list posts
            posts = soupPost.findAll('li', {'class' : 'postbit postbitim postcontainer'})
            #print posts
            #print len(posts)
            #print posts.size()            
            count = 0
            comments = []
                        
            for post in posts:
                
                #print post
                count += 1 
                                   
                postContent = post.find('blockquote', {'class' : 'postcontent restore'})
                #postContent = postContent.renderContents()
                postContent = postContent.get_text()
                #print postContent
                #print '----------------------'
                postContent = re.sub('[\t]+', ' ', postContent)
                postContent = re.sub('[ ]+', ' ', postContent)
                postContent = re.sub('[\\r\\n]+', '\n', postContent)
                postContent = postContent.strip()
                #print soup2.get_text().strip()
                
                #print postContent
                
                # date information
                dateInfo = post.find('span', {'class' : 'postdate'}).get_text().strip()
                dateInfo = re.sub('\s+', ' ', dateInfo)
                #print dateInfo

                # user information
                userInfo = post.find('div', {'class' : 'username_container'}).get_text().strip()
                #print userInfo
                
                info = {'user' : userInfo, 'date': dateInfo, 'content' : postContent}
                
                if count == 1: # first post in the thread
                    postInfo = info
                    postInfo['title'] = title
                    res['post'] = postInfo
                else:  # subsequent posts are comments
                    comments.append(info)
                #print '--------------------------------'
                
            res['comments'] = comments
            #print res
            
        except Exception, e:
            print e.message
            tb = traceback.format_exc()
            print tb
        return res
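
Taken together, the methods in these examples form a simple crawl pipeline: list the forum's topics, walk each topic for thread URLs, then fetch each thread's detail. A sketch of how the list-returning variants (Examples #4, #7, #8, #10) would be chained, assuming a hypothetical wrapper class that provides them:

crawler = ForumCrawler()  # hypothetical class; the listing only shows its methods
for topicUrl in crawler.getAllTopics():
    print topicUrl, '-', crawler.getTotalPageInTopic(topicUrl), 'pages'
    for threadUrl in crawler.getThreadsInTopic(topicUrl):
        detail = crawler.getThreadDetail(threadUrl)
        if detail.get('post'):
            print detail['post']['title']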