Exemplo n.º 1
0
 def __init__(self, folder):
     """Initialise parser state; *folder* is the directory that receives output files."""
     # base address of the forum being mined
     self._url = "http://board.netdoktor.de"
     self.outputFolder = folder
     self.board = None
     self.section = None
     self.dataToStore = []  # entries gathered since the last write-out
     # separate caches: one for section pages, one for article content
     self.pageCache = CacheHandler()
     self.contentCache = CacheHandler()
Exemplo n.º 2
0
 def __init__(self, folder):
     """Create a parser that writes its results into *folder*."""
     self._url = "http://board.netdoktor.de"  # forum base address
     # independent caches for page structure and article content
     self.pageCache = CacheHandler()
     self.contentCache = CacheHandler()
     self.board = None
     self.section = None
     self.dataToStore = []  # items waiting to be stored
     self.outputFolder = folder
Exemplo n.º 3
0
class Board_NetDoktorParser(object):
    '''Module for text mining from board.netdoktor.de'''

    # how many times a failing (None) page is re-fetched before it is skipped;
    # previously processSection retried forever and could hang on one bad page
    MAX_PAGE_RETRIES = 3

    def __init__(self, folder):
        '''Prepare caches and output location; *folder* is the output directory.'''
        self._url = "http://board.netdoktor.de"
        self.pageCache = CacheHandler()
        self.contentCache = CacheHandler()
        self.dataToStore = []  #list of items to store
        self.board = None
        self.section = None
        self.outputFolder = folder

    #add some stacionary data to all items in list
    def addItemToFinalList(self, name, data):
        '''Attach the constant field *name* = *data* to every collected item.'''
        for item in self.dataToStore:
            item[name] = data

    def updateContent(self, url):
        '''if new entries in article found, they are processed and added to the
        list to save.  Returns False when *url* could not be downloaded,
        True otherwise.'''
        page = getDataFrom(url, None, None)

        if page is None:
            return False

        pageParser = PageParser(page)
        notCached = pageParser.getEntriesList()
        # keep only entries the cache has not yet seen for this url
        eToStore = self.pageCache.cacheAndReturnNew(url, notCached)

        # remember where each new entry came from
        for entry in eToStore:
            entry["link"] = url
        log.debug("ADD %d new entries", len(eToStore))

        self.dataToStore.extend(eToStore)
        return True

    def processData(self, itemList):
        '''process links to articles in forum section page; if change detected,
        the article is re-fetched and cached.'''
        for item in itemList:
            cached = self.pageCache.getEntriesLen(item["url"])
            # +1: the opening post counts in addition to the comments
            loaded = item["comments"] + 1
            #new comments in article detected
            if cached < loaded:
                log.debug("need update for %s - %d new ent",
                          item["url"], loaded - cached)
                if self.updateContent(item["url"]):
                    self.pageCache.setEntriesLen(item["url"], loaded)
            else:
                log.debug("no new entries for %s", item["url"])

    def processSection(self, bsection):
        '''process all pages in forum section *bsection*, saving collected
        items after every page.'''
        actualPage = 1
        retries = 0
        sectionLenght = None  # total page count, discovered from the first page
        url = urljoin(self._url, bsection)
        while True:
            try:
                urll = url + "/" + str(actualPage)
                page = getDataFrom(urll, None, None)

                if page is None:
                    if sectionLenght is None:
                        # very first page failed - nothing to iterate over
                        log.debug("none data, return")
                        return
                    # transient failure on a later page: retry a bounded
                    # number of times, then fall through and skip the page
                    retries += 1
                    if retries < self.MAX_PAGE_RETRIES:
                        log.debug("none data, continue")
                        continue
                    log.debug("giving up on page %d", actualPage)
                else:
                    retries = 0
                    pageParser = PageParser(page)
                    if not sectionLenght:
                        #get max page in section
                        sectionLenght = pageParser.getSectionLenght()
                        log.debug("sectionLenght is %s", str(sectionLenght))

                    itemList = pageParser.getListOfItems()
                    self.processData(itemList)
                    #add stacionary data
                    self.addItemToFinalList("source", "http://board.netdoktor.de/")
                    self.addItemToFinalList("section", "netdoktor")
                    self.addItemToFinalList("lang", "de")
                    #SAVE!!!
                    self.createXmlandWrite(name=bsection.replace("/", "_"))
            except Exception as e:
                log.critical("%s", e)
                log.exception("Some exception in process section")

            # sectionLenght may still be None after an exception on page 1;
            # stop rather than loop blindly (matches the old py2 None-ordering)
            if sectionLenght is None or actualPage >= sectionLenght:
                return
            actualPage += 1
Exemplo n.º 4
0
class Board_NetDoktorParser(object):
    '''Module for text mining from board.netdoktor.de'''

    # retry budget for a page that downloads as None; the old code issued
    # `continue` without bound and could spin forever on one broken page
    MAX_PAGE_RETRIES = 3

    def __init__(self, folder):
        '''Set up caches and remember *folder* as the output directory.'''
        self._url = "http://board.netdoktor.de"
        self.pageCache = CacheHandler()
        self.contentCache = CacheHandler()
        self.dataToStore = []  #list of items to store
        self.board = None
        self.section = None
        self.outputFolder = folder

    #add some stacionary data to all items in list
    def addItemToFinalList(self, name, data):
        '''Set key *name* to *data* on every item collected so far.'''
        for item in self.dataToStore:
            item[name] = data

    def updateContent(self, url):
        '''if new entries in article found, they are processed and added to the
        list to save.  Returns False when the page download yields None.'''
        page = getDataFrom(url, None, None)

        if page is None:
            return False

        pageParser = PageParser(page)
        notCached = pageParser.getEntriesList()
        # the cache filters out entries already stored for this url
        eToStore = self.pageCache.cacheAndReturnNew(url, notCached)

        for entry in eToStore:
            entry["link"] = url  # tag each new entry with its source page
        log.debug("ADD %d new entries", len(eToStore))

        self.dataToStore.extend(eToStore)
        return True

    def processData(self, itemList):
        '''process links to articles in forum section page; when the live
        comment count exceeds the cached one, re-fetch the article.'''
        for item in itemList:
            cached = self.pageCache.getEntriesLen(item["url"])
            loaded = item["comments"] + 1  # +1 for the opening post
            #new comments in article detected
            if cached < loaded:
                log.debug("need update for %s - %d new ent",
                          item["url"], loaded - cached)
                if self.updateContent(item["url"]):
                    self.pageCache.setEntriesLen(item["url"], loaded)
            else:
                log.debug("no new entries for %s", item["url"])

    def processSection(self, bsection):
        '''process all pages in forum section *bsection*; results are written
        out after every page.'''
        actualPage = 1
        retries = 0
        sectionLenght = None  # page count of the section, read from page 1
        url = urljoin(self._url, bsection)
        while True:
            try:
                urll = url + "/" + str(actualPage)
                page = getDataFrom(urll, None, None)

                if page is None:
                    if sectionLenght is None:
                        log.debug("none data, return")
                        return
                    retries += 1
                    if retries < self.MAX_PAGE_RETRIES:
                        log.debug("none data, continue")
                        continue
                    # retry budget exhausted - move on to the next page
                    log.debug("giving up on page %d", actualPage)
                else:
                    retries = 0
                    pageParser = PageParser(page)
                    if not sectionLenght:
                        #get max page in section
                        sectionLenght = pageParser.getSectionLenght()
                        log.debug("sectionLenght is %s", str(sectionLenght))

                    itemList = pageParser.getListOfItems()
                    self.processData(itemList)
                    #add stacionary data
                    self.addItemToFinalList("source", "http://board.netdoktor.de/")
                    self.addItemToFinalList("section", "netdoktor")
                    self.addItemToFinalList("lang", "de")
                    #SAVE!!!
                    self.createXmlandWrite(name=bsection.replace("/", "_"))
            except Exception as e:
                log.critical("%s", e)
                log.exception("Some exception in process section")

            # guard against sectionLenght still being None (exception on
            # page 1) instead of relying on py2's None < int ordering
            if sectionLenght is None or actualPage >= sectionLenght:
                return
            actualPage += 1