Exemplo n.º 1
0
 def weightLinksOnPage( self, bodySoup, winSize = 3 ):
     """Compute a weight for every link referenced in ``bodySoup``.

     ``ContentExtractor.extractMappedContentList`` is run on ``bodySoup``.
     Its first element is scanned, and every integer entry found there is
     used as an index into its second element to look up the link object.
     The link's ``href`` becomes the key, and the value is whatever
     ``self.weightWindowed`` returns for that position and ``winSize``.

     Args:
         bodySoup: Parsed page body handed to the content extractor.
         winSize: Window size forwarded unchanged to ``weightWindowed``.

     Returns:
         A dict mapping each ``href`` value to its weight. When the same
         ``href`` occurs more than once, the last occurrence wins.
     """
     links = {}
     ce = ContentExtractor()
     mapped = ce.extractMappedContentList(bodySoup)
     tokens = mapped[0]
     for entry in tokens:
         # Only integer entries refer to links; all other entries are
         # skipped. isinstance() replaces the exact-type comparison
         # against the Python-2-only ``types.IntType``.
         if not isinstance(entry, int):
             continue
         # NOTE(review): a link object without an 'href' presumably
         # yields None here, which then becomes a dict key -- confirm
         # that callers tolerate a None key.
         href = mapped[1][entry].get('href')
         links[href] = self.weightWindowed( tokens, entry, winSize )
     return links
Exemplo n.º 2
0
def process(pagedirs, contentdb):
    """Feed stored pages under ``pagedirs`` to a ``ContentExtractor``.

    Every sub-directory of ``pagedirs`` is treated as one source. The file
    names inside it are passed through ``diff_task`` together with
    ``contentdb``, and each name that comes back is opened, read in full
    and handed to ``ContentExtractor.parse``. The bare file name is passed
    as ``hashurl`` and the sub-directory name as ``source``.

    Args:
        pagedirs: Directory containing one sub-directory per source.
        contentdb: Value used to build the ``DB`` behind the extractor;
            also forwarded to ``diff_task``.
    """
    entries = os.listdir(pagedirs)
    extractor = ContentExtractor(db=DB(contentdb))
    for source in entries:
        source_root = os.path.join(pagedirs, source)
        # Plain files sitting next to the source directories are ignored.
        if os.path.isdir(source_root):
            # presumably diff_task drops pages already recorded in
            # contentdb -- TODO confirm against its definition
            pending = diff_task(os.listdir(source_root), contentdb)
            for hashurl in pending:
                page_path = os.path.join(source_root, hashurl)
                with open(page_path, 'r') as handle:
                    extractor.parse(page=handle.read(), source=source, hashurl=hashurl)
Exemplo n.º 3
0
def process(pagedirs, contentdb):
    """Parse the page files of every source directory in ``pagedirs``.

    For each sub-directory (one per source), the contained file names are
    passed to ``diff_task`` along with ``contentdb``. Each resulting file
    is read and given to ``ContentExtractor.parse`` with the file name as
    ``hashurl`` and the sub-directory name as ``source``.

    Args:
        pagedirs: Root directory holding one sub-directory per source.
        contentdb: Passed to ``DB`` for the extractor and to ``diff_task``.
    """
    source_names = os.listdir(pagedirs)
    database = DB(contentdb)
    parser = ContentExtractor(db=database)
    for source in source_names:
        directory = os.path.join(pagedirs, source)
        if not os.path.isdir(directory):
            # Only directories represent sources; skip anything else.
            continue
        # presumably diff_task returns the names still to be processed
        # -- verify against its definition
        for hashurl in diff_task(os.listdir(directory), contentdb):
            location = os.path.join(directory, hashurl)
            with open(location, 'r') as page_file:
                text = page_file.read()
                parser.parse(
                    page=text,
                    source=source,
                    hashurl=hashurl
                )