def weightLinksOnPage(self, bodySoup, winSize=3):
    """Score every hyperlink found in *bodySoup* by its surrounding content.

    Extracts the mapped content list from the page, and for each entry that
    is an int (an index marking a link position), looks up the corresponding
    tag's ``href`` and assigns it the windowed weight computed over a window
    of ``winSize`` around that position.

    :param bodySoup: parsed page body (BeautifulSoup-style tree) — passed
        straight through to ``ContentExtractor.extractMappedContentList``.
    :param winSize: half-window size handed to ``self.weightWindowed``.
    :return: dict mapping href (may be ``None`` if a tag lacks ``href``)
        to its computed weight.
    """
    links = {}
    ce = ContentExtractor()
    mc = ce.extractMappedContentList(bodySoup)
    for i in mc[0]:
        # Non-int entries are plain content, not link markers — skip them.
        # isinstance() replaces the Py2-only `type(i) != types.IntType`
        # check, which raises AttributeError on Python 3.
        if not isinstance(i, int):
            continue
        link = mc[1][i].get('href')
        links[link] = self.weightWindowed(mc[0], i, winSize)
    return links
def process(pagedirs, contentdb):
    """Walk every source directory under *pagedirs* and parse its pages.

    Each immediate subdirectory of ``pagedirs`` is treated as a source;
    pages already recorded in *contentdb* are filtered out via
    ``diff_task`` before being read and fed to ``ContentExtractor.parse``.
    The page's filename doubles as its ``hashurl`` identifier.
    """
    extractor = ContentExtractor(db=DB(contentdb))
    for site in os.listdir(pagedirs):
        site_dir = os.path.join(pagedirs, site)
        # Skip stray files at the top level — only directories are sources.
        if not os.path.isdir(site_dir):
            continue
        pending = diff_task(os.listdir(site_dir), contentdb)
        for name in pending:
            page_path = os.path.join(site_dir, name)
            with open(page_path, 'r') as fh:
                extractor.parse(page=fh.read(), source=site, hashurl=name)
def process(pagedirs, contentdb):
    """Parse all not-yet-processed pages for every source under *pagedirs*.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    ``process`` definition in this file and shadows it at import time —
    one of the two should be deleted.

    For each subdirectory (a "source") of ``pagedirs``, the pages not yet
    present in *contentdb* (per ``diff_task``) are read from disk and
    handed to ``ContentExtractor.parse``; the filename serves as the
    ``hashurl`` key.
    """
    ce = ContentExtractor(db=DB(contentdb))
    sources = os.listdir(pagedirs)
    for source in sources:
        sourcedir = os.path.join(pagedirs, source)
        if os.path.isdir(sourcedir):
            # Only directories are sources; anything else is ignored.
            for hashurl in diff_task(os.listdir(sourcedir), contentdb):
                full_path = os.path.join(sourcedir, hashurl)
                with open(full_path, 'r') as handle:
                    page_text = handle.read()
                ce.parse(page=page_text, source=source, hashurl=hashurl)