Пример #1
0
    def readWebPage(self, urlString, depth=1, isExternal=False):
        """Rebuild a ``WebPage`` and its outgoing links from the database.

        The ``websites`` table is searched for a row whose ``address`` equals
        the parsed form of ``urlString``. When one is found, a ``WebPage`` is
        created from it and every page it links to is attached as ``links``.

        Args:
            urlString: URL to look up; it goes through ``WebPage.parseUrl``
                before being compared with the stored address.
            depth: Depth given to the page when the ``session`` table has no
                ``depth`` entry for it.
            isExternal: Forwarded unchanged to the ``WebPage`` constructor.

        Returns:
            The reconstructed ``WebPage``, or ``None`` when no ``websites``
            row matches the address.
        """
        # NOTE(review): `all` / `eq` are presumably the query-filter helpers
        # imported by this module rather than the builtin `all` - confirm.
        address = WebPage.parseUrl(urlString).string
        matches = self.db.websites.search(
            filters=all(eq('address', address))).rows()
        if len(matches) == 0:
            return None

        siteRow = matches[0]
        pageId = siteRow[0]

        # A depth recorded for this page in the session table takes
        # precedence over the caller-supplied default.
        storedDepth = self.db.session.search(
            'depth', all(eq('website_id', pageId)))
        if len(storedDepth) > 0:
            depth = storedDepth[0][0]

        result = WebPage(url=siteRow[1], depth=depth, isExternal=isExternal)

        # Each returned row pairs this page's column value (w) with that of a
        # page it links to (r); only the linked side, row[1], is used below.
        sql = (
            'SELECT w.{0}, r.{0} from links join websites as w on links.{1} = w.id join websites as r on links.{2} = r.id WHERE w.id = {3};'
            .format(self.db.websites.fields[1], self.db.links.fields[1],
                    self.db.links.fields[2], pageId))
        linkRows = self.db.execute(sql)

        result.links = [
            WebPage(url=linkRow[1], parent=result, depth=depth + 1)
            for linkRow in linkRows
        ]
        return result
Пример #2
0
def test():
    """Exercise the website history database by hand.

    Downloads one page, inserts it into a fresh ``WebsiteDatabase``, adds it
    to the session if it is not there yet, reads it back, and finally prints
    the result of ``wasPageVisited`` for a second, different page.
    """
    storedPage = WebPage(url='pduch.kis.p.lodz.pl')
    storedPage.downloadContent()

    history = WebsiteDatabase()
    history.insertWebpage(storedPage, connection=True)
    alreadyInSession = history.isInThisSession(storedPage)
    if not alreadyInSession:
        history.appendSession(storedPage)

    # The page read back from the database is not inspected.
    history.readWebPage('pduch.kis.p.lodz.pl')

    otherPage = WebPage(url='http://www.kis.p.lodz.pl/')
    print(history.wasPageVisited(otherPage))
Пример #3
0
 def _taskHandler(self, url):
     """Fetch ``url`` and hand the page to the save and href-collection steps.

     Fetching the page source and then saving it are both heavily blocking
     operations, so this handler is meant to be run by a worker thread.
     Nothing is saved or collected when the fetch does not succeed.
     """
     page = WebPage(url)
     if not page.fetch():
         return
     self._saveTaskResults(page)
     self._addUnvisitedHrefs(page)
Пример #4
0
 def _taskHandler(self, url):
     '''Process one URL: fetch it, then save it and collect its hrefs.

     Functions whose names start with ``_`` are the ones placed in the queue
     for the worker threads to pick up.
     '''
     fetched_page = WebPage(url)
     fetch_ok = fetched_page.fetch()
     if not fetch_ok:
         # Fetch failed: there is nothing to save or to follow.
         return
     self._saveTaskResults(fetched_page)
     self._addUnvisitedHrefs(fetched_page)
Пример #5
0
 def selfTesting(self, args):
     """Run a quick self-check of network access and database writability.

     Fetches the Baidu home page, then reports one of three outcomes on
     standard output: the fetch returned ``None``, the database is not
     available, or the page was saved successfully.

     Args:
         args: Parsed options; only ``args.dbFile`` is read, for the
             database-permission message.
     """
     url = 'http://www.baidu.com/'
     print('\nVisiting www.baidu.com')
     pageSource = WebPage(url).fetch()
     # Identity test instead of `== None`: equality can be overridden by the
     # returned object, whereas `is None` only matches the None singleton.
     if pageSource is None:
         print('Please check your network and make sure it\'s connected.\n')
     elif not self._isDatabaseAvaliable():
         print('Please make sure you have the permission to save data: %s\n' % args.dbFile)
     else:
         self._saveTaskResults(url, pageSource)
         print('Create logfile and database Successfully.')
         print('Already save Baidu.com, Please check the database record.')
         print('Seems No Problem!\n')
Пример #6
0
 def selfTesting(self):
     """Run a quick self-check of network access and database writability.

     Fetches the Baidu home page directly, then reports one of three
     outcomes on standard output: the fetch returned ``None``, the database
     is not available, or the page was saved successfully.
     """
     url = 'http://www.baidu.com'
     print('\nVisiting www.baidu.com using directly')
     my_web = WebPage(url)
     pageSource = my_web.fetch()
     # Test the network connection. Identity test instead of `== None`:
     # equality can be overridden by the returned object, whereas `is None`
     # only matches the None singleton.
     if pageSource is None:
         print('please check your network')
     elif not self.isDatabaseAvaliable():
         # NOTE(review): `args` is not a parameter of this method; presumably
         # it is a module-level name - confirm it is defined at call time.
         print('please make sure you have the permission to save data: %s\n' % args.dbFile)
     else:
         self._saveTaskResults(my_web)
         print('save data successfully')
         print('seems all is ok')
Пример #7
0
    def __init__(self, args, depth=1):
        """Set up the crawl state for the URLs given in ``args``.

        Args:
            args: Parsed options. ``args.url`` is iterated to build the start
                pages; ``args.graph`` and ``args.rank`` decide whether a web
                graph is created. The object is also kept as ``options``.
            depth: Stored as ``self.depth``.
        """
        startPages = []
        for address in args.url:
            startPages.append(WebPage(address))
        self.links = startPages

        self.depth = depth
        self.historyDb = WebsiteDatabase()
        self.done = False
        self.options = args

        # One Result per start-page domain.
        perDomain = {}
        for startPage in self.links:
            perDomain[startPage.url.domain] = Result()
        self.results = perDomain

        self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")

        # The graph is only built when graph output or ranking was requested;
        # every start-page domain is added to it as a node.
        needsGraph = args.graph or args.rank
        if needsGraph:
            self.webGraph = Graph(distance=30.0)
            for startPage in self.links:
                self.webGraph.add_node(
                    startPage.url.domain, radius=15, fill=(1, 0, 0, 0.5))
Пример #8
0
 def _taskHandler(self, url):
     """Fetch ``url``; on success save the page and collect its hrefs.

     When ``fetch()`` returns a falsy value the page is neither saved nor
     passed on for href collection.
     """
     target = WebPage(url)
     succeeded = target.fetch()
     if not succeeded:
         return
     self._saveTaskResults(target)
     self._addUnvisitedHrefs(target)