Пример #1
0
    def add_stats_record(cls, statsd):
        """Insert one row of crawl statistics for the current project
        into the user's crawl database ('project_stats' table).

        statsd is a mapping with keys: links, processed, filtered,
        fatal, broken, filesinrepos, extservers, extdirs, files,
        bytes and fetchtime.  Silently returns if the sqlite3 module
        is unavailable.
        """

        sqlite3 = cls.try_import()
        if sqlite3 is None:
            # sqlite3 bindings missing - statistics are best-effort.
            return

        logconsole('Writing project statistics to crawl database...')
        dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
        conn = sqlite3.connect(dbfile)
        try:
            c = conn.cursor()
            # Tuple order must match the project_stats column order.
            # NOTE(review): the +1 on extservers/extdirs presumably
            # accounts for the primary server/directory - confirm.
            t = (cls.projid,
                 statsd['links'],
                 statsd['processed'],
                 statsd['filtered'],
                 statsd['fatal'],
                 statsd['broken'],
                 statsd['filesinrepos'],
                 statsd['extservers'] + 1,
                 statsd['extdirs'] + 1,
                 statsd['files'],
                 statsd['bytes'],
                 '%.2f' % statsd['fetchtime'])

            c.execute("insert into project_stats values(?,?,?,?,?,?,?,?,?,?,?,?)", t)
            conn.commit()
            c.close()
        finally:
            # Original leaked the connection; always release it.
            conn.close()
Пример #2
0
    def find_broken_links(self, event, *args, **kwargs):
        """Collect every URL with HTTP status 404 from the URL database
        into self.broken and write the list to a '404#<hash>.txt' file.

        Always returns False (the event-handler convention here).
        """
        urldb = objects.datamgr.get_urldb()

        # Walk the URL tree and gather all 404 URLs.
        for node in urldb.preorder():
            urlobj = node.get()
            if urlobj.status == 404:
                self.broken.append(urlobj.get_full_url())

        # Write the list to a file.
        # NOTE(review): str(hash(baseurl)) is not stable across runs
        # (string hash randomization), so the same crawl produces
        # different file names on different runs - confirm intent.
        baseurl = objects.queuemgr.get_base_url()
        fname = '404#' + str(hash(baseurl)) + '.txt'
        logconsole('Writing broken links to',fname)
        # Context manager guarantees the file is closed on errors too
        # (the original leaked the handle if a write raised).
        with open(fname, 'w') as f:
            f.write("Broken links for crawl starting with URL %s\n\n" % baseurl)
            for link in self.broken:
                f.write(link + '\n')

        return False
    def find_broken_links(self, event, *args, **kwargs):
        """Collect every URL with HTTP status 404 from the URL database
        into self.broken and write the list to a '404#<hash>.txt' file.

        Always returns False (the event-handler convention here).
        """
        urldb = objects.datamgr.get_urldb()

        # Walk the URL tree and gather all 404 URLs.
        for node in urldb.preorder():
            urlobj = node.get()
            if urlobj.status == 404:
                self.broken.append(urlobj.get_full_url())

        # Write the list to a file.
        # NOTE(review): str(hash(baseurl)) is not stable across runs
        # (string hash randomization), so the same crawl produces
        # different file names on different runs - confirm intent.
        baseurl = objects.queuemgr.get_base_url()
        fname = '404#' + str(hash(baseurl)) + '.txt'
        logconsole('Writing broken links to',fname)
        # Context manager guarantees the file is closed on errors too
        # (the original leaked the handle if a write raised).
        with open(fname, 'w') as f:
            f.write("Broken links for crawl starting with URL %s\n\n" % baseurl)
            for link in self.broken:
                f.write(link + '\n')

        return False
Пример #4
0
    def create_user_database(cls):
        """Create the per-user crawl database file (crawls.db) with the
        'projects' and 'project_stats' tables.

        Silently returns if the sqlite3 module cannot be imported.
        """

        sqlite3 = cls.try_import()

        if sqlite3 is None:
            return

        logconsole("Creating user's crawl database file in %s..." %
                   objects.config.userdbdir)

        dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
        conn = sqlite3.connect(dbfile)
        try:
            c = conn.cursor()

            # Table of crawl projects.
            # ("drop table if exists" removed: it caused problems on darwin.)
            c.execute(
                """create table projects (id integer primary key autoincrement default 0, time real, name text, url str, config str)"""
            )

            # Table of per-project crawl statistics, one column each for:
            # urls scanned, urls processed (fetched/crawled), urls
            # crawl-filtered, urls failed, urls with 404 errors, cache
            # hits, servers scanned, unique directories scanned, files
            # saved, bytes fetched and total crawl duration.
            c.execute(
                """create table project_stats (project_id integer primary key, urls integer, procurls integer, filteredurls integer, failedurls integer, brokenurls integer, cacheurls integer, servers integer, directories integer, files integer, data real, duration text)"""
            )

            # The original never committed; since Python 3.6 DDL is no
            # longer implicitly committed by sqlite3, so do it explicitly.
            conn.commit()
            c.close()
        finally:
            # Always release the connection (the original leaked it).
            conn.close()
Пример #5
0
    def add_stats_record(cls, statsd):
        """Insert one row of crawl statistics for the current project
        into the user's crawl database ('project_stats' table).

        statsd is a mapping with keys: links, processed, filtered,
        fatal, broken, filesinrepos, extservers, extdirs, files,
        bytes and fetchtime.  Silently returns if the sqlite3 module
        is unavailable.
        """

        sqlite3 = cls.try_import()
        if sqlite3 is None:
            # sqlite3 bindings missing - statistics are best-effort.
            return

        logconsole('Writing project statistics to crawl database...')
        dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
        conn = sqlite3.connect(dbfile)
        try:
            c = conn.cursor()
            # Tuple order must match the project_stats column order.
            # NOTE(review): the +1 on extservers/extdirs presumably
            # accounts for the primary server/directory - confirm.
            t = (cls.projid, statsd['links'], statsd['processed'],
                 statsd['filtered'], statsd['fatal'], statsd['broken'],
                 statsd['filesinrepos'], statsd['extservers'] + 1,
                 statsd['extdirs'] + 1, statsd['files'], statsd['bytes'],
                 '%.2f' % statsd['fetchtime'])

            c.execute("insert into project_stats values(?,?,?,?,?,?,?,?,?,?,?,?)",
                      t)
            conn.commit()
            c.close()
        finally:
            # Original leaked the connection; always release it.
            conn.close()
Пример #6
0
    def create_user_database(cls):
        """Create the per-user crawl database file (crawls.db) with the
        'projects' and 'project_stats' tables.

        Silently returns if the sqlite3 module cannot be imported.
        """

        sqlite3 = cls.try_import()

        if sqlite3 is None:
            return

        logconsole("Creating user's crawl database file in %s..." % objects.config.userdbdir)

        dbfile = os.path.join(objects.config.userdbdir, "crawls.db")
        conn = sqlite3.connect(dbfile)
        try:
            c = conn.cursor()

            # Table of crawl projects.
            # ("drop table if exists" removed: it caused problems on darwin.)
            c.execute("""create table projects (id integer primary key autoincrement default 0, time real, name text, url str, config str)""")

            # Table of per-project crawl statistics, one column each for:
            # urls scanned, urls processed (fetched/crawled), urls
            # crawl-filtered, urls failed, urls with 404 errors, cache
            # hits, servers scanned, unique directories scanned, files
            # saved, bytes fetched and total crawl duration.
            c.execute("""create table project_stats (project_id integer primary key, urls integer, procurls integer, filteredurls integer, failedurls integer, brokenurls integer, cacheurls integer, servers integer, directories integer, files integer, data real, duration text)""")

            # The original never committed; since Python 3.6 DDL is no
            # longer implicitly committed by sqlite3, so do it explicitly.
            conn.commit()
            c.close()
        finally:
            # Always release the connection (the original leaked it).
            conn.close()