def test1(self):
    try:
        file_crawler = FileCrawler("C://", self.session, max_files=10)
    except IntegrityError as e:  # "except IntegrityError, e" is Python 2-only syntax
        self.session.close()
        Crawl.dropAndCreate(str(e))  # e.message is deprecated; str(e) works everywhere
        self.fail(str(e))
def setUp(self):
    TestCase.setUp(self)
    from lib.Crawl import Crawl
    if not Crawl.exists():
        Crawl.createTable()
    self.assertTrue(Crawl.exists())
def setUp(self):
    TestCase.setUp(self)
    from lib.Crawl import Crawl
    self.assertTrue(Crawl.exists())
    Record.createTable()
    self.assertTrue(Record.exists())
def testGvizDataTable(self):
    import re
    import gviz_api

    session = SqlAlchemySessionFactory().createSqlAlchemySession()
    # Pass the test session explicitly, matching the call in test2;
    # the session was previously created but never used.
    data_table = Crawl.getGvizDataTable(session)
    self.assertIsInstance(data_table, gviz_api.DataTable)
    # ToResponse() wraps the table in a JavaScript callback envelope.
    r = re.compile(r"^google\.visualization\.Query\.setResponse\({.*}\);$")
    m = r.match(data_table.ToResponse())
    self.assertIsNotNone(m)
    session.close()
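# A minimal sketch of the envelope that gviz_api's DataTable.ToResponse()
# produces and that the regex in testGvizDataTable matches. The two-column
# description and the single row below are made-up example data, not the
# real Crawl schema.
import gviz_api

description = {"crawlId": ("number", "Crawl ID"), "started": ("string", "Started")}
table = gviz_api.DataTable(description)
table.LoadData([{"crawlId": 1, "started": "2014-01-01T00:00:00Z"}])
print(table.ToResponse())  # -> google.visualization.Query.setResponse({...});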
def __init__(self, path, sqlalchemy_session, max_files=None):
    Thread.__init__(self)
    self.skipCount = 0
    self.maxFiles = max_files
    self.path = path
    self.sqlAlchemySession = sqlalchemy_session
    self.hostName = _getHostName()
    self.crawl = Crawl()
    self.crawl.begin()
    self.sqlAlchemySession.add(self.crawl)
    self.sqlAlchemySession.commit()
def test2(self):
    session = Session()
    info(Crawl.getGvizDataTable(session))
    session.close()
import locale
import os
from datetime import datetime
from threading import Thread

import dateutil.tz

# Crawl, FileRecord, _getHostName, _GitBlobHash, utcnow, EXCLUDE_DIRECTORIES
# and BEST_BEFORE_PERIOD_IN_SECOND are assumed to come from this package.


class FileCrawler(Thread):

    def __init__(self, path, sqlalchemy_session, max_files=None):
        Thread.__init__(self)
        self.skipCount = 0
        self.maxFiles = max_files
        self.path = path
        self.sqlAlchemySession = sqlalchemy_session
        self.hostName = _getHostName()
        self.crawl = Crawl()
        self.crawl.begin()
        self.sqlAlchemySession.add(self.crawl)
        self.sqlAlchemySession.commit()

    def run(self):
        self.crawl.begin()  # restart the clock now that the thread is running
        for root, dirs, files in os.walk(self.path):
            # Prune every excluded directory in place; rebinding `dirs` would
            # not affect os.walk, and the original check only looked at dirs[0].
            dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRECTORIES]
            for f in files:
                file_record = FileRecord()
                path = os.path.join(root, f)
                absolute_path = os.path.abspath(path)
                url = "file://" + self.hostName + "/" + absolute_path
                file_record.setUrl(url)
                file_record.setCrawlId(self.crawl.crawlId)
                if file_record.exists(self.crawl.agentId, self.sqlAlchemySession,
                                      BEST_BEFORE_PERIOD_IN_SECOND):
                    self.skipCount += 1
                    continue
                stat = os.stat(path)
                # Hash the file contents on a worker thread while the metadata
                # fields are filled in.
                git_blob_hash = _GitBlobHash(path, stat)
                git_blob_hash.start()
                file_record.setSize(stat.st_size)
                created_datetime = datetime.fromtimestamp(stat.st_ctime)
                created_datetime = created_datetime.replace(tzinfo=dateutil.tz.tzlocal())
                file_record.setCreated(created_datetime)
                last_modified_datetime = datetime.fromtimestamp(stat.st_mtime)
                last_modified_datetime = last_modified_datetime.replace(tzinfo=dateutil.tz.tzlocal())
                file_record.setLastModified(last_modified_datetime)  # naive or aware?
                file_record.setLastSeen(utcnow())
                git_blob_hash.join()
                hash_string = git_blob_hash.getGitBlobHash()
                if hash_string is not None:
                    file_record.setUri("git:///blob/" + hash_string)
                self.sqlAlchemySession.add(file_record)
                self.crawl.increment(git_blob_hash.getReadSize())
                if self.maxFiles and self.crawl.getNumberOfProcessedItems() >= self.maxFiles:
                    break
            else:
                continue
            break  # max_files reached: stop walking instead of moving to the next directory
        self.crawl.end()
        self.sqlAlchemySession.commit()
        self.sqlAlchemySession.close()

    def getNumberOfProcessedFiles(self):
        return self.crawl.getNumberOfProcessedItems()

    def getNumberOfProcessedBytes(self):
        return self.crawl.getNumberOfProcessedBytes()

    def getFilesPerSecond(self):
        return self.crawl.getFilesPerSecond()

    def getBytesPerSecond(self):
        return self.crawl.getBytesPerSecond()

    def __str__(self):
        locale.setlocale(locale.LC_ALL, "")
        return "%dsec %s (%s) bytes, %s/%s (%d) files" % (
            self.crawl.getElapsedSeconds(),
            locale.format("%d", self.getNumberOfProcessedBytes(), grouping=True),
            locale.format("%d", self.getBytesPerSecond(), grouping=True),
            locale.format("%d", self.getNumberOfProcessedFiles(), grouping=True),
            locale.format("%d", self.skipCount, grouping=True),
            self.getFilesPerSecond())
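# A minimal usage sketch for FileCrawler, assuming the session factory used in
# the tests above; "/tmp" and the 100-file cap are arbitrary example values.
session = SqlAlchemySessionFactory().createSqlAlchemySession()
crawler = FileCrawler("/tmp", session, max_files=100)
crawler.start()  # FileCrawler is a Thread, so run() executes on a worker thread
crawler.join()   # run() commits and closes the session when the walk finishes
print(crawler)   # __str__ reports elapsed seconds plus byte and file counters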
def setUp(self):
    Record.createTable()
    self.assertTrue(Record.exists())
    Crawl.createTable()
    self.assertTrue(Crawl.exists())
    self.session = SqlAlchemySessionFactory().createSqlAlchemySession()
def setUp(self):
    Record.createTable()
    Crawl.createTable()