Example #1
0
 def testFileList(self):
     """Optionally check all the filetype => category mappings for a list of
     paths provided in a file.
     """
     # Guard clause: without --filelist there is nothing to do.
     if not filelist_file:
         logging.info("Skipping filelist test. To run, specify --filelist=path, where path is a path to a list of files")
         return
     logging.info("Running filelist test using file %s" % filelist_file)
     # Read as bytes and decode explicitly so the test behaves the same
     # regardless of the platform's default encoding.
     with open(filelist_file, "rb") as listing:
         for raw_line in listing:
             candidate = raw_line.decode("utf-8").rstrip()
             (desc, cat) = filetype_utils.get_file_description_and_category(candidate)
             self._check_category(candidate, desc, cat)
 def get_categories(self, log):
   """Classify every path in log["path"].

   Returns a tuple of three parallel lists: (filetypes, categories,
   tag_lists), one entry per input path.
   """
   triples = []
   for raw_path in log["path"]:
     local_path = p2f(raw_path)
     ftype, cat = filetype_utils.get_file_description_and_category(local_path)
     triples.append((ftype, cat, filetype_utils.get_tags(local_path)))
   if not triples:
     return ([], [], [])
   # Unzip the per-path triples into three parallel lists.
   filetypes, categories, tag_lists = (list(col) for col in zip(*triples))
   return (filetypes, categories, tag_lists)
Example #3
0
 def _tc(self, path, filetype, category, indexable):
     """The work for running a single testcase"""
     actual_type, actual_cat = \
         filetype_utils.get_file_description_and_category(path)
     # Compare the classifier's answers against the expected values.
     self.assertEqual(actual_type, filetype,
                      "Expecting filetype %s for %s, got %s" %
                      (filetype, path, actual_type))
     self.assertEqual(actual_cat, category,
                      "Expecting category %s for %s, got %s" %
                      (category, path, actual_cat))
     self._check_category(path, actual_type, actual_cat)
     # Indexability is checked separately from the type/category mapping.
     actual_idx = filetype_utils.is_indexable_file(path)
     self.assertEqual(actual_idx, indexable,
                      "Path %s is %sindexable, expecting %sindexable" %
                      (path, "" if actual_idx else "not ",
                       "" if indexable else "not "))
Example #4
0
 def send_file(self, host, volume, path, stat):
   """Build a listing record for one file and push it to the "output" stream.

   Args:
     host: hostname prefixed onto the path for the record.
     volume: volume identifier stored in the record.
     path: local filesystem path of the file.
     stat: os.stat() result for path (size, mode, uid are recorded).
   """
   listing = {}
   listing["path"] = [host + ":" + path]
   listing["size"] = [stat.st_size]
   listing["perm"] = [stat.st_mode]
   listing["owner"] = [stat.st_uid]
   (filetype,category) = filetype_utils.get_file_description_and_category(path)
   listing["volume"] = [volume,]
   listing["filetype"] = [filetype,]
   listing["category"] = [category,]
   # `in` replaces the Python-2-only dict.has_key(). Content is embedded only
   # when metadata-only mode is off and the file is indexable.
   if ('only_metadata' not in self.config or self.config['only_metadata'] == False) and self.indexable_file(path):
     # Open in binary mode: base64.b64encode needs raw bytes, and text mode
     # can corrupt or fail to decode binary file content.
     with open(path, "rb") as f:
       listing["data"] = [base64.b64encode(f.read())]

   log = Log()
   log.set_log(listing)
   self.buffered_push("output", log)
Example #5
0
def crawl_files(directory_to_crawl):
    """Walk a directory tree, classify every file, and write two CSV reports.

    Writes ./file_data.csv (one row per file: path, size, category, type,
    indexability, first tag) and ./aggregate_data.csv (counts and sizes
    grouped by category, type, and tag), then prints a summary.

    Args:
      directory_to_crawl: directory path; ~ is expanded and the path made
        absolute. Must be an existing directory.
    """
    path = os.path.abspath(os.path.expanduser(directory_to_crawl))
    assert os.path.isdir(path)
    print("Crawling directory %s" % path)
    total_size = 0
    total_cnt = 0
    indexed_size = 0
    indexed_cnt = 0
    size_by_category = {}
    cnt_by_category = {}
    size_by_type = {}
    cnt_by_type = {}
    size_by_tag = {}
    cnt_by_tag = {}

    entries = [["Path", "Size", "Category", "Type", "Indexable?", "Tags"],]
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            entry = []
            fpath = os.path.join(root, filename)
            entry.append(fpath)
            stat = os.stat(fpath)
            filesize = stat.st_size
            total_size += filesize
            total_cnt += 1
            entry.append(str(filesize))
            (filetype, category) = filetype_utils.get_file_description_and_category(fpath)
            entry.append(category)
            add_to_dict(size_by_category, category, filesize)
            add_to_dict(cnt_by_category, category, 1)
            add_to_dict(size_by_type, filetype, filesize)
            add_to_dict(cnt_by_type, filetype, 1)
            entry.append(filetype)
            if filetype_utils.is_indexable_file(fpath):
                entry.append("Yes")
            else:
                entry.append("No")
            # Only the first tag is recorded in the per-file row and in the
            # per-tag aggregates.
            tags = filetype_utils.get_tags(fpath)
            if len(tags) > 0:
                entry.append(tags[0])
                add_to_dict(size_by_tag, tags[0], filesize)
                add_to_dict(cnt_by_tag, tags[0], 1)
            else:
                entry.append("None")
                add_to_dict(size_by_tag, "untagged", filesize)
                add_to_dict(cnt_by_tag, "untagged", 1)
            entries.append(entry)

    # Report and write the CSVs once, after the walk completes. (Previously
    # this ran inside the os.walk loop, rewriting both files and printing the
    # summary once per directory visited, and the "MB total" figure was
    # actually the per-file average — total_size/total_cnt/1e6 — which also
    # crashed on an empty tree.)
    print("Crawled %d files, for %3.2f MB total" % (total_cnt,
                                                    float(total_size) / 1000000.0))
    datafile = os.path.abspath("./file_data.csv")
    with open(datafile, "w") as fd:
        for entry in entries:
            fd.write(", ".join(entry) + "\n")
    print("Wrote data to file %s" % datafile)

    aggfile = os.path.abspath("./aggregate_data.csv")
    with open(aggfile, "w") as fa:
        fa.write("Group, Subgroup, Value\n")
        fa.write("Total, Count, %d\n" % total_cnt)
        fa.write("Total, Size, %d\n" % total_size)
        dict_to_csv(fa, "Cnt by Category", cnt_by_category)
        dict_to_csv(fa, "Size by Category", size_by_category)
        dict_to_csv(fa, "Cnt by Type", cnt_by_type)
        dict_to_csv(fa, "Size by Type", size_by_type)
        dict_to_csv(fa, "Cnt by Tag", cnt_by_tag)
        dict_to_csv(fa, "Size by Tag", size_by_tag)
    print("Wrote aggregates to file %s" % aggfile)