def get_categories(self, log):
    """Collect file type, category and tags for every path in ``log["path"]``.

    Each path is first passed through ``p2f`` and the result is handed to
    ``filetype_utils`` to look up its (description, category) pair and its
    tags.

    Returns a 3-tuple ``(filetypes, categories, tag_lists)`` of parallel
    lists holding one entry per path, in the order the paths were given.
    """
    filetypes, categories, tag_lists = [], [], []
    for raw_path in log["path"]:
        fpath = p2f(raw_path)
        description, category = \
            filetype_utils.get_file_description_and_category(fpath)
        filetypes.append(description)
        categories.append(category)
        tag_lists.append(filetype_utils.get_tags(fpath))
    return (filetypes, categories, tag_lists)
Exemplo n.º 2
0
def crawl_files(directory_to_crawl):
    """Walk a directory tree and write per-file and aggregate CSV reports.

    ``directory_to_crawl`` is user-expanded and made absolute; it must be an
    existing directory (checked with an assertion).  Every file found by
    ``os.walk`` is stat'ed and classified through ``filetype_utils``.

    Two files are written to the current working directory:

    * ``file_data.csv`` -- one row per file: path, size, category, type,
      whether it is indexable, and its first tag (``"None"`` if untagged).
    * ``aggregate_data.csv`` -- total count/size plus count and size grouped
      by category, by type and by tag (untagged files are grouped under
      ``"untagged"``).

    Rows are joined with ``", "`` and fields are not quoted, so a path that
    contains a comma yields a malformed row.
    """
    path = os.path.abspath(os.path.expanduser(directory_to_crawl))
    assert os.path.isdir(path), "%s is not a directory" % path
    # Single-argument parenthesised print behaves identically on Python 2
    # and Python 3.
    print("Crawling directory %s" % path)
    total_size = 0
    total_cnt = 0
    size_by_category = {}
    cnt_by_category = {}
    size_by_type = {}
    cnt_by_type = {}
    size_by_tag = {}
    cnt_by_tag = {}

    entries = [["Path", "Size", "Category", "Type", "Indexable?", "Tags"]]
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            fpath = os.path.join(root, filename)
            filesize = os.stat(fpath).st_size
            total_size += filesize
            total_cnt += 1

            (filetype, category) = \
                filetype_utils.get_file_description_and_category(fpath)
            add_to_dict(size_by_category, category, filesize)
            add_to_dict(cnt_by_category, category, 1)
            add_to_dict(size_by_type, filetype, filesize)
            add_to_dict(cnt_by_type, filetype, 1)

            indexable = "Yes" if filetype_utils.is_indexable_file(fpath) \
                else "No"

            # Only the first tag is reported; files without tags show up as
            # "None" in the per-file rows and "untagged" in the aggregates.
            tags = filetype_utils.get_tags(fpath)
            if tags:
                tag_label = tag_key = tags[0]
            else:
                tag_label, tag_key = "None", "untagged"
            add_to_dict(size_by_tag, tag_key, filesize)
            add_to_dict(cnt_by_tag, tag_key, 1)

            entries.append([fpath, str(filesize), category, filetype,
                            indexable, tag_label])

    # Reporting happens once, after the whole tree has been walked, rather
    # than once per visited directory.  The summary prints the total size in
    # MB; dividing by the file count (as before) reported the average and
    # raised ZeroDivisionError when no files had been seen yet.
    print("Crawled %d files, for %3.2f MB total"
          % (total_cnt, total_size / 1000000.0))

    datafile = os.path.abspath("./file_data.csv")
    with open(datafile, "w") as fd:
        for entry in entries:
            fd.write(", ".join(entry) + "\n")
    print("Wrote data to file %s" % datafile)

    aggfile = os.path.abspath("./aggregate_data.csv")
    with open(aggfile, "w") as fa:
        fa.write("Group, Subgroup, Value\n")
        fa.write("Total, Count, %d\n" % total_cnt)
        fa.write("Total, Size, %d\n" % total_size)
        dict_to_csv(fa, "Cnt by Category", cnt_by_category)
        dict_to_csv(fa, "Size by Category", size_by_category)
        dict_to_csv(fa, "Cnt by Type", cnt_by_type)
        dict_to_csv(fa, "Size by Type", size_by_type)
        dict_to_csv(fa, "Cnt by Tag", cnt_by_tag)
        dict_to_csv(fa, "Size by Tag", size_by_tag)
    print("Wrote aggregates to file %s" % aggfile)