def indexable_file(self, path):
    """Tell whether ``path`` is eligible for indexing.

    Data is only sent for files where this returns True (e.g. files
    that contain text content).

    When ``self.only_index`` is non-empty, a path qualifies only if it
    ends with one of the listed suffixes.  Otherwise the decision is
    delegated to ``filetype_utils.is_indexable_file``.
    """
    suffixes = self.only_index
    if not suffixes:
        # No explicit restriction configured: use the generic check.
        return filetype_utils.is_indexable_file(path)
    return any(path.endswith(suffix) for suffix in suffixes)
def _tc(self, path, filetype, category, indexable):
    """Run a single testcase against ``path``.

    Checks, in order, that ``filetype_utils`` reports the expected
    ``filetype`` and ``category`` for the path, that
    ``self._check_category`` accepts the reported pair, and that the
    path's indexability matches ``indexable``.
    """
    described = filetype_utils.get_file_description_and_category(path)
    (actual_type, actual_category) = described

    type_msg = "Expecting filetype %s for %s, got %s" % (
        filetype, path, actual_type)
    self.assertEqual(actual_type, filetype, type_msg)

    category_msg = "Expecting category %s for %s, got %s" % (
        category, path, actual_category)
    self.assertEqual(actual_category, category, category_msg)

    self._check_category(path, actual_type, actual_category)

    actual_indexable = filetype_utils.is_indexable_file(path)
    # An empty prefix reads "indexable"; "not " reads "not indexable".
    if actual_indexable:
        got_prefix = ""
    else:
        got_prefix = "not "
    if indexable:
        want_prefix = ""
    else:
        want_prefix = "not "
    indexable_msg = "Path %s is %sindexable, expecting %sindexable" % (
        path, got_prefix, want_prefix)
    self.assertEqual(actual_indexable, indexable, indexable_msg)
def include(record):
    """Decide whether ``record`` should be included.

    A record that carries a "token" key is always included.  Any other
    record is included only when ``filetype_utils.is_indexable_file``
    accepts the result of ``p2f(record["path"])``.

    Raises KeyError if the record has neither a "token" nor a "path" key.
    """
    # ``in`` replaces the deprecated dict.has_key(), which no longer
    # exists on Python 3 dictionaries; for dicts the result is the same.
    if "token" in record:
        return True
    return filetype_utils.is_indexable_file(p2f(record["path"]))
def indexable_file(self, path):
    """Tell whether ``path`` is eligible for indexing.

    Data is only sent for files where this returns True (e.g. files
    that contain text content).  The decision is delegated entirely to
    ``filetype_utils.is_indexable_file``.
    """
    can_index = filetype_utils.is_indexable_file(path)
    return can_index
def crawl_files(directory_to_crawl):
    """Crawl a directory tree and write per-file and aggregate CSV reports.

    ``directory_to_crawl`` is expanded (``~``) and made absolute, and must
    be an existing directory.  Every file under it is stat'ed and
    classified through ``filetype_utils`` (type, category, indexability,
    tags).  Two files are then written into the current working directory:

    * ``file_data.csv`` -- one row per file: path, size, category, type,
      indexable flag and first tag (or "None").
    * ``aggregate_data.csv`` -- total count and size, followed by the
      count/size breakdowns by category, type and tag.

    Progress and a summary are printed to stdout.  Nothing is returned.
    """
    path = os.path.abspath(os.path.expanduser(directory_to_crawl))
    assert os.path.isdir(path), "%s is not a directory" % path
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Crawling directory %s" % path)

    total_size = 0
    total_cnt = 0
    size_by_category = {}
    cnt_by_category = {}
    size_by_type = {}
    cnt_by_type = {}
    size_by_tag = {}
    cnt_by_tag = {}
    # First row is the CSV header.
    entries = [["Path", "Size", "Category", "Type", "Indexable?", "Tags"]]

    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            fpath = os.path.join(root, filename)
            filesize = os.stat(fpath).st_size
            total_size += filesize
            total_cnt += 1

            (filetype, category) = \
                filetype_utils.get_file_description_and_category(fpath)
            # add_to_dict presumably accumulates the value under the key
            # -- NOTE(review): helper is defined outside this block.
            add_to_dict(size_by_category, category, filesize)
            add_to_dict(cnt_by_category, category, 1)
            add_to_dict(size_by_type, filetype, filesize)
            add_to_dict(cnt_by_type, filetype, 1)

            if filetype_utils.is_indexable_file(fpath):
                indexable = "Yes"
            else:
                indexable = "No"

            # Only the first tag is reported; files without tags show
            # "None" in the per-file row but are aggregated as "untagged".
            tags = filetype_utils.get_tags(fpath)
            if tags:
                tag_label = tags[0]
                tag_key = tags[0]
            else:
                tag_label = "None"
                tag_key = "untagged"
            add_to_dict(size_by_tag, tag_key, filesize)
            add_to_dict(cnt_by_tag, tag_key, 1)

            entries.append([fpath, str(filesize), category, filetype,
                            indexable, tag_label])

    # Report the real total in MB.  Dividing by the file count (as the
    # previous code did) printed the per-file average under a "total"
    # label and raised ZeroDivisionError for a directory with no files.
    print("Crawled %d files, for %3.2f MB total" %
          (total_cnt, total_size / 1000000.0))

    datafile = os.path.abspath("./file_data.csv")
    with open(datafile, "w") as fd:
        # NOTE(review): fields are joined verbatim, so a path containing
        # a comma yields a malformed row.
        for entry in entries:
            fd.write(", ".join(entry) + "\n")
    print("Wrote data to file %s" % datafile)

    aggfile = os.path.abspath("./aggregate_data.csv")
    with open(aggfile, "w") as fa:
        fa.write("Group, Subgroup, Value\n")
        fa.write("Total, Count, %d\n" % total_cnt)
        fa.write("Total, Size, %d\n" % total_size)
        # dict_to_csv presumably emits one row per key under the given
        # group name -- NOTE(review): helper is defined outside this block.
        dict_to_csv(fa, "Cnt by Category", cnt_by_category)
        dict_to_csv(fa, "Size by Category", size_by_category)
        dict_to_csv(fa, "Cnt by Type", cnt_by_type)
        dict_to_csv(fa, "Size by Type", size_by_type)
        dict_to_csv(fa, "Cnt by Tag", cnt_by_tag)
        dict_to_csv(fa, "Size by Tag", size_by_tag)
    print("Wrote aggregates to file %s" % aggfile)