def run(self):
    """Classify every raw dump listed in features.csv and sort the files.

    Trains a 10-neighbour KNN classifier on the hand-scored training set,
    predicts a label (0=trash, 1=hash, 2=plain) for each dump, and copies
    the raw file into ``<data_dir>/organized/<label>/``.
    """
    dump_logger = getLogger('dumpscraper')

    # Let's invoke the getscore runner and tell him to work on training data
    dump_logger.info("Calculating dump score...")
    running = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
    running.run()

    # Feed the classifier with the training data. Parse the CSV once with
    # all the needed columns instead of reading the same file twice.
    training_file = self.settings['data_dir'] + "/" + "training/features.csv"
    dataset = scipy_genfromtxt(training_file, delimiter=",", skip_header=1,
                               usecols=(0, 1, 2, -2))
    training = dataset[:, 0:3]
    target = dataset[:, 3]

    clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
    clf.fit(training, target)

    folders = {0: 'trash', 1: 'hash', 2: 'plain'}
    counts = {'trash': 0, 'hash': 0, 'plain': 0}
    cleared = []

    # NOTE(review): 'rb' is the Python 2 csv idiom; on Python 3 this should
    # be text mode with newline='' — confirm the supported interpreter.
    with open(self.settings['data_dir'] + "/" + 'features.csv', 'rb') as csvfile:
        reader = csv_reader(csvfile)

        for line in reader:
            # Skip the header row
            if line[0] == 'Trash score':
                continue

            features = np_array(line[0:3])
            # predict() expects a 2-D array: one sample row, n features
            features = features.reshape(1, -1)
            # predict() returns an array; take the single label as an int
            label = int(clf.predict(features)[0])

            try:
                folder = folders[label]
            except KeyError:
                # Previously an unexpected label left `folder` unbound
                # (NameError or a copy into the wrong folder); skip instead.
                dump_logger.warning("Unexpected label " + str(label) +
                                    " for file " + line[-1] + ", skipping")
                continue

            counts[folder] += 1

            target_file = self.settings['data_dir'] + "/" + 'organized/' + folder + "/" + line[-1]
            target_dir = path.dirname(target_file)

            # If asked for a clean run, let's delete the entire folder before copying any file
            if self.parentArgs.clean and target_dir not in cleared and path.exists(target_dir):
                cleared.append(target_dir)
                shutil_rmtree(target_dir)

            if not path.exists(target_dir):
                makedirs(target_dir)

            shutil_copyfile(self.settings['data_dir'] + "/" + 'raw/' + line[-1], target_file)

    dump_logger.info("Trash files: " + str(counts['trash']))
    dump_logger.info("Hash files: " + str(counts['hash']))
    dump_logger.info("Plain files: " + str(counts['plain']))
    dump_logger.info("Operation completed")
def run(self):
    """Classify every raw dump listed in data/raw/features.csv and copy
    each file into data/organized/<trash|hash|plain>/ according to the
    label predicted by a 10-neighbour KNN classifier trained on the
    hand-scored training set.
    """
    # Let's invoke the getscore runner and tell him to work on training data
    print("Calculating dump score...")
    running = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
    running.run()

    # First of all let's feed the classifier with the training data
    training = scipy.genfromtxt("data/training/features.csv", delimiter=",", skip_header=1, usecols=(0, 1, 2))
    target = scipy.genfromtxt("data/training/features.csv", delimiter=",", skip_header=1, usecols=(-2))

    clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
    clf.fit(training, target)

    trash_count = hash_count = plain_count = 0

    with open('data/raw/features.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)

        for line in reader:
            # Skip the header row
            if line[0] == 'Trash score':
                continue

            # predict() expects a 2-D array (one sample row, n features),
            # not a bare 1-D list — wrap the features in a list.
            features = [line[0:3]]
            label = clf.predict(features)

            if label == 0:
                folder = 'trash'
                trash_count += 1
            elif label == 1:
                folder = 'hash'
                hash_count += 1
            elif label == 2:
                folder = 'plain'
                plain_count += 1

            # This is the destination *file* path, not a directory
            target_file = 'data/organized/' + folder + "/" + line[-1]

            if not os.path.exists(os.path.dirname(target_file)):
                os.makedirs(os.path.dirname(target_file))

            shutil.copyfile('data/raw/' + line[-1], target_file)

    print("Trash files: " + str(trash_count))
    print("Hash files: " + str(hash_count))
    print("Plain files: " + str(plain_count))
    print("Operation completed")
def run(self):
    """Application entry point.

    Validates the environment, dispatches to the runner matching the
    requested sub-command, and always persists the (possibly updated)
    settings back to settings.json.
    """
    self.banner()

    # Perform some sanity checks
    try:
        self.checkenv()
    except exceptions.InvalidSettings as error:
        print("")
        print(error)
        return

    # Map each CLI sub-command to its runner class
    runners = {
        'scrape': scrape.DumpScraperScrape,
        'scrapeold': scrapeold.DumpScraperScrapeold,
        'getscore': getscore.DumpScraperGetscore,
        'training': training.DumpScraperTraining,
        'classify': classify.DumpScraperClassify,
        'extract': extract.DumpScraperExtract,
    }

    try:
        runner_class = runners[self.args.command]
    except KeyError:
        print("Unrecognized command")
        return

    runner = runner_class(self.settings, self.args)

    # And away we go!
    try:
        runner.check()
        runner.run()
    # Ehm.. something wrong happened?
    except exceptions.RunningError as error:
        print("")
        print(error)
    # Always save the updated settings
    finally:
        with open(os.path.realpath("settings.json"), 'w+') as update_settings:
            json.dump(self.settings, update_settings, indent=4)
def _getscore(self):
    """Invoke the getscore runner against the training data set,
    forcing a greedy level of 1 on the shared parent arguments."""
    self.parentArgs.level = 1
    scorer = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
    scorer.run(training=True)
def run(self):
    """Application entry point.

    Validates the environment, logs the relevant CLI flags, dispatches to
    the runner for the requested sub-command, and always persists the
    (possibly updated) settings back to settings.json.
    """
    self.banner()
    self.check_updates()

    dump_logger = logging.getLogger('dumpscraper')

    # Perform some sanity checks
    try:
        self.checkenv()
    except exceptions.InvalidSettings as error:
        dump_logger.error(error)
        return

    # Let's output some info about the supplied flags
    if hasattr(self.args, 'level') and self.args.level > 0:
        dump_logger.debug('\tUsing a greedy level of ' + str(self.args.level))
    if hasattr(self.args, 'clean') and self.args.clean:
        dump_logger.debug("\tClean the target folder before attempting to write inside it")
    if hasattr(self.args, 'force') and self.args.force:
        dump_logger.debug("\tForcing the execution only on file " + str(self.args.force))

    # Every sub-command lives in lib.runner.<command> and exposes a class
    # named DumpScraper<Command>, so both names can be derived from the
    # command itself instead of a long duplicated if/elif chain.
    known_commands = ('scrape', 'scraperaw', 'scrapeold', 'getscore',
                      'training', 'classify', 'extract', 'review')

    if self.args.command not in known_commands:
        dump_logger.error("Unrecognized command " + self.args.command)
        return

    import importlib
    module = importlib.import_module('lib.runner.' + self.args.command)
    runner_class = getattr(module, 'DumpScraper' + self.args.command.capitalize())
    runner = runner_class(self.settings, self.args)

    # And away we go!
    try:
        runner.check()
        runner.run()
    # Ehm.. something wrong happened?
    except exceptions.RunningError as error:
        dump_logger.error(error)
    # Always save the updated settings
    finally:
        with open(os_path.realpath("settings.json"), 'w+') as update_settings:
            json.dump(self.settings, update_settings, indent=4)