def investigate(self, case_number):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each cycle: scrape all base URLs, rebuild the training set from the
    labeled BackpageLogger rows, train classifiers, and save any ad whose
    phone number is already linked to trafficking or that a classifier
    flags.  Runs forever, sleeping ~12 minutes between cycles.

    :param case_number: identifier passed through to ``self.save`` so saved
        ads are associated with the investigation.

    Fixes vs. the original:
    - ``while True`` replaces tail self-recursion, which would eventually
      raise ``RecursionError`` (CPython has no tail-call elimination).
    - Each datum is saved at most once per cycle (the original could save
      the same datum once per matching classifier plus once for a phone
      match).
    - The positive-class query runs once per cycle instead of twice.
    """
    while True:
        data = self.scrape(links=self.base_urls, scraping_ads=True)
        # One query per class; positives are reused for the phone-number set.
        positives = BackpageLogger.query.filter_by(is_trafficking=True).all()
        negatives = BackpageLogger.query.filter_by(is_trafficking=False).all()
        training_data = [(elem, "trafficking") for elem in positives]
        training_data += [(elem, "not trafficking") for elem in negatives]
        # Set gives O(1) membership tests in the per-datum loop below.
        trafficking_numbers = {elem.phone_number for elem in positives}
        classifiers = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        # HACK: rule-of-thumb corpus-size threshold -- consider getting
        # advice / changing this??
        using_naive_bayes = len(training_data) > 50
        if using_naive_bayes:
            nb = algorithms.naive_bayes(training_data)
        for datum in data:
            if datum["phone_number"] in trafficking_numbers:
                self.save([datum], case_number)
                continue  # already saved; skip classifiers to avoid duplicates
            if using_naive_bayes:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save([datum], case_number)
            else:
                # Save once if ANY classifier flags the ad (the original
                # saved once per matching classifier).
                features = algorithms.preprocess(datum["text_body"])
                if any(cl.classify(features) == "trafficking"
                       for cl in classifiers):
                    self.save([datum], case_number)
        time.sleep(700)  # wait ~ 12 minutes (consider changing this)
def investigate(self, case_number):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each cycle: scrape all base URLs, rebuild the training set from the
    labeled BackpageLogger rows, train classifiers, and save any ad whose
    phone number is already linked to trafficking or that a classifier
    flags.  Runs forever, sleeping ~12 minutes between cycles.

    :param case_number: identifier passed through to ``self.save`` so saved
        ads are associated with the investigation.

    Fixes vs. the original:
    - ``while True`` replaces tail self-recursion, which would eventually
      raise ``RecursionError`` (CPython has no tail-call elimination).
    - Each datum is saved at most once per cycle (the original could save
      the same datum once per matching classifier plus once for a phone
      match).
    - The positive-class query runs once per cycle instead of twice.
    """
    while True:
        data = self.scrape(links=self.base_urls, scraping_ads=True)
        # One query per class; positives are reused for the phone-number set.
        positives = BackpageLogger.query.filter_by(is_trafficking=True).all()
        negatives = BackpageLogger.query.filter_by(is_trafficking=False).all()
        training_data = [(elem, "trafficking") for elem in positives]
        training_data += [(elem, "not trafficking") for elem in negatives]
        # Set gives O(1) membership tests in the per-datum loop below.
        trafficking_numbers = {elem.phone_number for elem in positives}
        classifiers = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        # HACK: rule-of-thumb corpus-size threshold -- consider getting
        # advice / changing this??
        using_naive_bayes = len(training_data) > 50
        if using_naive_bayes:
            nb = algorithms.naive_bayes(training_data)
        for datum in data:
            if datum["phone_number"] in trafficking_numbers:
                self.save([datum], case_number)
                continue  # already saved; skip classifiers to avoid duplicates
            if using_naive_bayes:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save([datum], case_number)
            else:
                # Save once if ANY classifier flags the ad (the original
                # saved once per matching classifier).
                features = algorithms.preprocess(datum["text_body"])
                if any(cl.classify(features) == "trafficking"
                       for cl in classifiers):
                    self.save([datum], case_number)
        time.sleep(700)  # wait ~ 12 minutes (consider changing this)
def investigate(self):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each cycle: scrape the base URLs, rebuild the training set from labeled
    BackpageLogger rows, train classifiers, and save flagged ads.  Runs
    forever, sleeping ~12 minutes between cycles.

    Fixes vs. the original:
    - The negative-class list is appended (``+=``) instead of reassigned
      (``=``), which previously discarded every positive example and left
      the classifiers trained on a single class.
    - The undefined name ``train`` is replaced by ``training_data``
      (the original raised ``NameError`` at runtime).
    - ``while True`` replaces tail self-recursion, which would eventually
      raise ``RecursionError``.
    - Each datum is saved at most once per cycle (the original could save
      it once per matching classifier).
    """
    while True:
        data = self.scrape(self.base_urls)
        training_data = [(elem, "trafficking") for elem in
                         BackpageLogger.query.filter_by(is_trafficking=True).all()]
        # BUG FIX: was `training_data = ...`, overwriting the positives.
        training_data += [(elem, "not trafficking") for elem in
                          BackpageLogger.query.filter_by(is_trafficking=False).all()]
        classifiers = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        nb = algorithms.naive_bayes(training_data)
        # HACK: rule-of-thumb corpus-size threshold; hoisted out of the
        # per-datum loop since it is loop-invariant.
        use_ensemble = len(training_data) > 50
        for datum in data:
            if use_ensemble:
                # Save once if ANY classifier flags the ad (the original
                # saved once per matching classifier).
                features = algorithms.preprocess(datum["text_body"])
                if any(cl.classify(features) == "trafficking"
                       for cl in classifiers):
                    self.save_ads([datum])
            else:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save_ads([datum])
        time.sleep(700)  # wait ~ 12 minutes (consider changing this)
def test_decision_tree():
    """A decision tree trained on two labeled phrases classifies a greeting."""
    labeled_samples = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    classifier = algorithms.decision_tree(labeled_samples)
    features = algorithms.preprocess("hello there friends")
    assert classifier.classify(features) == "greeting"
def test_decision_tree():
    """Smoke test: a tiny two-example decision tree recovers the 'greeting' label."""
    corpus = [("hello there", "greeting"), ("later", "goodbye")]
    tree = algorithms.decision_tree(corpus)
    prediction = tree.classify(algorithms.preprocess("hello there friends"))
    assert prediction == "greeting"