def investigate(self, case_number):
    """Continuously scrape ads and save the ones flagged as trafficking.

    Each pass scrapes ``self.base_urls``, rebuilds the labelled training set
    from ``BackpageLogger``, trains the classifier(s) that will be used, and
    saves every scraped ad that either carries a phone number already marked
    as trafficking or is classified as "trafficking". The method then sleeps
    and starts the next pass; it never returns.

    Args:
        case_number: Passed through to ``self.save`` with every flagged ad.
    """
    # Loop rather than recurse: a tail call to self.investigate() adds one
    # stack frame per pass and would eventually raise RecursionError.
    while True:
        data = self.scrape(links=self.base_urls, scraping_ads=True)

        # Query each label once and reuse the rows for both the training set
        # and the known-number lookup.
        trafficking = BackpageLogger.query.filter_by(is_trafficking=True).all()
        not_trafficking = BackpageLogger.query.filter_by(is_trafficking=False).all()
        training_data = [(elem, "trafficking") for elem in trafficking]
        training_data += [(elem, "not trafficking") for elem in not_trafficking]
        trafficking_numbers = [elem.phone_number for elem in trafficking]

        # HACK: rule-of-thumb threshold for switching to naive Bayes;
        # consider getting advice / changing this.
        using_naive_bayes = len(training_data) > 50
        # Train only the classifier(s) this pass will actually consult.
        if using_naive_bayes:
            nb = algorithms.naive_bayes(training_data)
        else:
            cls = [
                algorithms.svm(training_data),
                algorithms.decision_tree(training_data),
            ]

        for datum in data:
            # Save each ad at most once per pass, however many signals fire.
            flagged = datum["phone_number"] in trafficking_numbers
            if not flagged:
                if using_naive_bayes:
                    flagged = nb.classify(datum["text_body"]) == "trafficking"
                else:
                    # Assumes preprocess() is pure, so one call can be shared
                    # by every classifier -- TODO confirm.
                    text = algorithms.preprocess(datum["text_body"])
                    flagged = any(cl.classify(text) == "trafficking" for cl in cls)
            if flagged:
                self.save([datum], case_number)

        time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
Пример #2
0
 def investigate(self, case_number):
     """Continuously scrape ads and save the ones flagged as trafficking.

     Each pass scrapes ``self.base_urls``, rebuilds the labelled training set
     from ``BackpageLogger``, trains the classifier(s) that will be used, and
     saves every scraped ad that either carries a phone number already marked
     as trafficking or is classified as "trafficking". The method then sleeps
     and starts the next pass; it never returns.

     Args:
         case_number: Passed through to ``self.save`` with every flagged ad.
     """
     # Loop rather than recurse: a tail call to self.investigate() adds one
     # stack frame per pass and would eventually raise RecursionError.
     while True:
         data = self.scrape(links=self.base_urls, scraping_ads=True)

         # Query each label once and reuse the rows for both the training set
         # and the known-number lookup.
         flagged_rows = BackpageLogger.query.filter_by(is_trafficking=True).all()
         clean_rows = BackpageLogger.query.filter_by(is_trafficking=False).all()
         training_data = [(elem, "trafficking") for elem in flagged_rows]
         training_data += [(elem, "not trafficking") for elem in clean_rows]
         trafficking_numbers = [elem.phone_number for elem in flagged_rows]

         # HACK: rule-of-thumb threshold for switching to naive Bayes;
         # consider getting advice / changing this.
         using_naive_bayes = len(training_data) > 50
         # Train only the classifier(s) this pass will actually consult.
         if using_naive_bayes:
             nb = algorithms.naive_bayes(training_data)
         else:
             cls = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]

         for datum in data:
             # Save each ad at most once per pass, however many signals fire.
             is_hit = datum["phone_number"] in trafficking_numbers
             if not is_hit:
                 if using_naive_bayes:
                     is_hit = nb.classify(datum["text_body"]) == 'trafficking'
                 else:
                     # Assumes preprocess() is pure, so one call can be shared
                     # by every classifier -- TODO confirm.
                     text = algorithms.preprocess(datum["text_body"])
                     is_hit = any(cl.classify(text) == "trafficking" for cl in cls)
             if is_hit:
                 self.save([datum], case_number)

         time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
def test_accuracy():
    """The classifier's own accuracy() must equal algorithms.accuracy() on the same data."""
    train_greetings = ["hello there are you doing okay?", "hi", "hey there, how are you?"]
    test_greetings = ["hello there", "hi", "hey, how are you?"]
    farewells = ["bye", "later", "adios"]

    training = [(text, "greeting") for text in train_greetings]
    training += [(text, "goodbye") for text in farewells]
    testing = [(text, "greeting") for text in test_greetings]
    testing += [(text, "goodbye") for text in farewells]

    nb = algorithms.naive_bayes(training)
    from_classifier = nb.accuracy(testing)
    from_module = algorithms.accuracy("naive_bayes", nb, testing)
    assert from_classifier == from_module
Пример #4
0
 def investigate(self):
     """Continuously scrape ads and save the ones classified as trafficking.

     Each pass scrapes ``self.base_urls``, rebuilds the labelled training set
     from ``BackpageLogger``, trains the classifier(s) that will be used, and
     passes every ad classified as "trafficking" to ``self.save_ads``. The
     method then sleeps and starts the next pass; it never returns.
     """
     # Loop rather than recurse: a tail call to self.investigate() adds one
     # stack frame per pass and would eventually raise RecursionError.
     while True:
         data = self.scrape(self.base_urls)

         training_data = [
             (elem, "trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
         ]
         # Extend (not overwrite) so both labels are present in the training set.
         training_data += [
             (elem, "not trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
         ]

         # HACK: rule-of-thumb threshold, evaluated once per pass.
         # NOTE(review): other versions of this routine use naive Bayes ABOVE
         # the threshold; this one uses it at or below. Kept as written here --
         # confirm which direction is intended.
         use_svm_and_tree = len(training_data) > 50
         # Train only the classifier(s) this pass will actually consult.
         if use_svm_and_tree:
             cls = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]
         else:
             nb = algorithms.naive_bayes(training_data)

         for datum in data:
             if use_svm_and_tree:
                 # Assumes preprocess() is pure, so one call can be shared by
                 # every classifier -- TODO confirm.
                 text = algorithms.preprocess(datum["text_body"])
                 # any() keeps an ad from being saved once per classifier.
                 flagged = any(cl.classify(text) == "trafficking" for cl in cls)
             else:
                 flagged = nb.classify(datum["text_body"]) == 'trafficking'
             if flagged:
                 self.save_ads([datum])

         time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
Пример #5
0
def test_naive_bayes():
    """A naive Bayes classifier built from two labelled samples tags a greeting phrase."""
    labelled_samples = [("hello there", "greeting"), ("later", "goodbye")]
    classifier = algorithms.naive_bayes(labelled_samples)
    predicted_label = classifier.classify("Hello there friends")
    assert predicted_label == "greeting"
Пример #6
0
def test_accuracy():
    """Check that nb.accuracy() and algorithms.accuracy() report the same value."""
    goodbye_samples = [
        ("bye", "goodbye"),
        ("later", "goodbye"),
        ("adios", "goodbye"),
    ]
    training = [
        ("hello there are you doing okay?", "greeting"),
        ("hi", "greeting"),
        ("hey there, how are you?", "greeting"),
    ] + goodbye_samples
    testing = [
        ("hello there", "greeting"),
        ("hi", "greeting"),
        ("hey, how are you?", "greeting"),
    ] + goodbye_samples

    model = algorithms.naive_bayes(training)
    method_score = model.accuracy(testing)
    helper_score = algorithms.accuracy("naive_bayes", model, testing)
    assert method_score == helper_score
def test_naive_bayes():
    """Train naive Bayes on one greeting and one goodbye, then classify a greeting."""
    examples = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    model = algorithms.naive_bayes(examples)
    phrase = "Hello there friends"
    assert model.classify(phrase) == "greeting"