def main():
    """Entry point: scrape the index page for new raw files and download them into CityScrapeDB."""
    configure_log()
    logger = logging.getLogger(__name__)

    logger.info('Executing initial page scrape to look for new files...')
    soup = get_soup()

    logger.info('Fetching files now!')
    get_files(soup)

    logger.info('CityScrape download complete!')
def run_naive_bayes(cls, train, test, binarizer, labels, alpha):
    """Fit a one-vs-rest multinomial Naive Bayes classifier and score it.

    :param train: ``(data, labels)`` tuple for the training split.
    :param test: ``(data, labels)`` tuple for the test split.
    :param binarizer: label binarizer, passed through to ``cls.predict``.
    :param labels: binarized label matrix for the full data set (rows expose ``.nonzero()``).
    :param alpha: additive (Laplace/Lidstone) smoothing parameter for MultinomialNB.
    :return: sample-averaged precision on the test split.
    """
    # Renamed from `logging`: the original shadowed the stdlib logging module.
    logger = configure_log(__file__)
    logger.info("alpha = %s", alpha)  # lazy %-args instead of eager string formatting

    logger.info("Fitting Naive Bayes...")
    train_data, train_labels = train
    test_data, test_labels = test
    classifier = OneVsRestClassifier(MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None))
    with warnings.catch_warnings():
        # FIXME: split the data set in a way that the train set has every label
        warnings.simplefilter("ignore")
        classifier.fit(train_data, train_labels)

    # BUG FIX: the original built this with `[map(possible_labels.add, row) for row in ...]`,
    # which is a no-op on Python 3 because map() is lazy — possible_labels stayed empty.
    possible_labels = {idx for label in labels for idx in label.nonzero()[0]}

    logger.info("Predicting test set...")
    test_predictions = cls.predict(
        classifier=classifier,
        data=test_data,
        labels=test_labels,
        possible_labels=possible_labels,
        binarizer=binarizer,
    )
    return precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")
def run_svm(cls, train, test, C, binarizer, labels, intercept):
    """Fit a one-vs-rest linear SVM and score it on the test split.

    :param train: ``(data, labels)`` tuple for the training split.
    :param test: ``(data, labels)`` tuple for the test split.
    :param C: regularization strength (C -> inf approaches a hard margin).
    :param binarizer: label binarizer, passed through to ``cls.predict``.
    :param labels: binarized label matrix for the full data set (rows expose ``.nonzero()``).
    :param intercept: ``intercept_scaling`` value forwarded to LinearSVC.
    :return: sample-averaged precision on the test split.
    """
    # Renamed from `logging`: the original shadowed the stdlib logging module.
    logger = configure_log(__file__)
    logger.info("C = %s", C)  # lazy %-args instead of eager string formatting

    logger.info("Fitting Linear SVM...")
    train_data, train_labels = train
    test_data, test_labels = test
    # liblinear guidance: prefer the primal formulation when n_samples > n_features.
    dual = train_data.shape[0] <= train_data.shape[1]
    classifier = OneVsRestClassifier(
        LinearSVC(dual=dual, class_weight=None, C=C, intercept_scaling=intercept)
    )
    with warnings.catch_warnings():
        # FIXME: split the data set in a way that the train set has every label
        warnings.simplefilter("ignore")
        classifier.fit(train_data, train_labels)

    # BUG FIX: the original built this with `[map(possible_labels.add, row) for row in ...]`,
    # which is a no-op on Python 3 because map() is lazy — possible_labels stayed empty.
    # The equally-broken `seen_labels` set was never used afterwards, so it is removed.
    possible_labels = {idx for label in labels for idx in label.nonzero()[0]}

    logger.info("Predicting test set...")
    test_predictions = cls.predict(
        classifier=classifier,
        data=test_data,
        labels=test_labels,
        possible_labels=possible_labels,
        binarizer=binarizer,
    )
    return precision_score(y_true=test_labels, y_pred=test_predictions, average="samples")