def test(sc, file_positive, files_negative, file_model):
    """
    Tests a classification model using positive samples in file_positive
    and negative samples in files_negative.

    A tweet counts as predicted-positive when the model's prediction is
    strictly greater than the threshold (0.0). Totals, false positives,
    false negatives, precision and recall are printed to standard output.
    Any error raised while loading the model or computing the predictions
    is caught and printed rather than propagated.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with tweets to predict
    :type file_positive: str
    :param files_negative: The files with tweets to reject
    :type files_negative: list[str]
    :param file_model: The file where the model is located
    :type file_model: str
    """
    def load_tweets(path):
        # Same pipeline for every input file: parse each line, then keep
        # only the tweets accepted by is_valid and is_english.
        return sc.textFile(path).map(parse_json).filter(
            lambda x: is_valid(x) and is_english(x))

    tweets_positive = load_tweets(file_positive).cache()

    negatives = [load_tweets(file_negative) for file_negative in files_negative]
    # An empty list of negative files is treated as "no negative samples"
    # instead of failing with an IndexError before any work is done.
    tweets_negative = sc.union(negatives) if negatives else sc.emptyRDD()

    try:
        print("Reading stored classification model")
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only load model files that come from a trusted source.
        with open(file_model, 'rb') as model_file:
            model = pickle.load(model_file)

        print("Computing predictions")
        threshold = 0.0
        total_positive = tweets_positive.count()
        total_negative = tweets_negative.count()
        true_positives = tweets_positive.filter(
            lambda x: model.predict(parse(x)) > threshold).count()
        true_negatives = tweets_negative.filter(
            lambda x: model.predict(parse(x)) <= threshold).count()
        false_negatives = total_positive - true_positives
        false_positives = total_negative - true_negatives

        print("Results for %s:" % file_model)
        print(" Total positives: %d" % total_positive)
        print(" Total negatives: %d" % total_negative)
        print(" False positives: %d" % false_positives)
        print(" False negatives: %d" % false_negatives)

        # Report 0.0 when a denominator is zero (nothing predicted positive,
        # or no positive samples at all) instead of relying on a swallowed
        # ZeroDivisionError.
        predicted_positive = true_positives + false_positives
        actual_positive = true_positives + false_negatives
        precision = 0.0
        recall = 0.0
        if predicted_positive:
            precision = float(true_positives) / float(predicted_positive)
        if actual_positive:
            recall = float(true_positives) / float(actual_positive)

        print(" Precision: %f" % precision)
        print(" Recall: %f" % recall)
        print("Done!")
    except Exception as e:
        # Top-level boundary of this command: report the failure and return.
        print("Error:")
        print(e)
def fast_predict(sc, file_input, file_output, sports_model, politics_model, technology_model):
    """
    Predicts topic labels for the tweets in file_input using the provided
    models and saves the unambiguous predictions to file_output.

    Each tweet is scored by the three models; a label is assigned when the
    corresponding model's prediction is strictly greater than 0.0. Only
    tweets that receive exactly one label are written out. Any error raised
    while loading the models or computing the predictions is caught and
    printed rather than propagated.

    NOTE(review): this file defines fast_predict twice with the same
    signature; the later definition is the one bound at import time.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_input: The file with tweets to label
    :type file_input: str
    :param file_output: The path passed to saveAsTextFile for the results
    :type file_output: str
    :param sports_model: The file where the sports model is located
    :type sports_model: str
    :param politics_model: The file where the politics model is located
    :type politics_model: str
    :param technology_model: The file where the technology model is located
    :type technology_model: str
    """
    tweets = sc.textFile(file_input).map(parse_json).filter(
        lambda x: is_valid(x) and is_english(x))

    def load_model(path):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only load model files that come from a trusted source.
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    try:
        print("Reading stored classification model")
        sports = load_model(sports_model)
        politics = load_model(politics_model)
        technology = load_model(technology_model)

        threshold = 0.0

        def predict_labels(tweet):
            # Score the tweet once per model and collect every topic whose
            # score clears the threshold.
            x = parse(tweet)
            labels = []
            if sports.predict(x) > threshold:
                labels.append("sports")
            if politics.predict(x) > threshold:
                labels.append("politics")
            # Compare against the threshold like the other two models: a
            # bare truthiness test would also accept negative scores.
            if technology.predict(x) > threshold:
                labels.append("technology")
            return labels

        print("Computing predictions")
        predictions = tweets.map(lambda t: (t, predict_labels(t)))
        # Keep only tweets with exactly one label; unlabeled and
        # multi-label tweets are dropped.
        filtered_predictions = predictions.filter(lambda t: len(t[1]) == 1)
        filtered_predictions.map(prediction_string).saveAsTextFile(file_output)
        print("Done!")
    except Exception as e:
        # Top-level boundary of this command: report the failure and return.
        print("Error:")
        print(e)
def fast_predict(sc, file_input, file_output, sports_model, politics_model, technology_model):
    """
    Predicts topic labels for the tweets in file_input using the provided
    models and saves the unambiguous predictions to file_output.

    Each tweet is scored by the three models; a label is assigned when the
    corresponding model's prediction is strictly greater than 0.0. Only
    tweets that receive exactly one label are written out. Any error raised
    while loading the models or computing the predictions is caught and
    printed rather than propagated.

    NOTE(review): this file defines fast_predict twice with the same
    signature; this later definition is the one bound at import time.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_input: The file with tweets to label
    :type file_input: str
    :param file_output: The path passed to saveAsTextFile for the results
    :type file_output: str
    :param sports_model: The file where the sports model is located
    :type sports_model: str
    :param politics_model: The file where the politics model is located
    :type politics_model: str
    :param technology_model: The file where the technology model is located
    :type technology_model: str
    """
    tweets = sc.textFile(file_input).map(parse_json).filter(
        lambda x: is_valid(x) and is_english(x))

    def load_model(path):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only load model files that come from a trusted source.
        with open(path, 'rb') as model_file:
            return pickle.load(model_file)

    try:
        print("Reading stored classification model")
        sports = load_model(sports_model)
        politics = load_model(politics_model)
        technology = load_model(technology_model)

        threshold = 0.0

        def predict_labels(tweet):
            # Score the tweet once per model and collect every topic whose
            # score clears the threshold.
            x = parse(tweet)
            labels = []
            if sports.predict(x) > threshold:
                labels.append("sports")
            if politics.predict(x) > threshold:
                labels.append("politics")
            # Compare against the threshold like the other two models: a
            # bare truthiness test would also accept negative scores.
            if technology.predict(x) > threshold:
                labels.append("technology")
            return labels

        print("Computing predictions")
        predictions = tweets.map(lambda t: (t, predict_labels(t)))
        # Keep only tweets with exactly one label; unlabeled and
        # multi-label tweets are dropped.
        filtered_predictions = predictions.filter(lambda t: len(t[1]) == 1)
        filtered_predictions.map(prediction_string).saveAsTextFile(file_output)
        print("Done!")
    except Exception as e:
        # Top-level boundary of this command: report the failure and return.
        print("Error:")
        print(e)