"k11": 1, "k12": 1, "k13": 1, "k14": 1, "k15": 1, } test_data = pd.read_csv(open("data/test.csv", "r"), quotechar='"') sub_data = pd.read_csv(open("data/sampleSubmission.csv", "r"), quotechar='"') if not np.alltrue(test_data["id"] == sub_data["id"]): raise Exception("IDs do not match") yh = Yhat(username, apikey) variabless = sub_data.columns[1:] raw_tweets = test_data["tweet"].tolist() for variable in variables: model_version = best_model[variable] model_name = "TweetClassifier_%s" % (variable,) results_from_server = yh.raw_predict(model_name, model_version, raw_tweets) pred = results_from_server["prediction"]["scores"] sub_data[variable] = pred try: sub_data.to_csv(open(sub_file, "w"), index=False) except IOError: sys.stderr.write("IO error: could not write data to file")
print "Uploading to yhat" upload_status = yh.upload(model_name,tweet_clf) model_version = upload_status['version'] print "'%s':'%s' uploaded to yhat" % (model_name,model_version) # Sanity check uploaded classifier by comparing remote against local scores print "Preforming sanity check" print "Predicting local scores" local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores'] local_sanity = np.array(local_sanity) print "Getting scores from server" results_from_server = yh.raw_predict(model_name,model_version,sanity_raw) try: server_sanity = results_from_server['prediction']['scores'] except: print results_from_server sys.exit(3) server_sanity = np.array(server_sanity) # Because of float point scores compare difference of scores to some level # of tolerance rather than checking equality score_diff = np.abs(local_sanity - server_sanity) sanity_tolerance = 1e-3 sanity_status = np.alltrue(score_diff < sanity_tolerance) if not sanity_status:
# NOTE(review): this chunk repeats the upload-and-sanity-check sequence that
# appears earlier in the file — presumably once per trained model; confirm
# whether the duplication is intentional.
# Upload the fitted classifier to the yhat server and keep the version number
# the server assigned to this upload.
print "Uploading to yhat"
upload_status = yh.upload(model_name, tweet_clf)
model_version = upload_status['version']
print "'%s':'%s' uploaded to yhat" % (model_name, model_version)

# Sanity check uploaded classifier by comparing remote against local scores
print "Preforming sanity check"
print "Predicting local scores"
local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores']
local_sanity = np.array(local_sanity)

print "Getting scores from server"
results_from_server = yh.raw_predict(model_name, model_version, sanity_raw)
try:
    server_sanity = results_from_server['prediction']['scores']
except:
    # NOTE(review): bare except — any failure to index the response (and any
    # other exception) dumps the raw server reply and aborts with status 3.
    print results_from_server
    sys.exit(3)
server_sanity = np.array(server_sanity)

# Because of float point scores compare difference of scores to some level
# of tolerance rather than checking equality
score_diff = np.abs(local_sanity - server_sanity)
sanity_tolerance = 1e-3
sanity_status = np.alltrue(score_diff < sanity_tolerance)
# The failure branch's body lies beyond this chunk.
if not sanity_status:
's1':1, 's2':1, 's3':1, 's4':1, 's5':1, 'w1':1, 'w2':1, 'w3':1, 'w4':1, 'k1':1, 'k2':1, 'k3':1, 'k4':1, 'k5':1, 'k6':1, 'k7':1, 'k8':1, 'k9':1, 'k10':1, 'k11':1, 'k12':1, 'k13':1, 'k14':1, 'k15':1 } test_data = pd.read_csv(open('data/test.csv', 'r'), quotechar='"') sub_data = pd.read_csv(open('data/sampleSubmission.csv', 'r'), quotechar='"') if not np.alltrue(test_data['id'] == sub_data['id']): raise Exception("IDs do not match") yh = Yhat(username, apikey) variabless = sub_data.columns[1:] raw_tweets = test_data['tweet'].tolist() for variable in variables: model_version = best_model[variable] model_name = "TweetClassifier_%s" % (variable, ) results_from_server = yh.raw_predict(model_name, model_version, raw_tweets) pred = results_from_server['prediction']['scores'] sub_data[variable] = pred try: sub_data.to_csv(open(sub_file, 'w'), index=False) except IOError: sys.stderr.write("IO error: could not write data to file")
# <codecell> yh = Yhat("YOUR USERNAME", "YOUR API KEY") # <codecell> print yh.upload("NamedEntityFindr", clf) # <codecell> [model for model in yh.show_models()['models'] if model['name'] == "NamedEntityFindr"] # <codecell> results_from_server = yh.raw_predict("NamedEntityFindr", 1, data) results_from_server # <codecell> print 'sanity check.' print 'results all match => %s' \ % np.all(np.array(results['entities']) == np.array(results_from_server['prediction']['entities'])) # <markdowncell> # <h2>Final Thoughts</h2> # <ul> # <li><a href="http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html" title="Categorizing and Tagging Words - NLTK docs" target="_blank">Categorizing and Tagging Words with NLTK</a> (NLTK docs)</li> # <li><a href="http://pixelmonkey.org/pub/nlp-training/" title="Just Enough NLP with Python" target="_blank">Just Enough NLP with Python</a> (slides)</li> # <li><a href="http://cdn.preterhuman.net/texts/science_and_technology/artificial_intelligence/Foundations%20of%20Statistical%20Natural%20Language%20Processing%20-%20Christopher%20D.%20Manning.pdf" title="Foundations of Statistical Natural Language Processing by Christopher Manning & Hinrich Schiitze" target="_blank">Foundations of Statistical Natural Language Processing</a> by Christopher Manning & Hinrich Schiitze (PDF)</li>