def sanity_check():
    """
    Run python softmaxreg.py.
    """
    random.seed(314159)
    np.random.seed(265)

    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    _, wordVectors0, _ = load_saved_params()
    wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])
    dimVectors = wordVectors.shape[1]

    dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
    dummy_features = np.zeros((10, dimVectors))
    dummy_labels = np.zeros((10,), dtype=np.int32)
    for i in range(10):
        words, dummy_labels[i] = dataset.getRandomTrainSentence()
        dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)

    print("==== Gradient check for softmax regression ====")
    gradcheck_naive(
        lambda weights: softmaxRegression(dummy_features, dummy_labels,
                                          weights, 1.0, nopredictions=True),
        dummy_weights)

    print("\n=== Results ===")
    print(softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0))
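# The gradient check above relies on softmaxRegression, whose definition is
# not part of this excerpt. A minimal sketch of what such a function might
# look like (hypothetical name and signature, assuming a cross-entropy cost
# with L2 regularization on the weights), not the original implementation:
import numpy as np

def softmaxRegression_sketch(features, labels, weights, regularization=0.0,
                             nopredictions=False):
    scores = features.dot(weights)                   # (N, C) class scores
    scores -= scores.max(axis=1, keepdims=True)      # numerical stability
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    N = features.shape[0]
    cost = -np.log(probs[np.arange(N), labels]).mean()
    cost += 0.5 * regularization * np.sum(weights ** 2)
    dscores = probs.copy()
    dscores[np.arange(N), labels] -= 1.0             # dCost / dScores
    grad = features.T.dot(dscores) / N + regularization * weights
    if nopredictions:
        return cost, grad
    pred = np.argmax(probs, axis=1)
    return cost, grad, pred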
def __test_getSentenceFeatures():
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    numWords = len(tokens)

    _, wordVectors, _ = load_saved_params("../saved_params")
    # Concatenate the input/output word embedding vectors to form the final
    # word representation
    wordVectors = np.concatenate(
        (wordVectors[:numWords, :], wordVectors[numWords:, :]), axis=1)

    sentence = "make a splash".split()
    sent_feature = getSentenceFeatures(tokens, wordVectors, sentence)
    print(sent_feature)
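# getSentenceFeatures itself is defined elsewhere; a minimal sketch of the
# usual approach (hypothetical, assuming the sentence feature is simply the
# average of the vectors of its in-vocabulary words):
import numpy as np

def getSentenceFeatures_sketch(tokens, wordVectors, sentence):
    sentVector = np.zeros((wordVectors.shape[1],))
    count = 0
    for word in sentence:
        if word in tokens:                     # tokens maps word -> row index
            sentVector += wordVectors[tokens[word], :]
            count += 1
    if count > 0:
        sentVector /= count                    # average the word vectors
    return sentVector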
causeprior = loadObj(config.get(section, "cause_prior"))
N = len(causeprior)
cause_offset, effect_offset = 0, N

while True:
    dim = input(
        'Please enter the dimension of word vectors you want to try, e.g. 200,\n'
        'Enter `q` to quit. Please enter your dimension: ')
    if dim == 'q' or dim == 'quit':
        sys.exit(0)
    params_path = datasets_dir + '/dim=' + dim
    if not os.path.exists(params_path):
        print('That dimension does not exist!')
    else:
        # Load the saved parameters for the chosen dimension
        _, wordVectors, _, _ = load_saved_params(params_path)
        causeVectors, effectVectors = wordVectors[:N, :], wordVectors[N:, :]
        break

while True:
    word = input(
        'Please enter a word you want to query, ending with "_c" or "_e": ')
    if word == 'q' or word == 'quit':
        sys.exit(0)
    if word not in wordlist:
        print('NOT FOUND!')
        continue
    word_type = word.split('_')[1]
    if word_type == 'c':
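# A hedged sketch of how such a query could proceed once the word's vector is
# selected: rank candidate rows by cosine similarity (hypothetical helper and
# name, not part of the original code):
import numpy as np

def nearest_neighbors_sketch(query_vec, vectors, id2word, k=10):
    sims = vectors.dot(query_vec) / (
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec) + 1e-12)
    top = np.argsort(-sims)[:k]        # indices of the k most similar rows
    return [(id2word[i], float(sims[i])) for i in top]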
from softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
REGULARIZATION = None   # Assign a list of floats in the block below
### YOUR CODE HERE
REGULARIZATION = [0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01]
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain,), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
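# The snippet above stops at feature preparation; the actual fitting is not
# shown here. A rough sketch of how weights might be fitted for one
# regularization value (hypothetical helper using plain gradient descent, not
# the original training loop; assumes softmaxRegression with
# nopredictions=True returns (cost, gradient), as in the gradient check):
import numpy as np

def train_for_reg_sketch(trainFeatures, trainLabels, dimVectors, reg,
                         n_classes=5, lr=0.3, iterations=1000):
    weights = 0.01 * np.random.randn(dimVectors, n_classes)
    for _ in range(iterations):
        cost, grad = softmaxRegression(trainFeatures, trainLabels, weights,
                                       reg, nopredictions=True)
        weights -= lr * grad
    return weights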
    return obj


config = configparser.ConfigParser(
    interpolation=configparser.ExtendedInterpolation())
configPath = 'bi-config.ini'
config.read(configPath)

section = "COPA"
datasets_dir = config.get(section, "datasets_dir")
wordlist = loadObj(config.get(section, "id2word_list"))

dim = '300'
params_path = datasets_dir + '/GloveInit/dim=' + dim
# params_path = '../oldRndWrongCausalSkip/GloveInit/dim=300'
# params_path = '../oldRndWrongCausalSkip/dim=300'
_, X, _, _ = load_saved_params(params_path)  # X: wordVectors

k_means = sklearn.cluster.KMeans(n_clusters=500, max_iter=100000)
k_means.fit(X)
print(len(k_means.labels_))

# Group the words by their assigned cluster label
d = {}
for i, label in enumerate(k_means.labels_):
    d.setdefault(label, [])
    d[label].append(wordlist[i])

# Write one line per cluster: "<cluster id>: <list of member words>"
with open('kmeans_gloveinit_cluster500.txt', 'w') as outf:
    for k, v in d.items():
        outf.write(str(k) + ': ' + str(v) + '\n')
        outf.flush()
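# Usage sketch (hypothetical): look up which cluster a particular word fell
# into and print that cluster's members, recovering a word -> row mapping from
# the id -> word order of `wordlist`. The query word is purely illustrative.
word2id = {w: i for i, w in enumerate(wordlist)}
query = 'flood_c'
if query in word2id:
    cluster_id = k_means.labels_[word2id[query]]
    print(cluster_id, d[cluster_id])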
def __main__():
    dataset = StanfordSentiment(root_dir="../datasets")
    tokens = dataset.tokens()
    numWords = len(tokens)

    _, wordVectors, _ = load_saved_params("../saved_params")
    # Concatenate the input/output word embedding vectors to form the final
    # word representation
    inputVectors, outputVectors = wordVectors[:numWords, :], wordVectors[numWords:, :]
    print(inputVectors.shape)
    print(outputVectors.shape)
    wordVectors = np.concatenate((inputVectors, outputVectors), axis=1)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save the results from each run
    # and see which regularization value performs best
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to the regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on the test set is poor style. Typically, you
        # should do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"], result["train"], result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])
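# accuracy, getRegularizationValues and chooseBestModel are defined elsewhere.
# Minimal sketches of what such helpers typically do (hypothetical names, not
# the originals):
import numpy as np

def accuracy_sketch(y, yhat):
    # Percentage of predictions matching the gold labels
    return np.sum(y == yhat) * 100.0 / y.size

def getRegularizationValues_sketch():
    # A log-spaced sweep of regularization strengths
    return sorted(np.logspace(-5, 1, num=13))

def chooseBestModel_sketch(results):
    # Pick the run with the highest dev-set accuracy
    return max(results, key=lambda r: r["dev"])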
config.read(configPath)
for section in config.sections():
    if not section == "COPA":
        continue
    datasets_dir = config.get(section, "datasets_dir")
    tokens = loadObj(config.get(section, "tokens"))
    wordlist = loadObj(config.get(section, "id2word_list"))
    causeprior = loadObj(config.get(section, "cause_prior"))
    N = len(causeprior)

    for f in glob.glob(datasets_dir + "/GloveInit/dim=*"):
        # Load the causal vectors we trained earlier
        if not os.listdir(f):
            continue
        print(f)
        _, wordVectors, _, _ = load_saved_params(f)
        causeVectors, effectVectors = wordVectors[:N, :], wordVectors[N:, :]

        visualizeWords = [
            'kill_c', 'guilty_e', 'happy_e', 'gift_c', 'fire_c', 'property_e',
            'flood_c', 'damage_e', 'war_c', 'hide_c', 'surprise_e', 'coupon_c',
            'discount_e', 'click_e', 'death_e', 'child_e', 'add_e',
            'information_e', 'email_e', 'contact_e', 'information_c', 'like_c',
            'find_c', 'like_c', 'look_c', 'intention_c', 'interrupt_c',
            'intersection_c']
        # visualizeWords = wordlist
        # visualizeWords = wordlist[0:N]
        """
        indices = list(np.random.choice(np.arange(0, len(wordlist)), size=20, replace=False))
        visualizeWords = [wordlist[idx] for idx in indices]
        """
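# The selected words would typically be projected to 2-D for plotting. A
# hedged sketch of one way to do that, assuming `tokens` maps each word
# (e.g. 'kill_c') to its row in `wordVectors` (hypothetical helper, not the
# original visualization code):
import numpy as np
import matplotlib.pyplot as plt

def plot_words_sketch(wordVectors, tokens, words, out='word_vectors.png'):
    kept = [w for w in words if w in tokens]
    vecs = wordVectors[[tokens[w] for w in kept], :]
    centered = vecs - vecs.mean(axis=0)
    U, S, Vt = np.linalg.svd(centered, full_matrices=False)
    coords = centered.dot(Vt[:2].T)      # project onto top-2 principal directions
    plt.figure()
    for (x, y), w in zip(coords, kept):
        plt.text(x, y, w)
    plt.xlim(coords[:, 0].min() - 0.5, coords[:, 0].max() + 0.5)
    plt.ylim(coords[:, 1].min() - 0.5, coords[:, 1].max() + 0.5)
    plt.savefig(out)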