import numpy as np

from util import load_reviews, shuffle_dataset


def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field [-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]

    feature_key = ["the", "or", "and"]
    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    # make a count vector per review
    feature_lists = []

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)

    # TODO: Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np

    # TODO: Shuffle the list of ids and the feature matrix in unison. Then check your work
    print("Shuffling data")
    # TODO: define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)
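# One way the shuffle_dataset helper called above might look in util.py.
# This is a minimal sketch, not the assignment's reference solution: it
# assumes shuffle_dataset(features, ids) should return the feature matrix
# and the id list reordered by the same random permutation.

import numpy as np


def shuffle_dataset(features, ids):
    """Shuffle a feature matrix and its parallel list of ids in unison."""
    permutation = np.random.permutation(len(ids))
    shuffled_features = features[permutation]      # fancy indexing reorders the rows
    shuffled_ids = [ids[i] for i in permutation]   # apply the same order to the ids
    return shuffled_features, shuffled_ids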
import numpy as np

from util import load_reviews


def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field [-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]

    feature_key = ["the", "or", "and"]
    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    # make a count vector per review
    feature_lists = []

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)

    # Verify your list and numpy array are the same result
    are_equal = np.array_equal(np.asarray(feature_lists), feature_np)
    if are_equal:
        print("Numpy and list reprs are the same!")
    else:
        print("Numpy and list reprs are not equivalent. Keep trying!")

    # TODO: Calculate the total count per feature using your np array and .sum
    count_per_feat = [0, 0, 0]
    for i, feature_name in enumerate(feature_key):
        print(f"Count of '{feature_name}': {count_per_feat[i]}")
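# A small illustration of the per-feature totals asked for above: summing a
# count matrix along axis 0 gives one total per column (per function word).
# This sketch uses made-up counts, not data from the lab file.

import numpy as np

toy_counts = np.array([[2, 0, 1],
                       [1, 1, 0],
                       [3, 0, 2]])          # rows = reviews, columns = "the", "or", "and"
count_per_feat = toy_counts.sum(axis=0)     # -> array([6, 1, 3])
for word, count in zip(["the", "or", "and"], count_per_feat):
    print(f"Count of '{word}': {count}")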
import argparse

import numpy as np

from util import load_reviews, shuffle_dataset


def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field [-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]

    feature_key = ["the", "or", "and"]
    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    # make a count vector per review
    feature_lists = []
    for review in reviews:
        pass  # TODO: append one count vector (one count per word in feature_key) for this review

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)

    # TODO: Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np

    # TODO: Shuffle the list of ids and the feature matrix in unison. Then check your work
    print("Shuffling data")
    # TODO: define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='feature vector lab')
    parser.add_argument('--path', type=str, default="imdb_practice.txt",
                        help='path to input with one review per line')
    args = parser.parse_args()

    main(args.path)
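# A possible shape for the load_reviews helper referenced above (to live in
# util.py). The field delimiter is an assumption -- the comments only say
# that field [0] of each line is the review id and field [-1] is the review
# text, so this sketch splits on tabs.

def load_reviews(data_file):
    """Return parallel lists of review texts and review ids, one per input line."""
    reviews = []
    ids = []
    with open(data_file, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")   # assumed tab-separated
            if not fields or not fields[0]:
                continue                        # skip blank lines
            ids.append(fields[0])               # field [0]: review id
            reviews.append(fields[-1])          # field [-1]: review text
    return reviews, ids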
import numpy as np
from nltk.tokenize import word_tokenize


def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    # define the shape of this 2d array
    nrows = len(ids)
    ncols = len(function_words)
    # initialize the 2d array
    review_features = np.zeros((nrows, ncols), dtype=int)
    # fill in the values of the 2d array
    for i in range(len(reviews)):
        # tokenize and lowercase all the words of the review
        each_review = word_tokenize(reviews[i].lower())
        # loop through each word of the review and update the count in the 2d array
        for word in each_review:
            if word in function_words:
                word_index = function_words.index(word)
                review_features[i][word_index] += 1

    # row is which review
    # column is which word
    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    # TODO: Calculate these from review_features
    # sum up each column
    words_count = [sum(x) for x in zip(*review_features)]
    # get the most common word
    most_common_count = max(words_count)
    most_common_word_index = words_count.index(most_common_count)
    most_common_word = function_words[most_common_word_index]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    # initialize a list for indices whose column sum is zero
    zero_inds = []
    # loop through the list of column sums; append any index whose column sum is zero
    for i in range(len(words_count)):
        if words_count[i] == 0:
            zero_inds.append(i)
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    # copy the 2d array and convert it to a binary matrix
    word_binary = np.copy(review_features)
    # loop through each entry and convert every nonzero value to one
    for i in range(len(word_binary)):
        for j in range(len(word_binary[i])):
            if word_binary[i][j] > 0:
                word_binary[i][j] = 1
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features by review length (divide rows by the number of function words in the review)
    # copy the matrix
    norm_reviews = np.copy(review_features)
    # copy the numpy ndarray to a list so the division produces floats
    norm_reviews = norm_reviews.tolist()
    # loop through each row and calculate the sum of the row
    for i in range(len(norm_reviews)):
        sum_of_row = sum(norm_reviews[i])
        # normalize each entry of the row by the row sum
        # (skip rows with no function words to avoid dividing by zero)
        if sum_of_row > 0:
            for j in range(len(norm_reviews[i])):
                normalized_val = norm_reviews[i][j] / sum_of_row
                norm_reviews[i][j] = normalized_val
    # convert the list back to a numpy array
    norm_reviews = np.array(norm_reviews)
    # round the decimals
    norm_reviews_sum = round(norm_reviews.sum(), 2)
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    # initialize a list for indices whose column sum is less than the minimum count
    remove_column_index = []
    for i in range(len(words_count)):
        if words_count[i] < min_count:
            remove_column_index.append(i)
    # remove those columns with np.delete(array, list of indices to remove, axis=1)
    min_matrix = np.delete(min_matrix, remove_column_index, 1)
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
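# The split_data function called above is defined elsewhere in the lab
# ("the function above"), so this is only a guess at its contract: a sketch
# that holds out a fraction of rows (here 0.3) for validation while keeping
# the features and ids aligned. The (features, ids) tuple return shape is an
# assumption, not the assignment's specification.

import numpy as np


def split_data(features, ids, test_fraction):
    """Split a feature matrix and parallel id list into train and validation sets."""
    n_test = int(len(ids) * test_fraction)
    permutation = np.random.permutation(len(ids))
    test_rows, train_rows = permutation[:n_test], permutation[n_test:]
    train = (features[train_rows], [ids[i] for i in train_rows])
    val = (features[test_rows], [ids[i] for i in test_rows])
    return train, val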
import numpy as np
from nltk.tokenize import word_tokenize

from util import load_reviews, shuffle_dataset


def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field [-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    ###################### debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]
    ######################
    print('\n Debug: \n ')
    print(reviews)
    print(ids)
    print('\n')
    ######################

    feature_key = ["the", "or", "and"]
    print(f"loading feature vectors for {len(reviews)} reviews")

    # For function words "the", "or" and "and", use a Python list to
    # make a count vector per review
    feature_lists = []
    for review in reviews:
        review_words = word_tokenize(review.lower())
        vec = []
        for word in feature_key:
            these_words = [w for w in review_words if w == word]
            vec.append(len(these_words))
        feature_lists.append(vec)
    print(feature_lists)

    # Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)
    for i, review in enumerate(reviews):
        review_words = word_tokenize(review.lower())
        for j, word in enumerate(feature_key):
            these_words = [w for w in review_words if w == word]
            feature_np[i, j] = len(these_words)
    print(feature_np)

    # Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np
    feature_lists_np = np.asarray(feature_lists)
    print(f'equal? {np.array_equal(feature_lists_np, feature_np)}')

    # Shuffle the list of ids and the feature matrix in unison. Then check your work
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    nums = np.random.permutation(len(ids))
    print(nums)
    shuffled_ids = [ids[i] for i in nums]
    print(shuffled_ids)

    print("feature matrix before shuffle")
    print(feature_np)
    print("feature matrix after shuffle")
    shuffled_feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)
    for i in range(len(reviews)):
        shuffled_feature_np[i] = feature_np[nums[i]]
    print(shuffled_feature_np)

    # define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)
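# For reference: the row-by-row copy above can be replaced by numpy fancy
# indexing, which applies the same permutation to every row in one step.
# A minimal sketch on stand-in data:

import numpy as np

feature_np = np.arange(12).reshape(4, 3)      # stand-in feature matrix
ids = ["r1", "r2", "r3", "r4"]                # stand-in ids
nums = np.random.permutation(len(ids))
shuffled_feature_np = feature_np[nums]        # reorder all rows at once
shuffled_ids = [ids[i] for i in nums]         # keep the ids aligned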
import numpy as np


def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)

    # row is which review
    # column is which word
    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    # TODO: Calculate these from review_features
    most_common_count = 0
    most_common_word = ""
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features for review length (divide rows by number of function words in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
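# Hedged sketches for two of the TODOs above, using a toy matrix rather than
# the lab data. Binarizing keeps 1 wherever a count is nonzero; row
# normalization divides each row by its own sum so rows sum to 1 (rows that
# sum to 0 are left as zeros to avoid dividing by zero).

import numpy as np

counts = np.array([[2, 0, 1],
                   [0, 0, 0],
                   [1, 3, 0]], dtype=float)

binary = (counts > 0).astype(int)             # 1 wherever the count is nonzero

row_sums = counts.sum(axis=1, keepdims=True)  # shape (n_rows, 1)
normed = np.divide(counts, row_sums,
                   out=np.zeros_like(counts),
                   where=row_sums > 0)
print(binary)
print(normed)                                 # each nonzero row sums to 1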
import numpy as np


def main(data_file, vocab_path):
    """extract function word features from a text file"""

    ### load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    ### appropriately shape and fill this matrix
    review_features = np.zeros((len(reviews), len(function_words)), dtype=int)
    review_features = feature_matrix(reviews, function_words)

    # row is which review
    # column is which word
    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    ### Calculate these from review_features
    column_sum = np.sum(review_features, axis=0)
    most_common_count = max(column_sum)
    index = np.where(column_sum == column_sum.max())
    most_common_word = function_words[index[0][0]]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    ### Find any features that weren't in the data (i.e. columns that sum to 0)
    index = np.where(column_sum == 0)
    zero_inds = index[0]
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    ### make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    for i in range(len(reviews)):
        word_binary[i] = np.where(word_binary[i] > 0, 1, 0)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    ### normalize features by review length (divide rows by the number of function words in the review)
    # cast to float so the division is not truncated to integers, and take each
    # row's sum before the row is modified; rows with no function words are skipped
    norm_reviews = np.copy(review_features).astype(float)
    for i in range(len(reviews)):
        row_sum = norm_reviews[i].sum()
        if row_sum > 0:
            for j in range(len(function_words)):
                norm_reviews[i, j] = norm_reviews[i, j] / row_sum
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    ### remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    index = np.where(column_sum < min_count)
    mincnt_ind = index[0]
    # keep only the function words whose total count reaches min_count,
    # then rebuild the feature matrix over that reduced vocabulary
    functionword_min_matrix = []
    for i in range(len(function_words)):
        if i not in mincnt_ind:
            functionword_min_matrix.append(function_words[i])
    min_matrix = feature_matrix(reviews, functionword_min_matrix)
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    ### split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
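# The feature_matrix helper used above is defined elsewhere; based on how it
# is called, it likely builds the count matrix (one row per review, one
# column per function word). This is a guess at its shape, assuming nltk
# tokenization, not the file's actual definition.

import numpy as np
from nltk.tokenize import word_tokenize


def feature_matrix(reviews, function_words):
    """Count each function word in each review; rows are reviews, columns are words."""
    counts = np.zeros((len(reviews), len(function_words)), dtype=int)
    word_to_col = {word: j for j, word in enumerate(function_words)}
    for i, review in enumerate(reviews):
        for token in word_tokenize(review.lower()):
            if token in word_to_col:
                counts[i, word_to_col[token]] += 1
    return counts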
import numpy as np


def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # TODO 0: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)

    # row is which review
    # column is which word
    print(f"0: Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO 1: Figure out what the most common word (feature) is in review_features. Do not hardcode the answer
    most_common_count = 0
    most_common_word = ""
    print(f"1. Most common word: {most_common_word}, count: {most_common_count}")

    # TODO 2: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("2. No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("2. All function words found")

    # TODO 3: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"3: Sum of binary matrix: {word_binary_sum}")

    # TODO 4: normalize features for review length (divide rows by number of *function words* in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"4: Sum of normed matrix: {norm_reviews_sum}")

    # TODO 5: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(f"5: Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO 6: normalize features by each feature's *document frequency*
    # For THIS exercise, divide each count by the number of documents that has that feature at all
    # (be careful not to divide by the *total count* of the feature)
    # perform this on the matrix from TODO 5
    df_norm_reviews = np.copy(min_matrix)
    df_norm_reviews_sum = df_norm_reviews.sum()
    print(f"6: Sum of document frequency normed matrix: {df_norm_reviews_sum}")
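# A hedged sketch of the document-frequency normalization described in
# TODO 6, on a toy matrix: each count is divided by the number of documents
# (rows) in which that feature appears at all, not by the feature's total
# count. Columns that appear in no document are left at zero.

import numpy as np

counts = np.array([[2, 0, 1],
                   [1, 0, 0],
                   [3, 0, 2]], dtype=float)

doc_freq = (counts > 0).sum(axis=0)           # number of documents containing each feature
df_normed = np.divide(counts, doc_freq,
                      out=np.zeros_like(counts),
                      where=doc_freq > 0)
print(df_normed)                              # e.g. column 0 counts are each divided by 3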