Example #1
# Shared imports assumed by every example below (each snippet was extracted
# from a larger file that is not shown here):
import argparse

import numpy as np
from nltk.tokenize import word_tokenize

from util import (load_reviews, load_function_words, shuffle_dataset,
                  split_data, check_splits)

def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field[-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]

    feature_key = ["the", "or", "and"]

    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    #     make a count vector per review
    feature_lists = []

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)  # np.int was removed from NumPy; plain int works

    # TODO: Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np

    # TODO: Shuffle the list of id's and the feature matrix in unison. Then check your work
    print(f"Shuffling data")
    #TODO: define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)
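
The TODO comments above defer load_reviews and shuffle_dataset to util.py without showing it. A minimal sketch of those helpers (plus load_function_words, used by later examples), assuming tab-separated fields and a one-word-per-line vocabulary file -- both file-format details are assumptions:

# util.py -- a minimal sketch, not the official solution
import numpy as np


def load_reviews(data_file):
    """Parse one review per line; field [0] is the ID, field [-1] the text.
    Tab-separated fields are an assumption."""
    reviews, ids = [], []
    with open(data_file, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            ids.append(fields[0])
            reviews.append(fields[-1])
    return reviews, ids


def load_function_words(vocab_path):
    """Load the function-word vocabulary; one word per line is an assumption."""
    with open(vocab_path, encoding="utf-8") as f:
        return [line.strip().lower() for line in f if line.strip()]


def shuffle_dataset(feature_matrix, ids):
    """Shuffle the rows of the feature matrix and the id list in unison."""
    order = np.random.permutation(len(ids))
    return feature_matrix[order], [ids[i] for i in order]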
Example #2
def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field[-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]

    feature_key = ["the", "or", "and"]

    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    #     make a count vector per review
    feature_lists = []

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)

    # Verify your list and numpy array are the same result
    are_equal = np.array_equal(np.asarray(feature_lists), feature_np)
    if are_equal:
        print("Numpy and list reprs are the same!")
    else:
        print("Numpy and list reprs are not equivalent. Keep trying!")

    # TODO: Calculate the total count per feature using your np array and .sum
    count_per_feat = [0, 0, 0]

    for i, feature_name in enumerate(feature_key):
        print(f"Count of '{feature_name}': {count_per_feat[i]}")
Example #3
def main(data_file):
    """extract function word features from a text file"""

    # TODO: parse the review file. Field [0] per line is the review ID. Field[-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    # debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]

    feature_key = ["the", "or", "and"]

    print(f"loading feature vectors for {len(reviews)} reviews")

    # TODO: For function words "the", "or" and "and", use a Python list to
    #     make a count vector per review
    feature_lists = []
    for review in reviews:
        pass  # TODO: tokenize the review and append a count vector to feature_lists

    # TODO: Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)

    # TODO: Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np

    # TODO: Shuffle the list of id's and the feature matrix in unison. Then check your work
    print(f"Shuffling data")
    #TODO: define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)



if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='feature vector lab')
    parser.add_argument('--path', type=str, default="imdb_practice.txt",
                        help='path to input with one review per line')

    args = parser.parse_args()

    main(args.path)
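
Given the argparse block above, the review path comes from --path and defaults to imdb_practice.txt; a typical invocation (the script filename here is a placeholder) is python feature_lab.py --path imdb_practice.txt.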
Example #4
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    # define the shape of this 2d array
    nrows = len(ids)
    ncols = len(function_words)
    # initialize the 2d array
    review_features = np.zeros((nrows, ncols), dtype=int)
    # fill in the value of the 2d array
    for i in range(len(reviews)):
        #tokenize and lowercase all the words
        each_review = word_tokenize(reviews[i].lower())
        #loop through each word of each review and fill in the value of the 2d array
        for word in each_review:
            if word in function_words:
                word_index = function_words.index(word)
                review_features[i][word_index] += 1
    # row is which review
    # column is which word

    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    # TODO: Calculate these from review_features
    # sum up each column
    words_count = [sum(x) for x in zip(*review_features)]
    # get the most common words
    most_common_count = max(words_count)
    most_common_word_index = words_count.index(most_common_count)
    most_common_word = function_words[most_common_word_index]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    # initialize a list for index whose column sum is zero
    zero_inds = []
    # loop through the list of the sum of columns, append index whose column sum is zero to the list just initialized
    for i in range(len(words_count)):
        if words_count[i] == 0:
            zero_inds.append(i)
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    # copy the 2d array and convert it to a binary vector
    word_binary = np.copy(review_features)
    # loop through each entry and convert the value whose value is not zero to one
    for i in range(len(word_binary)):
        for j in range(len(word_binary[i])):
            if word_binary[i][j] > 0:
                word_binary[i][j] = 1
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features by review length (divide rows by number of words in the review)
    # copy the matrix
    norm_reviews = np.copy(review_features)
    # copy the numpy ndarray to a list
    norm_reviews = norm_reviews.tolist()
    # loop through each row and calculate the sum of each row
    for i in range(len(norm_reviews)):
        sum_of_row = sum(norm_reviews[i])
        if sum_of_row == 0:
            continue  # skip reviews that contain no function words
        # loop through each entry of the row and normalize it by the row sum
        for j in range(len(norm_reviews[i])):
            norm_reviews[i][j] = norm_reviews[i][j] / sum_of_row
    # convert the list back to a numpy array
    norm_reviews = np.array(norm_reviews)
    #round the decimals
    norm_reviews_sum = round(norm_reviews.sum(), 2)
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    # collect the indices of columns whose sum is less than the minimum count
    remove_column_index = []
    for i in range(len(words_count)):
        if words_count[i] < min_count:
            remove_column_index.append(i)
    # remove columns whose column sum is less than the minimum count by np.delete(array, list of index to remove, axis = 1)
    min_matrix = np.delete(min_matrix, remove_column_index, 1)
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
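
split_data and check_splits are called here (and in Example #6) but never shown. A minimal sketch, assuming the third argument is the validation fraction and each split comes back as a (features, ids) pair; both are assumptions, since only the call sites appear above:

import numpy as np


def split_data(features, ids, val_frac):
    """Shuffle, then hold out the last val_frac of the rows for validation."""
    order = np.random.permutation(len(ids))
    cut = int(len(ids) * (1 - val_frac))
    train_idx, val_idx = order[:cut], order[cut:]
    train = (features[train_idx], [ids[i] for i in train_idx])
    val = (features[val_idx], [ids[i] for i in val_idx])
    return train, val


def check_splits(train, val, features, ids):
    """All three checks must print True: no rows or ids lost or duplicated."""
    print(len(train[1]) + len(val[1]) == len(ids))
    print(train[0].shape[0] + val[0].shape[0] == features.shape[0])
    print(sorted(train[1] + val[1]) == sorted(ids))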
Example #5
def main(data_file):
    """extract function word features from a text file"""
    # TODO: parse the review file. Field [0] per line is the review ID. Field[-1] is the review
    # define this function in util.py
    reviews, ids = load_reviews(data_file)

    ###################### debug using just a few
    reviews = reviews[:10]
    ids = ids[:10]

    ######################
    print('\n Debug: \n ')
    print(reviews)
    print(ids)
    print('\n')
    ######################

    feature_key = ["the", "or", "and"]
    print(f"loading feature vectors for {len(reviews)} reviews")

    # For function words "the", "or" and "and", use a Python list to
    # make a count vector per review
    feature_lists = []
    for review in reviews:
        review_words = word_tokenize(review.lower())
        vec = []
        for word in feature_key:
            these_words = [w for w in review_words if w == word]
            vec.append(len(these_words))
        feature_lists.append(vec)
    print(feature_lists)

    # Create the same feature vectors as a numpy array
    feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)
    for i, review in enumerate(reviews):
        review_words = word_tokenize(review.lower())
        for j, word in enumerate(feature_key):
            these_words = [w for w in review_words if w == word]
            feature_np[i, j] = len(these_words)
    print(feature_np)

    # Cast your feature_lists to a numpy array and then verify it is equivalent to feature_np
    feature_lists_np = np.asarray(feature_lists)
    print(f'equal? {np.array_equal(feature_lists_np, feature_np)}')

    # Shuffle the list of id's and the feature matrix in unison. Then check your work
    print("ids before shuffle")
    print(ids)

    print("ids after shuffle")
    nums = np.random.permutation(len(ids))
    print(nums)
    shuffled_ids = [ids[i] for i in nums]
    print(shuffled_ids)

    print("feature matrix before shuffle")
    print(feature_np)

    print("feature matrix after shuffle")
    shuffled_feature_np = np.zeros((len(reviews), len(feature_key)), dtype=int)
    for i in range(len(reviews)):
        shuffled_feature_np[i] = feature_np[nums[i]]
    print(shuffled_feature_np)

    # define this function in util.py
    shuffled_feature_matrix, shuffled_ids = shuffle_dataset(feature_np, ids)
    print("ids before shuffle")
    print(ids)
    print("ids after shuffle")
    print(shuffled_ids)
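
A note on the row-copy loop above: NumPy integer-array indexing can shuffle every row in one step, which also makes a natural body for shuffle_dataset (a sketch, not necessarily how util.py defines it):

nums = np.random.permutation(len(ids))
shuffled_feature_np = feature_np[nums]   # reorders all rows at once
shuffled_ids = [ids[i] for i in nums]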
Example #6
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word

    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    # TODO: Calculate these from review_features
    most_common_count = 0
    most_common_word = ""
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features for review length (divide rows by number of function words in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    #TODO: split the dataset by updating the function above

    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
Example #7
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    ### load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    ### appropriately shape and fill this matrix
    review_features = feature_matrix(reviews, function_words)
    # row is which review
    # column is which word
    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    ### Calculate these from review_features
    column_sum = np.sum(review_features, axis=0)
    most_common_count = max(column_sum)

    index = np.where(column_sum == column_sum.max())
    most_common_word = function_words[index[0][0]]

    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    ### Find any features that weren't in the data (i.e. columns that sum to 0)
    index = np.where(column_sum == 0)
    zero_inds = index[0]
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    ### make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    for i in range(len(reviews)):
        word_binary[i] = np.where(word_binary[i] > 0, 1, 0)

    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    ### normalize features by review length (divide rows by number of function words in the review)
    # cast to float so the division is not truncated to int, and take each
    # row's sum once, before any entries are overwritten
    norm_reviews = review_features.astype(float)

    for i in range(len(reviews)):
        row_sum = norm_reviews[i].sum()
        if row_sum > 0:  # skip reviews that contain no function words
            norm_reviews[i] = norm_reviews[i] / row_sum

    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    ### remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)

    index = np.where(column_sum < min_count)
    mincnt_ind = index[0]

    functionword_min_matrix = []
    for i in range(len(function_words)):
        if i not in mincnt_ind:
            functionword_min_matrix.append(function_words[i])

    min_matrix = feature_matrix(reviews, functionword_min_matrix)

    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    ### split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
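
Example #7 calls feature_matrix twice but its definition isn't shown. A minimal sketch, assuming it builds the same reviews-by-vocabulary count matrix the other examples build inline; the dict lookup avoids the O(n) list.index call used in Example #4:

import numpy as np
from nltk.tokenize import word_tokenize


def feature_matrix(reviews, vocab):
    """Count occurrences of each vocab word per review (row: review, column: word)."""
    counts = np.zeros((len(reviews), len(vocab)), dtype=int)
    word_to_col = {w: j for j, w in enumerate(vocab)}
    for i, review in enumerate(reviews):
        for token in word_tokenize(review.lower()):
            j = word_to_col.get(token)
            if j is not None:
                counts[i, j] += 1
    return counts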
Example #8
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO 0: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word

    print(
        f"0: Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO 1: Figure out what the most common word (feature) is in review_features. Do not hardcode the answer
    most_common_count = 0
    most_common_word = ""
    print(
        f"1. Most common word: {most_common_word}, count: {most_common_count}")

    # TODO 2: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("2. No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("2. All function words found")

    # TODO 3: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"3: Sum of binary matrix: {word_binary_sum}")

    # TODO 4: normalize features for review length (divide rows by number of *function words* in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"4: Sum of normed matrix: {norm_reviews_sum}")

    # TODO 5: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"5: Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    # TODO 6: normalize features by each feature's *document frequency*
    # For THIS exercise, divide each count by the number of documents that has that feature at all
    # (be careful not to divide by *total count* of the feature)
    # perform this on the matrix from TODO 5
    df_norm_reviews = np.copy(min_matrix)
    df_norm_reviews_sum = df_norm_reviews.sum()
    print(f"6: Sum of document frequency normed matrix: {df_norm_reviews_sum}")