def find_alternate_sentence(row):
    '''
        Attach the closest "alternate" similar sentence from MASTER_FILE to row.

        Each record of MASTER_FILE whose 'SentID_GM' differs from the row's is
        compared with the row by passing the word-frequency mappings of
        row['Sentence'] and record['Sentence_with_Target'] to vector_angle.
        The record with the smallest result strictly between 0 and 0.75 is
        the candidate; when two records tie, the one read first is kept.

        If a candidate exists and its 'SentID_GM' differs from
        row.get('SimilarTo_SentID_GM'), two keys are written to row:
        'Alternate_SimilarTo_SentID_GM' and 'Alternate_SimilarTo_Sentence'.
        Otherwise row is left untouched.

        Args:
            row: dict with at least the keys 'SentID_GM' and 'Sentence'.
                It is modified in place.

        Returns:
            None.
    '''
    # The row's frequency mapping does not depend on the record being
    # examined, so it is built once instead of once per MASTER_FILE record.
    # NOTE(review): assumes vector_angle does not mutate its arguments --
    # confirm against the document-distance module in use.
    row_freq_mapping = count_frequency(get_words_from_string(row['Sentence']))

    best_distance = None
    best_record = None
    for record in get_csv_rows(MASTER_FILE):
        if record['SentID_GM'] == row['SentID_GM']:
            # Never compare a sentence with itself.
            continue

        record_freq_mapping = count_frequency(
            get_words_from_string(record['Sentence_with_Target']))
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Only results strictly inside (0, 0.75) count as "similar".
        if not 0 < distance < 0.75:
            continue

        # Strict '<' keeps the earliest record when distances are equal.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            best_record = record

    if best_record is None:
        return

    # Skip the update when the candidate is the sentence already stored as
    # the row's primary similar sentence.
    if best_record['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = best_record['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = best_record[
            'Sentence_with_Target']
# Example #2
# 0
def find_alternate_sentence(row):
    '''
        Find the best alternate similar sentence for row in MASTER_FILE.

        The function scans every record returned by get_csv_rows(MASTER_FILE),
        skipping the record that has the same 'SentID_GM' as row. For each
        remaining record it calls vector_angle on the word-frequency mapping
        of row['Sentence'] and that of record['Sentence_with_Target'], and
        remembers the record with the smallest result in the open interval
        (0, 0.75). Ties are resolved in favour of the record seen first.

        When such a record is found and its 'SentID_GM' is not equal to
        row.get('SimilarTo_SentID_GM'), row receives:
            'Alternate_SimilarTo_SentID_GM' -- the record's 'SentID_GM'
            'Alternate_SimilarTo_Sentence'  -- the record's 'Sentence_with_Target'

        Args:
            row: dict containing at least 'SentID_GM' and 'Sentence';
                updated in place.

        Returns:
            None.
    '''
    # Build the row's word counts a single time: they are identical for
    # every record, so recomputing them inside the loop is wasted work.
    # NOTE(review): relies on vector_angle leaving its inputs unmodified --
    # confirm for the docdist implementation imported by this file.
    row_word_list = get_words_from_string(row['Sentence'])
    row_freq_mapping = count_frequency(row_word_list)

    closest = None  # (distance, record) of the best match so far
    for record in get_csv_rows(MASTER_FILE):
        if record['SentID_GM'] == row['SentID_GM']:
            # The row itself is not a valid alternate.
            continue

        record_word_list = get_words_from_string(
            record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Accept only values strictly between 0 and 0.75, and replace the
        # current best only when the new value is strictly smaller.
        if 0 < distance < 0.75 and (closest is None
                                    or distance < closest[0]):
            closest = (distance, record)

    if closest is None:
        return

    match = closest[1]
    # Do not record the alternate if it is the row's existing similar
    # sentence.
    if match['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = match['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = match['Sentence_with_Target']
# Example #3
# 0
        # -------------------------------------------------------------------------

    '''

    # find alternate similar sentence using document distance
    similar_sentence = None
    for record in get_csv_rows(MASTER_FILE):
        # record is a row in MASTER_FILE

        if record['SentID_GM'] == row['SentID_GM']:
            # ignore the same sentence
            continue
# conditional statement saying if the masterfile and the sentenceDB file are equivalent then continue in the function
        # get frequency mapping for row['Sentence']
        row_word_list = get_words_from_string(row['Sentence'])
        row_freq_mapping = count_frequency(row_word_list)
        # setting variables equal to functions that are defined in the docdist1.py file that will then be able to be
        # called

        # get frequency mapping for record['Sentence_with_Target']
        record_word_list = get_words_from_string(record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)
        # setting variables equal to functions that are defined in the docdist1.py file that will then be able to be
        # called

        distance = vector_angle(row_freq_mapping, record_freq_mapping)
        # calculating the distance of each vector angle by calling the variables that refer to functions in the
        # docdist1.py file
        if 0 < distance < 0.75:
            # conditional: if the distance value is between 0 and .75 then it's true and one will continue
            if (not similar_sentence) or (distance < similar_sentence['distance']):
def find_alternate_sentence(row, data):
    '''
        Pick the closest alternate similar sentence for row from data.

        For every record in data whose 'SentID_GM' differs from the row's,
        the word-frequency mapping of record['Sentence_with_Target'] is
        compared with that of row['Sentence'] through vector_angle. The
        record yielding the smallest value strictly between 0 and 0.75 is
        selected; if several records share that value, the first one
        encountered is kept.

        If a record is selected and its 'SentID_GM' is different from
        row.get('SimilarTo_SentID_GM'), the keys
        'Alternate_SimilarTo_SentID_GM' and 'Alternate_SimilarTo_Sentence'
        are set on row from the record's 'SentID_GM' and
        'Sentence_with_Target'.

        Args:
            row: dict with at least 'SentID_GM' and 'Sentence'; modified in
                place.
            data: iterable of record dicts, each with 'SentID_GM' and
                'Sentence_with_Target'. Presumably the rows of MASTER_FILE
                loaded once by the caller -- verify against the call site.

        Returns:
            None.
    '''
    # Loop-invariant: the row's frequency mapping is the same for every
    # record, so compute it before iterating rather than once per record.
    # NOTE(review): assumes vector_angle does not modify the mappings it is
    # given -- confirm.
    row_freq_mapping = count_frequency(get_words_from_string(row['Sentence']))

    similar_sentence = None
    for record in data:
        if record['SentID_GM'] == row['SentID_GM']:
            # ignore the same sentence
            continue

        record_freq_mapping = count_frequency(
            get_words_from_string(record['Sentence_with_Target']))
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Candidates must fall strictly between 0 and 0.75.
        if not 0 < distance < 0.75:
            continue

        # Keep only a strictly better candidate, so earlier records win ties.
        if similar_sentence is None or distance < similar_sentence['distance']:
            similar_sentence = {
                'distance': distance,
                'Sentence_with_Target': record['Sentence_with_Target'],
                'SentID_GM': record['SentID_GM'],
            }

    # Write the alternate only when one was found and it is not the row's
    # already-recorded similar sentence.
    if (similar_sentence is not None and similar_sentence['SentID_GM'] !=
            row.get('SimilarTo_SentID_GM')):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence[
            'Sentence_with_Target']
def find_alternate_sentence(row):
    '''
        Record on row the MASTER_FILE sentence that is its closest alternate.

        Records are read with get_csv_rows(MASTER_FILE). A record is skipped
        when its 'SentID_GM' equals row['SentID_GM']. For the others,
        vector_angle is evaluated on the word-frequency mappings of
        row['Sentence'] and record['Sentence_with_Target']; values outside
        the open interval (0, 0.75) are discarded. The record with the
        smallest accepted value is chosen, the first such record winning
        ties.

        If a record was chosen and its 'SentID_GM' differs from
        row.get('SimilarTo_SentID_GM'), row is given the keys
        'Alternate_SimilarTo_SentID_GM' and 'Alternate_SimilarTo_Sentence'.

        Args:
            row: dict holding at least 'SentID_GM' and 'Sentence'; it is
                modified in place.

        Returns:
            None.
    '''
    # These two values depend only on row, so they are computed once here
    # instead of being rebuilt for every record in MASTER_FILE.
    # NOTE(review): assumes vector_angle treats its arguments as read-only
    # -- confirm against the document-distance module.
    row_word_list = get_words_from_string(row['Sentence'])
    row_freq_mapping = count_frequency(row_word_list)

    similar_sentence = None
    for record in get_csv_rows(MASTER_FILE):
        # record is a row in MASTER_FILE
        if record['SentID_GM'] == row['SentID_GM']:
            # Same sentence ID as the row: skip this record and move on to
            # the next one.
            continue

        # Frequency mapping for the record's sentence.
        record_word_list = get_words_from_string(
            record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)

        distance = vector_angle(row_freq_mapping, record_freq_mapping)
        if 0 < distance < 0.75:
            # Replace the current best only when there is none yet or the
            # new distance is strictly smaller.
            if (similar_sentence is None
                    or distance < similar_sentence['distance']):
                similar_sentence = {
                    'distance': distance,
                    'Sentence_with_Target': record['Sentence_with_Target'],
                    'SentID_GM': record['SentID_GM'],
                }

    if similar_sentence is None:
        return

    # The alternate must differ from the row's existing similar sentence.
    if similar_sentence['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence[
            'Sentence_with_Target']