def find_alternate_sentence(row):
    """Find an alternate similar sentence for *row* using document distance.

    Compares row['Sentence'] against the 'Sentence_with_Target' of every
    record in MASTER_FILE via the cosine-distance helpers
    (get_words_from_string / count_frequency / vector_angle), keeps the
    closest match whose distance is strictly between 0 and 0.75, and —
    when that match differs from the row's existing SimilarTo_SentID_GM —
    stores it on the row in place.

    Args:
        row: dict-like CSV row; must contain 'Sentence' and 'SentID_GM',
             and may contain 'SimilarTo_SentID_GM'.

    Returns:
        None. Mutates *row*: may add 'Alternate_SimilarTo_SentID_GM'
        and 'Alternate_SimilarTo_Sentence'.
    """
    # The row's own frequency mapping does not depend on the loop variable,
    # so compute it ONCE up front instead of once per MASTER_FILE record.
    # (The original recomputed it inside the loop — the main slowdown the
    # assignment hints point at: "Is there a step which can be taken out
    # of the 'for' loop?")
    row_word_list = get_words_from_string(row['Sentence'])
    row_freq_mapping = count_frequency(row_word_list)

    similar_sentence = None  # best (smallest-distance) match found so far
    for record in get_csv_rows(MASTER_FILE):
        # Skip the row itself; its distance to itself would be 0.
        if record['SentID_GM'] == row['SentID_GM']:
            continue

        # Frequency mapping for this master-file record's sentence.
        record_word_list = get_words_from_string(record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)

        # Cosine distance (vector angle) between the two word-frequency vectors.
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Only distances strictly inside (0, 0.75) count as "similar";
        # keep the closest such record seen so far.
        if 0 < distance < 0.75:
            if (not similar_sentence) or (distance < similar_sentence['distance']):
                similar_sentence = {
                    'distance': distance,
                    'Sentence_with_Target': record['Sentence_with_Target'],
                    'SentID_GM': record['SentID_GM'],
                }

    # Record the match only when it differs from the primary similar
    # sentence already stored on the row (if any).
    if similar_sentence and similar_sentence['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence['Sentence_with_Target']
def find_alternate_sentence(row):
    """Find an alternate similar sentence for *row* via cosine document distance.

    Scans every record of MASTER_FILE, measures the angle between the
    word-frequency vector of row['Sentence'] and each record's
    'Sentence_with_Target', and remembers the closest record whose distance
    lies strictly in (0, 0.75). If that best match is not the sentence
    already recorded as SimilarTo_SentID_GM, it is written onto the row.

    Args:
        row: dict-like CSV row with 'Sentence' and 'SentID_GM' keys;
             'SimilarTo_SentID_GM' is read via .get() and may be absent.

    Returns:
        None. Mutates *row*: may set 'Alternate_SimilarTo_SentID_GM'
        and 'Alternate_SimilarTo_Sentence'.
    """
    similar_sentence = None  # best match found so far (None until one qualifies)

    # Fixed two defects from the original:
    #   * removed the unused local `d = {}`;
    #   * hoisted the row's word list / frequency mapping out of the loop —
    #     they do not depend on `record`, so recomputing them per iteration
    #     only wasted time.
    row_word_list = get_words_from_string(row['Sentence'])
    row_freq_mapping = count_frequency(row_word_list)

    for record in get_csv_rows(MASTER_FILE):
        # Ignore the row's own entry in the master file.
        if record['SentID_GM'] == row['SentID_GM']:
            continue

        # Frequency mapping for the candidate sentence from the master file.
        record_word_list = get_words_from_string(record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)

        # Angle between the two frequency vectors (smaller = more similar).
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        if 0 < distance < 0.75:
            # Keep this record if it is the first qualifying match or is
            # closer than the best match seen so far.
            if (not similar_sentence) or (distance < similar_sentence['distance']):
                similar_sentence = {
                    'distance': distance,
                    'Sentence_with_Target': record['Sentence_with_Target'],
                    'SentID_GM': record['SentID_GM'],
                }

    # Only store the alternate if it differs from the primary similar sentence.
    if similar_sentence and similar_sentence['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence['Sentence_with_Target']
# ------------------------------------------------------------------------- ''' # find alternate similar sentence using document distance similar_sentence = None for record in get_csv_rows(MASTER_FILE): # record is a row in MASTER_FILE if record['SentID_GM'] == row['SentID_GM']: # ignore the same sentence continue # conditional statement saying if the masterfile and the sentenceDB file are equivalent then continue in the function # get frequency mapping for row['Sentence'] row_word_list = get_words_from_string(row['Sentence']) row_freq_mapping = count_frequency(row_word_list) # setting variables equal to functions that are defined in the docdist1.py file that will then be able to be # called # get frequency mapping for record['Sentence_with_Target'] record_word_list = get_words_from_string(record['Sentence_with_Target']) record_freq_mapping = count_frequency(record_word_list) # setting variables equal to functions that are defined in the docdist1.py file that will then be able to be # called distance = vector_angle(row_freq_mapping, record_freq_mapping) # calculating the distance of each vector angle by calling the variables that refer to functions in the # docdist1.py file if 0 < distance < 0.75: # conditional: if the distance value is between 0 and .75 then it's true and one will continue if (not similar_sentence) or (distance < similar_sentence['distance']):
def find_alternate_sentence(row, data):
    """Find an alternate similar sentence for *row* among pre-loaded records.

    This variant takes the master-file rows as the *data* argument (loaded
    once by the caller) instead of re-reading MASTER_FILE on every call —
    the caching optimization the assignment asks for. For each record in
    *data* it computes the cosine distance between row['Sentence'] and the
    record's 'Sentence_with_Target', keeps the closest match with distance
    strictly in (0, 0.75), and writes it onto the row if it differs from
    the already-stored SimilarTo_SentID_GM.

    Args:
        row:  dict-like CSV row with 'Sentence' and 'SentID_GM' keys;
              'SimilarTo_SentID_GM' is read via .get() and may be absent.
        data: iterable of master-file records (dicts with
              'SentID_GM' and 'Sentence_with_Target' keys).

    Returns:
        None. Mutates *row*: may set 'Alternate_SimilarTo_SentID_GM'
        and 'Alternate_SimilarTo_Sentence'.
    """
    similar_sentence = None  # closest qualifying match so far

    # The row's frequency mapping is independent of the loop variable, so
    # build it once here rather than once per record (the original rebuilt
    # it inside the loop). Also dropped the commented-out, superseded
    # `for record in get_csv_rows(MASTER_FILE):` lines — dead code.
    row_word_list = get_words_from_string(row['Sentence'])
    row_freq_mapping = count_frequency(row_word_list)

    for record in data:
        # Skip the row's own master-file entry.
        if record['SentID_GM'] == row['SentID_GM']:
            continue

        # Frequency mapping for the candidate master-file sentence.
        record_word_list = get_words_from_string(record['Sentence_with_Target'])
        record_freq_mapping = count_frequency(record_word_list)

        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Qualify only distances strictly between 0 and 0.75, keeping
        # the smallest one encountered.
        if 0 < distance < 0.75:
            if (not similar_sentence) or (distance < similar_sentence['distance']):
                similar_sentence = {
                    'distance': distance,
                    'Sentence_with_Target': record['Sentence_with_Target'],
                    'SentID_GM': record['SentID_GM'],
                }

    # Store the alternate only when it is not the same sentence as the
    # row's primary SimilarTo match.
    if similar_sentence and similar_sentence['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence['Sentence_with_Target']
def find_alternate_sentence(row):
    """Locate an alternate similar sentence for *row* by document distance.

    Iterates over all rows of MASTER_FILE, computing the vector angle
    (cosine distance) between the word-frequency mapping of
    row['Sentence'] and each record's 'Sentence_with_Target'. The closest
    record whose distance is strictly between 0 and 0.75 is kept; if it
    differs from the sentence already stored in 'SimilarTo_SentID_GM',
    it is recorded on the row as the alternate match.

    Args:
        row: dict-like CSV row containing 'Sentence' and 'SentID_GM';
             'SimilarTo_SentID_GM' may be absent (read with .get()).

    Returns:
        None. Mutates *row*: may add 'Alternate_SimilarTo_SentID_GM'
        and 'Alternate_SimilarTo_Sentence'.
    """
    # Loop-invariant hoist: the row's word list and frequency mapping do
    # not change between iterations, so they are computed once here.
    # The original rebuilt both on every MASTER_FILE record, which was
    # the dominant avoidable cost in this function.
    row_freq_mapping = count_frequency(get_words_from_string(row['Sentence']))

    similar_sentence = None  # best candidate found so far
    for record in get_csv_rows(MASTER_FILE):
        # A record with the same ID is the row itself — skip it.
        if record['SentID_GM'] == row['SentID_GM']:
            continue

        # Build the candidate's frequency mapping and measure the angle
        # between the two frequency vectors.
        record_freq_mapping = count_frequency(
            get_words_from_string(record['Sentence_with_Target']))
        distance = vector_angle(row_freq_mapping, record_freq_mapping)

        # Accept only distances strictly inside (0, 0.75); among those,
        # retain the record with the smallest distance.
        if 0 < distance < 0.75:
            if (not similar_sentence) or (distance < similar_sentence['distance']):
                similar_sentence = {
                    'distance': distance,
                    'Sentence_with_Target': record['Sentence_with_Target'],
                    'SentID_GM': record['SentID_GM'],
                }

    # Write the alternate onto the row only when it differs from the
    # primary similar sentence already recorded there.
    if similar_sentence and similar_sentence['SentID_GM'] != row.get('SimilarTo_SentID_GM'):
        row['Alternate_SimilarTo_SentID_GM'] = similar_sentence['SentID_GM']
        row['Alternate_SimilarTo_Sentence'] = similar_sentence['Sentence_with_Target']