def get_cost(self, word_1: str, word_2: str):
    """Weighted OSA edit distance between two words, compared case-insensitively.

    Uses this instance's per-operation cost arrays (insert/delete/substitute/
    transpose).
    """
    lhs = word_1.lower()
    rhs = word_2.lower()
    return osa(
        lhs,
        rhs,
        insert_costs=self.insert_costs,
        delete_costs=self.delete_costs,
        substitute_costs=self.substitute_costs,
        transpose_costs=self.transpose_costs,
    )
def get_cost(self, word_1: str, word_2: str):
    """Return the weighted OSA distance between *word_1* and *word_2*.

    Both inputs are lower-cased first; the instance supplies the cost
    arrays for each edit operation.
    """
    return osa(
        word_1.lower(),
        word_2.lower(),
        insert_costs=self.insert_costs,
        delete_costs=self.delete_costs,
        substitute_costs=self.substitute_costs,
        transpose_costs=self.transpose_costs,
    )
def test_osa(self):
    """osa() with default costs should behave like plain edit distance."""
    cases = [
        ('1234', '1234', 0.0),  # identical strings
        ('', '1234', 4.0),      # pure insertions
        ('1234', '', 4.0),      # pure deletions
        ('', '', 0.0),          # both empty
        ('1234', '12', 2.0),    # suffix removed
        ('1234', '14', 2.0),    # interior characters removed
        ('1111', '1', 3.0),     # repeated character collapsed
    ]
    for left, right, expected in cases:
        self.assertEqual(osa(left, right), expected)
def find_closest_string_weighted(string):
    """Find the known product name closest to *string* under a weighted OSA
    (optimal string alignment) edit distance.

    Product names are loaded from cleaned_bucket_data.json as buckets (lists)
    of names; only buckets whose first product starts with the query's first
    letter are searched. Returns the best match lower-cased, or None when the
    query is empty or no candidate shares its first letter.
    """
    print("string: " + string)
    with open('../approaches/edit_distance/cleaned_bucket_data.json', encoding="ASCII") as f:
        data = json.load(f)
    if not string:
        # An empty query has no first letter to bucket on (string[0] would raise).
        print("closest_string: None")
        return None
    # Gather candidate products from buckets keyed by the query's first letter.
    # Compared case-insensitively, consistent with predict()'s internal path
    # (the original used an exact, case-sensitive comparison here).
    first_letter = string[0].lower()
    products = []
    for bucket in data:
        if bucket[0][0].lower() == first_letter:
            products += bucket
    # Keep ASCII only: the cost arrays below index codepoints 0..127, so the
    # filter must be ord(ch) < 128 — the original <= 128 off-by-one let
    # codepoint 128 through, overrunning the 128-entry arrays inside osa().
    cleaned_products = [
        ''.join(ch for ch in entry if ord(ch) < 128) for entry in products
    ]
    # Asymmetric edit costs over the 128 ASCII codepoints:
    # cheap insertions, moderate transpositions, expensive deletions.
    insert_costs = np.full(128, .3, dtype=np.float64)
    transpose_costs = np.full((128, 128), .7, dtype=np.float64)
    delete_costs = np.full(128, 1.2, dtype=np.float64)
    closest_distance = None  # None sentinel replaces the magic 999999
    closest_string = None
    for candidate in cleaned_products:
        distance = osa(string.lower(), candidate.lower(),
                       insert_costs=insert_costs,
                       transpose_costs=transpose_costs,
                       delete_costs=delete_costs)
        if closest_distance is None or distance < closest_distance:
            closest_distance = distance
            closest_string = candidate.lower()
    # str() guards the no-candidates case, where closest_string stays None
    # (the original "..." + None concatenation raised TypeError).
    print("closest_string: " + str(closest_string))
    return closest_string
def _osa_cost_arrays():
    """Build the weighted-OSA cost arrays over the 128 ASCII codepoints:
    cheap insertions (.3), moderate transpositions (.7), costly deletions (1.2)."""
    insert_costs = np.full(128, .3, dtype=np.float64)
    transpose_costs = np.full((128, 128), .7, dtype=np.float64)
    delete_costs = np.full(128, 1.2, dtype=np.float64)
    return insert_costs, transpose_costs, delete_costs


def _closest_by_edit_distance(query, data):
    """Return the product name in *data* closest to *query* under the weighted
    OSA edit distance, lower-cased, or None when the query is empty or no
    bucket shares its first letter.

    *data* is a list of buckets (lists of product names); only buckets whose
    first product starts with the query's first letter are searched. The
    comparison is case-insensitive — the original batch path compared exact
    characters while the single-query path lower-cased, an inconsistency this
    helper resolves.
    """
    if not query:
        return None
    first_letter = query[0].lower()
    products = []
    for bucket in data:
        if bucket[0][0].lower() == first_letter:
            products += bucket
    # ASCII only: the cost arrays index codepoints 0..127, so the filter must
    # be ord(ch) < 128 (the original <= 128 let codepoint 128 overrun them).
    cleaned_products = [
        ''.join(ch for ch in entry if ord(ch) < 128) for entry in products
    ]
    insert_costs, transpose_costs, delete_costs = _osa_cost_arrays()
    closest_distance = None  # None sentinel replaces the magic 999999
    closest_string = None
    for candidate in cleaned_products:
        distance = osa(query.lower(), candidate.lower(),
                       insert_costs=insert_costs,
                       transpose_costs=transpose_costs,
                       delete_costs=delete_costs)
        if closest_distance is None or distance < closest_distance:
            closest_distance = distance
            closest_string = candidate.lower()
    return closest_string


def predict(searched=None, internal_call=False, receipt=None, receipt_titles=None):
    """Run the three classifiers (LSTM, weighted edit distance, GBDT).

    When ``internal_call`` is True, classifies the single string ``searched``
    and returns a ``(lstm_preds, edit_distance_pred, gbdt_preds)`` tuple.
    Otherwise classifies every title in ``receipt_titles``, publishes the
    results through module-level globals, and renders index.html.

    important variables:
    tokenizer - tokenizer for tokenizing text
    lstm_model - trained lstm model
    le - label encoder to decode output
    """
    # lstm: tokenizer, trained model and its label encoder
    K.clear_session()
    lstm_path = "../approaches/LSTM/"
    with open(lstm_path + "pickled/tokenizer_300k_1epoch.pickle", 'rb') as handle:
        tokenizer = pickle.load(handle)
    lstm_model = load_model(lstm_path + "models/lstm_300k_epochs_1.h5")
    le = preprocessing.LabelEncoder()
    le.classes_ = np.load(lstm_path + "pickled/labelencoder_classes_300k_1epoch.npy")
    tokenizer.oov_token = None
    # edit distance: pre-bucketed product names
    with open('../approaches/edit_distance/cleaned_bucket_data.json', encoding="ASCII") as f:
        data = json.load(f)
    # gbdt model and its label encoder; the original leaked the file handle
    # via pickle.load(open(...)) — the with-block closes it deterministically.
    gbdt_path = "../approaches/gbdt/"
    with open(gbdt_path + "models/gbdt_model.sav", "rb") as handle:
        gbdt_model = pickle.load(handle)
    gbdt_le = preprocessing.LabelEncoder()
    gbdt_le.classes_ = np.load(gbdt_path + "pickled/labelencoder_gbdt_classes.npy")

    if internal_call:
        # Single-query path: classify `searched` with all three models.
        encoded_x = tokenizer.texts_to_sequences([searched])
        padded = pad_sequences(encoded_x, 25)
        lstm_preds = lstm_model.predict(padded)
        pred_labels = [[np.argmax(x)] for x in lstm_preds]
        lstm_preds = le.inverse_transform(pred_labels)
        # gbdt
        df = get_dataframe([searched.upper()])
        gbdt_preds = gbdt_model.predict(df.drop(columns=['x', 'y'], axis=1))
        gbdt_preds = gbdt_le.inverse_transform(gbdt_preds)[0]
        # edit distance
        edit_distance_pred = _closest_by_edit_distance(searched, data)
        return (lstm_preds, edit_distance_pred, gbdt_preds)

    # Batch path: one prediction per receipt title.
    print('receipt titles = ', receipt_titles)
    encoded_x = tokenizer.texts_to_sequences(receipt_titles)
    padded = pad_sequences(encoded_x, 25)
    lstm_preds = lstm_model.predict(padded)
    pred_labels = [[np.argmax(x)] for x in lstm_preds]
    lstm_preds = le.inverse_transform(pred_labels)
    # gbdt
    df = get_dataframe([title.upper() for title in receipt_titles])
    gbdt_preds = gbdt_model.predict(df.drop(columns=['x', 'y'], axis=1))
    gbdt_preds = gbdt_le.inverse_transform(gbdt_preds)
    # edit distance (shared helper replaces the duplicated inline block)
    edit_distance_preds = [
        _closest_by_edit_distance(title, data) for title in receipt_titles
    ]
    # Publish results through module-level globals read elsewhere in the app.
    global manual_search, text, receipt_titles_global
    global lstm_output, edit_distance_output, gbdt_output
    manual_search = False
    text = receipt
    receipt_titles_global = receipt_titles
    lstm_output = lstm_preds
    edit_distance_output = edit_distance_preds
    gbdt_output = gbdt_preds
    print("edit_distance_output " + str(edit_distance_output))
    return render_template("index.html", manual_search=False, text=receipt, receipt_titles=receipt_titles, lstm_output=lstm_preds, edit_distance_output=edit_distance_preds, gbdt_output=gbdt_preds)
def _osa(self, x, y):
    """Weighted OSA distance between strings x and y using this instance's cost arrays."""
    # iw/dw/sw/tw are passed positionally; presumably insert, delete,
    # substitute, transpose cost arrays in weighted_levenshtein.osa's
    # positional order — TODO confirm against the library signature.
    return osa(x, y, self.iw, self.dw, self.sw, self.tw)