def predict():
    try:
        # f = request.files['file']
        image = Image.open('./img.jpg').convert("RGB")
        image = image_loader(image)
        encoder, decoder, vocab = initialize()
        features = encoder(image).unsqueeze(1)

        # Greedy decoding for the first caption.
        output = decoder.sample(features)
        sentence = clean_sentence(output, vocab)
        res = {}
        res['pred_1'] = sentence

        # Beam search for up to three alternative captions.
        outputs = decoder.sample_beam_search(features)
        num_sents = min(len(outputs), 3)
        count = 2
        for output in outputs[:num_sents]:
            sentence = clean_sentence(output, vocab)
            res['pred_{}'.format(count)] = sentence
            count += 1
        # print(res)
        return app.response_class(response=json.dumps(res),
                                  status=200,
                                  mimetype='application/json')
    except Exception as error:
        err = str(error)
        print(err)
        return app.response_class(response=json.dumps(err),
                                  status=500,
                                  mimetype='application/json')
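# The captioning snippets above and below call clean_sentence(output, vocab)
# to turn a decoder's token-id sequence back into text, but the helper itself
# is not shown. A minimal sketch of what such a function typically does,
# ASSUMING the vocab exposes an idx2word mapping and the usual
# <start>/<end>/<pad> sentinels (neither is confirmed by the source):
def clean_sentence(output, vocab):
    # Map ids to words, then drop sentinel tokens before joining.
    words = [vocab.idx2word[idx] for idx in output]
    words = [w for w in words if w not in ('<start>', '<end>', '<pad>')]
    return ' '.join(words)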
def find_line_in_transcript(transcript, sentence):
    """
    Uses the Levenshtein distance between a sentence and the lines in a
    transcript to identify the line in the transcript most likely to
    correspond to the query sentence.

    Parameters
    ----------
    transcript : np.array of shape (n, 4)
        A podcast transcript as produced via compile_episode_transcript.
        Each row corresponds to a line in the transcript, and the columns
        correspond to [start_time, end_time, utterance, speaker_id]
    sentence : string
        A sentence as compiled from the p2fa transcription of a podcast.

    Returns
    -------
    idx : int
        The index to the row in transcript most likely to contain sentence
    """
    edit_dist = []
    clean_sent = clean_sentence(sentence)
    for idx, line in enumerate(transcript):
        trans_line = line[2]
        edit_dist.append(Levenshtein.distance(clean_sent, trans_line))
    idx = np.argmin(edit_dist)
    return idx
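# Hedged usage example for find_line_in_transcript, assuming numpy,
# python-Levenshtein, and the string-normalizing clean_sentence are in scope.
# The toy transcript and utterances are illustrative only.
import numpy as np
import Levenshtein

transcript = np.array([
    [0.0, 2.1, 'welcome back to the show', 'A'],
    [2.1, 5.3, 'today we talk about whales', 'B'],
], dtype=object)

# Expect index 1: the query is a near-verbatim match for that utterance.
idx = find_line_in_transcript(transcript, 'Today we talked about whales.')
print(transcript[idx, 2])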
def annotate(self, image):
    transformed = self.transform(image).unsqueeze(0)
    features = self.encoder(transformed).unsqueeze(1)
    # Pass the embedded image features through the model to get a predicted caption.
    output = self.decoder.sample_beam_search(features)
    print('example output:', output)
    sentence = clean_sentence(output[0], self.vocab)
    print('example sentence:', sentence)
    return sentence
def reconstruction(self, input_sentence, max_len):
    # Clean first, then convert the cleaned text to ids. (The original
    # passed the raw input_sentence to convert_sentence2id, silently
    # discarding the result of clean_sentence.)
    sentence = clean_sentence(input_sentence)
    sentence = convert_sentence2id(sentence, self.token2id, max_len)

    # Build the target by copying the first l token ids of the input.
    target = np.zeros((1, max_len))
    l = len(word_tokenize(input_sentence))
    target[0, :l] = sentence[0, :l]
    target = torch.LongTensor(target)
    # target = convert_sentence2id(input_sentence[:-1], self.token2id, max_len)
    # print(" ".join([self.id2token[id.item()] for id in target[0]]))

    print(sentence)
    print(target)
    # Pass the true token count as the sequence length; the original used
    # len(sentence), which is 1 for a (1, max_len) tensor.
    output = self.model(inputs=sentence,
                        targets=target,
                        lengths=torch.LongTensor([l]))
    prediction = output["predictions"][0]
    return " ".join(self.id2token[id.item()] for id in prediction)
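# convert_sentence2id is not shown in the source. A minimal sketch of its
# ASSUMED behavior: tokenize, map tokens to ids with id 2 as the
# out-of-vocabulary fallback (mirroring the convention visible in the
# sentiment loader below), and zero-pad to a (1, max_len) LongTensor.
import numpy as np
import torch
from nltk.tokenize import word_tokenize

def convert_sentence2id(sentence, token2id, max_len):
    ids = [token2id.get(tok.lower(), 2) for tok in word_tokenize(sentence)]
    out = np.zeros((1, max_len), dtype=np.int64)
    n = min(len(ids), max_len)
    out[0, :n] = ids[:n]
    return torch.LongTensor(out)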
def get_data_from_file(self, datafile):
    data = pd.read_csv(datafile)
    x_data = []
    t_data = []
    x_length = []
    y_data = list(data["Sentiment"])
    for sentence in list(data["Review"]):
        sentence = clean_sentence(sentence)
        words_list = word_tokenize(sentence)
        words_list.append("eos")
        x_length.append(len(words_list))
        x = [
            self.token2id[word.lower()] if word.lower() in self.token2id else 2
            for word in words_list
        ]
        x_data.append(x)
        t_data.append(x[:-1])
    return x_data, y_data, x_length, t_data
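# get_data_from_file returns ragged id lists; before feeding a batch to a
# model they would typically be padded into a rectangular tensor. A short
# sketch using torch.nn.utils.rnn.pad_sequence (not part of the source;
# pad_batch and pad_id=0 are assumptions):
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_batch(x_data, pad_id=0):
    # Pad a list of variable-length id lists into a (batch, max_len) tensor.
    return pad_sequence([torch.LongTensor(x) for x in x_data],
                        batch_first=True, padding_value=pad_id)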
def write_transcript_segments(transcript, seg_row_ids, ep_id):
    """
    Write transcript segments to separate json files for use with p2fa.

    Parameters
    ----------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode
        associated with trans_id. Each row corresponds to a line in the
        transcript, and the columns correspond to
        [start_time, end_time, utterance, speaker_id]
    seg_row_ids : list
        A list of row indices in transcript corresponding to the location
        at which we should start and end each transcript segment
    ep_id : int
        The audiosearch episode id associated with the transcript
    """
    for ii in range(len(seg_row_ids) - 1):
        start_row = seg_row_ids[ii]
        end_row = seg_row_ids[ii + 1]
        trans = transcript[start_row:end_row, :]
        slice_id = '_seg{}'.format(ii + 1)

        tscript = []
        file_name = str(ep_id) + slice_id + '.json'
        file_name = '../seg_json/' + file_name

        for line in trans:
            speaker = str(line[3]).upper()
            utter = clean_sentence(line[2])
            catalog = {'speaker': speaker, 'line': utter}
            tscript.append(catalog)

        # json.dump writes text, so open in 'w' rather than 'wb'; the
        # with-block closes the file, so an explicit close() is redundant.
        with open(file_name, 'w') as f:
            json.dump(tscript, f)
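# For reference, each segment file written above holds a JSON list of
# speaker/line records. A file such as ../seg_json/42_seg1.json (episode id
# and utterances illustrative only) would look roughly like:
# [
#     {"speaker": "HOST", "line": "welcome back to the show"},
#     {"speaker": "GUEST", "line": "thanks for having me"}
# ]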
with open(config.target_word_count_path, 'rb') as f:
    target_counter = pickle.load(f)
# (the with-block closes the file; the original's extra f.close() was redundant)

REMOVE = [
    word for word in REMOVE if target_counter[word] < config.TARGET_MIN_COUNT
] + [' ']

# read training set
train = pd.read_csv(config.new_train_path)
train['Question'] = train['Question'].apply(ast.literal_eval)
train['Dialogue'] = train['Dialogue'].apply(ast.literal_eval)
train['Report'] = train['Report'].apply(ast.literal_eval)
train['context'] = train.apply(lambda x: x[0] + x[1], axis=1)
train['context'] = train['context'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=config.REMOVE, add=False, pad=False))
train['Report'] = train['Report'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=REMOVE, add=False, pad=False))
print('Read training data')

# read test set
test = pd.read_csv(config.new_test_path)
test['Question'] = test['Question'].apply(ast.literal_eval)
test['Dialogue'] = test['Dialogue'].apply(ast.literal_eval)
test['context'] = test.apply(lambda x: x[0] + x[1], axis=1)
test['context'] = test['context'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=config.REMOVE, add=False, pad=False))
print('Read test data')

# get max len of context and clean
input_maxlen = utils.get_maxlen(
            word for word in string.split(' ') if word not in REMOVE
        ] for string in line.split('|')]
        if len(sentence) > 0:
            sentences += sentence
    f.close()
    print('sentences all filtered')  # just to tell you this part is over

    word_list = [
        word for word in counter.keys() if counter[word] >= config.MIN_COUNT
    ] + [config.UNKNOWN, config.START, config.END, config.PAD]
    word_list = dict(zip(word_list, [1 for _ in word_list]))

    sentences = [
        utils.clean_sentence(sentence,
                             word_list,
                             max_len=int(1e4),
                             remove=REMOVE,
                             add=False,
                             pad=False) for sentence in sentences
    ]
    print(len(sentences))  # just to tell you this part is over

    model = Word2Vec(
        sentences,
        size=config.HIDDEN_SIZE,
        min_count=config.MIN_COUNT,  # this min_count is also used to select words in utils.clean_sentence
        workers=config.NUM_WORKER,
        window=config.WINDOW,
        iter=config.ITER)
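    # Once trained, the embeddings can be inspected or persisted with the
    # standard gensim API; a hedged follow-up (file name and query word are
    # illustrative, and 'good' must actually be in the learned vocabulary):
    model.save('word2vec.model')
    print(model.wv.most_similar('good', topn=3))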
def split_problem(problem):
    problem = Problem.remove_diagrams(problem)
    words, latex = Problem.split_latex(problem)
    words = utils.clean_sentence(words)
    basic_latex, commands = Problem.split_commands(latex)
    return words + commands, basic_latex
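# Hypothetical usage of split_problem, assuming it is exposed as a
# staticmethod on Problem and that split_latex/split_commands behave as
# their names suggest (the example problem string is illustrative):
tokens, basic_latex = Problem.split_problem(
    r'Find $x$ such that \frac{x}{2} = 3.')
print(tokens)       # cleaned words plus the extracted LaTeX commands
print(basic_latex)  # the remaining LaTeX with commands stripped out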