Example #1
def predict():
    try:
        # f = request.files['file']  # upload handling stubbed out; a local test image is used instead
        image = Image.open('./img.jpg').convert("RGB")
        image = image_loader(image)

        encoder, decoder, vocab = initialize()
        features = encoder(image).unsqueeze(1)
        output = decoder.sample(features)
        sentence = clean_sentence(output, vocab)
        res = {}
        res['pred_1'] = sentence

        outputs = decoder.sample_beam_search(features)
        # keep up to three beam-search candidates as pred_2 .. pred_4
        for count, output in enumerate(outputs[:3], start=2):
            sentence = clean_sentence(output, vocab)
            res['pred_{}'.format(count)] = sentence
        return app.response_class(response=json.dumps(res), status=200, mimetype='application/json')
    except Exception as error:
        err = str(error)
        print(err)
        return app.response_class(response=json.dumps(err), status=500, mimetype='application/json')
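The captioning examples on this page call a clean_sentence(output, vocab) helper whose body is not shown. A minimal sketch, assuming output is a list of predicted word ids and vocab exposes an idx2word mapping (both the attribute name and the special-token names are assumptions, not the original implementation):

def clean_sentence(output, vocab):
    # map predicted word ids back to words, dropping special tokens
    # (idx2word and the token names are assumed conventions)
    words = [vocab.idx2word[idx] for idx in output]
    words = [w for w in words if w not in ('<start>', '<end>', '<pad>')]
    return ' '.join(words)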
Example #3
def find_line_in_transcript(transcript, sentence):
    """
    Uses the Levenshtein distance between a sentence and the lines in a
    transcript to identify the line in transcript most likely to correspond to
    the query sentence

    Parameters
    ----------
    transcript : np.array of shape (n, 4)
        A podcast transcript as produced via compile_episode_transcript. Each
        row corresponds to a line in the transcript, and the columns
        correspond to [start_time, end_time, utterance, speaker_id]

    sentence : string
        A sentence as compiled from the p2fa transcription of a podcast.

    Returns
    -------
    idx : int
        The index of the row in transcript most likely to contain sentence
    """
    edit_dist = []
    clean_sent = clean_sentence(sentence)

    for idx, line in enumerate(transcript):
        trans_line = line[2]
        edit_dist.append(Levenshtein.distance(clean_sent, trans_line))
    idx = np.argmin(edit_dist)
    return idx
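A hypothetical invocation (the transcript rows are illustrative; this assumes the module's own one-argument clean_sentence and the python-Levenshtein package are importable):

import numpy as np

transcript = np.array([
    [0.0, 4.2, 'welcome to the show', 'spk_1'],
    [4.2, 9.8, 'today we talk about podcasts', 'spk_2'],
], dtype=object)

idx = find_line_in_transcript(transcript, 'Today we talked about podcasts!')
print(transcript[idx, 2])  # most likely prints 'today we talk about podcasts'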
Example #4
    def annotate(self, image):
        transformed = self.transform(image).unsqueeze(0)
        features = self.encoder(transformed).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        output = self.decoder.sample_beam_search(features)
        print('example output:', output)
        sentence = clean_sentence(output[0], self.vocab)
        print('example sentence:', sentence)
        return sentence
    def reconstruction(self, input_sentence, max_len):
        # clean the raw sentence, then encode it as a (1, max_len) id array
        sentence = clean_sentence(input_sentence)
        sentence = convert_sentence2id(sentence, self.token2id, max_len)

        # reconstruction target: the first l ids of the encoded sentence
        target = np.zeros((1, max_len))
        l = len(word_tokenize(input_sentence))
        target[0, :l] = sentence[0, :l]
        target = torch.LongTensor(target)

        output = self.model(inputs=sentence, targets=target,
                            lengths=torch.LongTensor([len(sentence)]))
        prediction = output["predictions"][0]
        return " ".join(self.id2token[id.item()] for id in prediction)
    def get_data_from_file(self, datafile):
        data = pd.read_csv(datafile)
        x_data = []
        t_data = []
        x_length = []
        y_data = list(data["Sentiment"])

        for sentence in list(data["Review"]):
            sentence = clean_sentence(sentence)
            words_list = word_tokenize(sentence)
            words_list.append("eos")
            x_length.append(len(words_list))
            x = [
                self.token2id[word.lower()]
                if word.lower() in self.token2id else 2  # 2 = unknown-token id
                for word in words_list
            ]
            x_data.append(x)
            t_data.append(x[:-1])  # same sequence without the trailing "eos"
        return x_data, y_data, x_length, t_data
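Both methods above rely on a convert_sentence2id helper that is not shown. A minimal sketch under the same conventions as get_data_from_file (id 2 for unknown words, as in the else-branch above; the zero padding is an assumption):

import numpy as np
from nltk.tokenize import word_tokenize

def convert_sentence2id(sentence, token2id, max_len):
    # encode a sentence as a (1, max_len) id array, using 2 for unknown words
    ids = np.zeros((1, max_len), dtype=np.int64)
    for i, word in enumerate(word_tokenize(sentence)[:max_len]):
        ids[0, i] = token2id.get(word.lower(), 2)
    return ids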
Example #8
def write_transcript_segments(transcript, seg_row_ids, ep_id):
    """
    Write transcript segments to separate json files for use with p2fa.

    Parameters
    ----------
    transcript : np.array of shape (n, 4)
        An array containing the transcript for the podcast episode associated
        with ep_id. Each row corresponds to a line in the transcript, and
        the columns correspond to [start_time, end_time, utterance, speaker_id]

    seg_row_ids : list
        A list of row indices in transcript corresponding to the location at
        which we should start and end each transcript segment

    ep_id : int
        The audiosearch episode id associated with the transcript
    """
    for ii in range(len(seg_row_ids) - 1):
        start_row = seg_row_ids[ii]
        end_row = seg_row_ids[ii + 1]
        trans = transcript[start_row:end_row, :]
        slice_id = '_seg{}'.format(ii + 1)

        tscript = []
        file_name = str(ep_id) + slice_id + '.json'
        file_name = '../seg_json/' + file_name

        for line in trans:
            speaker = str(line[3]).upper()
            utter = clean_sentence(line[2])
            catalog = {'speaker': speaker, 'line': utter}
            tscript.append(catalog)

        # json.dump writes text, so open in text mode; the with-block
        # already closes the file
        with open(file_name, 'w') as f:
            json.dump(tscript, f)
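A hypothetical invocation (the rows, episode id, and the existence of the ../seg_json/ directory are illustrative):

import numpy as np

transcript = np.array([
    [0.0, 3.5, 'hello and welcome', 'alice'],
    [3.5, 8.0, 'thanks for having me', 'bob'],
    [8.0, 12.0, 'let us get started', 'alice'],
], dtype=object)

# two segments: rows [0, 2) and [2, 3)
write_transcript_segments(transcript, seg_row_ids=[0, 2, 3], ep_id=42)
# -> writes ../seg_json/42_seg1.json and ../seg_json/42_seg2.json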
with open(config.target_word_count_path, 'rb') as f:
    target_counter = pickle.load(f)

REMOVE = [
    word for word in REMOVE if target_counter[word] < config.TARGET_MIN_COUNT
] + [' ']

# read training set
train = pd.read_csv(config.new_train_path)
train['Question'] = train['Question'].apply(ast.literal_eval)
train['Dialogue'] = train['Dialogue'].apply(ast.literal_eval)
train['Report'] = train['Report'].apply(ast.literal_eval)
train['context'] = train.apply(lambda x: x['Question'] + x['Dialogue'], axis=1)
train['context'] = train['context'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=config.REMOVE, add=False, pad=False))
train['Report'] = train['Report'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=REMOVE, add=False, pad=False))
print('Read training data')

# read test set
test = pd.read_csv(config.new_test_path)
test['Question'] = test['Question'].apply(ast.literal_eval)
test['Dialogue'] = test['Dialogue'].apply(ast.literal_eval)
test['context'] = test.apply(lambda x: x['Question'] + x['Dialogue'], axis=1)
test['context'] = test['context'].apply(lambda x: utils.clean_sentence(
    x, word_list, remove=config.REMOVE, add=False, pad=False))
print('Read test data')
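This script uses a richer utils.clean_sentence(tokens, word_list, ...) variant than the earlier examples. Its real body is not shown; a rough sketch inferred from the call sites, reusing the script's config constants (the signature and defaults are assumptions):

def clean_sentence(tokens, word_list, max_len=None, remove=(), add=True, pad=True):
    # drop words in `remove`, map out-of-vocabulary words to UNKNOWN,
    # optionally wrap in START/END and pad to max_len
    out = [w if w in word_list else config.UNKNOWN
           for w in tokens if w not in remove]
    if add:
        out = [config.START] + out + [config.END]
    if max_len is not None:
        out = out[:max_len]
        if pad:
            out = out + [config.PAD] * (max_len - len(out))
    return out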

# get max len of context and clean
# (the original get_maxlen call was truncated in extraction; arguments assumed)
input_maxlen = utils.get_maxlen(list(train['context']) + list(test['context']))

# build the corpus of sentences for Word2Vec: each line of the raw corpus
# holds '|'-separated strings, and words in REMOVE are dropped
# (config.corpus_path is an assumed name; the original open() call was lost)
sentences = []
with open(config.corpus_path) as f:
    for line in f:
        sentence = [[
            word for word in string.split(' ') if word not in REMOVE
        ] for string in line.split('|')]
        if len(sentence) > 0:
            sentences += sentence

print('sentences all filtered')  # progress marker

word_list = [
    word for word in counter.keys() if counter[word] >= config.MIN_COUNT
] + [config.UNKNOWN, config.START, config.END, config.PAD]
word_list = dict.fromkeys(word_list, 1)  # dict for O(1) membership tests
sentences = [
    utils.clean_sentence(sentence,
                         word_list,
                         max_len=int(1e4),
                         remove=REMOVE,
                         add=False,
                         pad=False) for sentence in sentences
]

print(len(sentences))  # corpus size, printed as a progress marker

model = Word2Vec(
    sentences,
    size=config.HIDDEN_SIZE,
    # MIN_COUNT is also used to select words in utils.clean_sentence
    min_count=config.MIN_COUNT,
    workers=config.NUM_WORKER,
    window=config.WINDOW,
    iter=config.ITER)
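Once trained, the vectors can be read back through the standard gensim API, e.g. to feed an embedding matrix for the downstream model (the lookup word and save path below are illustrative):

if 'hello' in model.wv:
    print(model.wv['hello'].shape)  # -> (config.HIDDEN_SIZE,)
model.save('word2vec.model')        # hypothetical output path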
def split_problem(problem):
    problem = Problem.remove_diagrams(problem)
    words, latex = Problem.split_latex(problem)
    words = utils.clean_sentence(words)
    basic_latex, commands = Problem.split_commands(latex)
    return words + commands, basic_latex
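A hypothetical call, assuming Problem.remove_diagrams, Problem.split_latex, and Problem.split_commands exist as in the class this snippet comes from:

words, basic_latex = split_problem(r'Find $x$ if $\frac{x}{2} = 3$.')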