# Methods excerpted from the script-matching classes; the enclosing class
# definitions and imports are not part of this listing. `get_ngrams`,
# `tokenize`, and `normalize` are assumed to come from the project's
# string_processing module (the examples below use it), and `np` is numpy.
    def get_ngrams(self, n):
        # Generate n-grams from the scene's dialogue lines for searching.
        # Each entry in self.dialogues is a (character, line) tuple.
        full_text = ' '.join(dia[1] for dia in self.dialogues)
        return get_ngrams(tokenize(normalize(full_text)), n)
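    # For n=2, for instance, this presumably yields consecutive-token pairs
    # such as [('meet', 'me'), ('me', 'at'), ...]; get_ngrams itself is not
    # shown in this listing, so its exact return shape is an assumption.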

    def find_scene_from_transcript(self, transcript):
        # `transcript` maps keys (e.g. utterance ids) to lists of alternative
        # readings; note this method uses self.scenes, so it belongs to the
        # screenplay-level object rather than to an individual scene.
        # Flatten every alternative into one searchable string.
        transcript_text = ' '.join(
            alternative
            for alternatives in transcript.values()
            for alternative in alternatives)

        tokens = tokenize(normalize(transcript_text))
        scores = []
        for scene in self.scenes:
            curr_score = 0

            # Compare 1-grams through 5-grams; every n-gram the transcript
            # shares with the scene's dialogue adds one point. A set makes
            # membership tests O(1), assuming the n-grams are hashable tuples.
            for num_ngrams in range(1, 6):
                scene_ngrams = set(scene.get_ngrams(num_ngrams))
                for ngram in get_ngrams(tokens, num_ngrams):
                    if ngram in scene_ngrams:
                        curr_score += 1

            scores.append(curr_score)

        # Return -1 as a sentinel when no scene shares a single n-gram.
        match = int(np.argmax(scores))
        if scores[match] == 0:
            return -1
        return match
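    # Note on the scoring design: a phrase that matches verbatim also matches
    # through all of its shorter sub-n-grams, so one shared 5-gram contributes
    # 5 + 4 + 3 + 2 + 1 = 15 points, while scattered single-word overlaps
    # contribute 1 point each. Long verbatim matches therefore dominate.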

    def __init__(self, text='', lines=None):
        # `lines` defaults to None rather than to a mutable [] (a classic
        # Python pitfall).
        # If text is not empty, use it to build the scene.
        if text:
            text_lines = text.split('\n')

            # Header is the first line.
            self.header = text_lines[0]

            # Body is the rest of the text.
            self.body = '\n'.join(text_lines[1:])
            self.tokens = tokenize(normalize(self.body))

        # Otherwise build from pre-parsed (element_type, content) pairs. This
        # path is preferable because it preserves the screenplay's structure.
        elif lines:
            # Identify the header of the scene.
            if lines[0][0] == 'Scene Heading':
                self.header = lines[0][1]
            else:
                self.header = 'No header'

            # Identify the main pieces of a scene.
            self.body = ''
            self.characters = []
            self.actions = []
            self.dialogues = []
            last_character = ''
            for element_type, content in lines[1:]:
                self.body += content + '\n'

                # If the element is a character, record it once and remember
                # it so following dialogue lines can be attributed to it.
                if element_type == 'Character':
                    # Get rid of parentheticals such as (V.O.) or (CONT'D).
                    character = content.split(' (')[0]

                    last_character = character
                    if character not in self.characters:
                        self.characters.append(character)

                # If the element is an action, add it to the list of actions.
                elif element_type == 'Action':
                    self.actions.append(content)

                # If the element is dialogue, add it to the list of dialogues
                # as a (character name, dialogue) tuple.
                elif element_type == 'Dialogue':
                    self.dialogues.append((last_character, content))
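
    # Hypothetical usage sketch (the class name `Scene` and the screenplay
    # object are assumptions; only the methods above appear in this listing):
    #
    #   parsed = [('Scene Heading', 'INT. DINER - NIGHT'),
    #             ('Character', "MIA (CONT'D)"),
    #             ('Dialogue', 'I thought you said midnight.')]
    #   scene = Scene(lines=parsed)
    #   scene.characters   # ['MIA']  -- the (CONT'D) suffix is stripped
    #   scene.dialogues    # [('MIA', 'I thought you said midnight.')]
    #
    #   index = screenplay.find_scene_from_transcript(
    #       {'t0': ['i thought you said midnight']})
    #   # -> index of the best-matching scene, or -1 if nothing overlaps.
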
import pandas as pd
import torch

# Local libraries. The original listing omits several imports: `lstm` is the
# project module that defines LSTMSentiment, and `data_utils` is an assumed
# name for the module aliased as `du` below.
import data_utils as du
import lstm
import string_processing as sp

# Values needed to rebuild the network: the vocabulary and its size.
vocab = pd.read_csv("vocabulary.txt",
                    names=['ind', 'word'],
                    encoding='iso-8859-1')
vocab = pd.Series(vocab['ind'].values, index=vocab['word']).to_dict()
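# `vocab` now maps each word to its integer id, e.g. {'the': 1, 'movie': 2}
# (the actual ids are whatever vocabulary.txt assigned; these are illustrative).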
vocab_size = du.get_vocab_size("vocabulary.txt")

# Load the network.
network = lstm.LSTMSentiment(vocab_size)
network.load_state_dict(torch.load('model'))
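# eval() switches the network to inference mode (e.g. disables dropout).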
network.eval()

# Get user input.
user_sentence = input("Enter a review: ")

# Process user input and convert it to tokens.
user_sentence = sp.normalize(user_sentence)
user_sentence = sp.tokenize(user_sentence)
user_sentence = sp.get_numbers(user_sentence, vocab)
user_sentence = sp.padding(user_sentence, 30)
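# user_sentence is now a fixed-length sequence of 30 vocabulary ids; how
# sp.padding pads (side and pad value) is not shown in this listing.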

# Predict and output the verdict: the rounded network output is 1 for a
# positive (Fresh) review and 0 for a negative (Rotten) one.
output, h = network(torch.LongTensor([user_sentence]), network.init_hidden(1))
pred = torch.round(output.squeeze())
if pred.item() == 1:
    print("Fresh")
else:
    print("Rotten")
import pandas as pd

# Local libraries
import string_processing

# Constants
MAX_LEN = 30
MIN_LEN = 2

# Get reviews from csv file.
df = pd.read_csv("rotten_tomatoes_reviews.csv",
                 encoding='iso-8859-1')  # s/o nicolas-gervais from r/datasets

# Generate tokens from the text reviews.
df['Review_Clean'] = df['Review'].apply(string_processing.normalize)
df['Tokens'] = df['Review_Clean'].apply(string_processing.tokenize)
df.drop(['Review', 'Review_Clean'], axis=1, inplace=True)

# Keep only reviews with more than MIN_LEN tokens.
df = df[df['Tokens'].apply(lambda x: len(x) > MIN_LEN)]
df.reset_index(drop=True, inplace=True)

# Generate a vocabulary.
vocab = string_processing.build_vocab(df['Tokens'].tolist(), 'vocabulary.txt')
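# build_vocab receives the filename 'vocabulary.txt', so it presumably writes
# the mapping there; the inference example above reads that same file back.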

# Replace tokens with their respective numbers in the vocabulary.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.get_numbers(x, vocab))
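
# The listing ends here; a hedged sketch of the presumable next step: pad each
# review to MAX_LEN (the constant is otherwise unused above) with the same
# padding helper the inference example uses, then persist the result. The
# output filename is hypothetical.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.padding(x, MAX_LEN))
df.to_pickle('processed_reviews.pkl')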