    def get_ngrams(self, n):
        # Generate n-grams from the scene's dialogues for searching purposes.
        full_text = ' '.join(dia[1] for dia in self.dialogues)
        return get_ngrams(tokenize(normalize(full_text)), n)

    def find_scene_from_transcript(self, transcript):
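        # Score each scene by its 1- to 5-gram overlap with the transcript
        # and return the index of the best-matching scene, or -1 if no
        # n-gram matched at all.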
        transcript_text = ''
        for alternatives in transcript.values():
            for alternative in alternatives:
                transcript_text += ' ' + alternative

        tokens = tokenize(normalize(transcript_text))
        scores = []
        for scene in self.scenes:
            curr_score = 0

            for n in range(1, 6):
                scene_ngrams = set(scene.get_ngrams(n))
                transcript_ngrams = get_ngrams(tokens, n)
                for ngram in transcript_ngrams:
                    if ngram in scene_ngrams:
                        curr_score += 1

            scores.append(curr_score)

        match = np.argmax(scores)
        if scores[match] == 0:
            return -1
        else:
            return int(match)

    def __init__(self, text='', lines=None):
        # If text is not empty, use it to build the scene.
        if text:
            text = text.split('\n')

            # Header is the first line.
            self.header = text[0]

            # Body is the rest of the text.
            self.body = '\n'.join(text[1:])
            self.tokens = tokenize(normalize(self.body))

        # Otherwise, if lines is provided, build the scene from it instead.
        # This path is preferable because it preserves element types.
        elif lines:
            # Identify the header of the scene.
            if lines[0][0] == 'Scene Heading':
                self.header = lines[0][1]
            else:
                self.header = 'No header'

            # Identify the main pieces of a scene.
            self.body = ''
            self.characters = []
            self.actions = []
            self.dialogues = []
            last_character = ''
            for element_type, content in lines[1:]:
                self.body += content + '\n'

                # If the element is a character, add it to the list of
                # characters if it's not already there.
                if element_type == 'Character':
                    character = content

                    # Strip parentheticals such as (V.O.) and (CONT'D).
                    if character.find(' (') != -1:
                        character = character[:character.find(' (')]

                    last_character = character
                    if character not in self.characters:
                        self.characters.append(character)

                # If the element is an action, add it to the list of actions.
                elif element_type == 'Action':
                    self.actions.append(content)

                # If the element is dialogue, add it to the list of dialogues
                # as a tuple of the form (character name, dialogue line).
                elif element_type == 'Dialogue':
                    self.dialogues.append((last_character, content))
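
# The module-level helpers used by the methods above aren't defined on this
# page; a minimal sketch, assuming normalize lowercases text and strips
# punctuation, tokenize splits on whitespace, and get_ngrams returns the list
# of n-token tuples. numpy is also assumed to be imported as np for
# find_scene_from_transcript.
import numpy as np

def normalize(text):
    # Lowercase and keep only alphanumeric characters and whitespace.
    return ''.join(c for c in text.lower() if c.isalnum() or c.isspace())

def tokenize(text):
    # Split on any whitespace.
    return text.split()

def get_ngrams(tokens, n):
    # All contiguous n-token windows, as hashable tuples.
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]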
Example #4
def main():
    input_string, output_filepath = validate_arguments()

    print2("1) Checking syntax... ")
    if not string_processing.validate_syntax(input_string):
        sys.exit('\n ERROR: Invalid input string syntax "%s". Input must be of the form (CV)+ where C is a consonant taken from {m,l,s,p,k} and V a vowel taken from {a,A}.' % input_string)
    print("done, syntax ok.")

    diphones_path = "diphones"
    print2("2) Loading diphone wav files from folder %s... " % diphones_path)
    diphones_wavs_db = diphone_db.read_diphones(diphones_path)
    print("done, %d diphones loaded." % len(diphones_wavs_db))

    print2("3) Parsing diphones from: \"%s\"... " % input_string)
    diphones = string_processing.tokenize(input_string)
    print("done, diphones to synthesize: %s." % str(diphones))

    print2("4) Synthesizing... ")
    output_wav = ttslib.synthesize(diphones_wavs_db, diphones)
    print("done.")

    print2("5) Saving file %s... " % output_filepath)
    wavlib.write_wav(output_wav, output_filepath)
    print("done.")
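
# The print2 helper isn't shown on this page; a minimal sketch, assuming it
# only prints without a trailing newline so that each "done." message lands
# on the same line as its step announcement:
import sys

def print2(message):
    sys.stdout.write(message)
    sys.stdout.flush()

# Standard entry point so the script can be run directly.
if __name__ == '__main__':
    main()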
Example #5
# Third-party libraries
import pandas as pd
import torch

# Local libraries (du is assumed to be the project's data-utility module;
# lstm defines the LSTMSentiment model used below).
import du
import lstm
import string_processing as sp

# Values necessary to load the network.
vocab = pd.read_csv("vocabulary.txt",
                    names=['ind', 'word'],
                    encoding='iso-8859-1')
vocab = pd.Series(vocab['ind'].values, index=vocab['word']).to_dict()
vocab_size = du.get_vocab_size("vocabulary.txt")

# Load the network.
network = lstm.LSTMSentiment(vocab_size)
network.load_state_dict(torch.load('model'))
network.eval()

# Get user input.
user_sentence = input("Enter a review: ")

# Process user input and convert it to tokens.
user_sentence = sp.normalize(user_sentence)
user_sentence = sp.tokenize(user_sentence)
user_sentence = sp.get_numbers(user_sentence, vocab)
user_sentence = sp.padding(user_sentence, 30)

# Predict and output results.
output, h = network(torch.LongTensor([user_sentence]), network.init_hidden(1))
pred = torch.round(output.squeeze())
if pred.item() == 1:
    print("Fresh")
else:
    print("Rotten")
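
# string_processing isn't shown on this page; a minimal sketch of the two
# helpers used above, assuming get_numbers maps each token to its vocabulary
# index (0 for out-of-vocabulary words) and padding left-pads with zeros or
# truncates to a fixed length:
def get_numbers(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]

def padding(numbers, length):
    numbers = numbers[:length]
    return [0] * (length - len(numbers)) + numbers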
# Third-party libraries
import pandas as pd

# Local libraries
import string_processing

# Constants
MAX_LEN = 30
MIN_LEN = 2

# Get reviews from csv file.
df = pd.read_csv("rotten_tomatoes_reviews.csv",
                 encoding='iso-8859-1')  # s/o nicolas-gervais from r/datasets

# Generate tokens from the text reviews.
df['Review_Clean'] = df['Review'].apply(string_processing.normalize)
df['Tokens'] = df['Review_Clean'].apply(string_processing.tokenize)
df.drop(['Review', 'Review_Clean'], axis=1, inplace=True)

# Drop reviews whose token count is at or below the minimum length.
df = df[df['Tokens'].apply(lambda x: len(x) > MIN_LEN)]
df.reset_index(drop=True, inplace=True)

# Generate a vocabulary.
vocab = string_processing.build_vocab(df['Tokens'].tolist(), 'vocabulary.txt')

# Replace tokens with their respective numbers in the vocabulary.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.get_numbers(x, vocab))

# Add zero-padding.
df['Tokens'] = df['Tokens'].apply(
    lambda x: string_processing.padding(x, MAX_LEN))
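
# build_vocab isn't shown on this page; a minimal sketch, assuming it writes
# one "index,word" line per token type (the format the prediction script
# reads back with pd.read_csv) and returns the word-to-index mapping:
def build_vocab(token_lists, path):
    vocab = {}
    for tokens in token_lists:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab) + 1  # reserve 0 for padding/unknown
    with open(path, 'w', encoding='iso-8859-1') as f:
        for word, ind in vocab.items():
            f.write('%d,%s\n' % (ind, word))
    return vocab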