Example #1
    def setup(self):
        tokenizer = train.tokenize(TRAIN_PATH, VALID_PATH, delimiter='\t')
        batch_size = 3
        max_length = 20
        vocab_size = 864
        self.dg = train.DataGenerator(TRAIN_PATH, batch_size, max_length,
                                      vocab_size, tokenizer)
Example #2
def main():
    counter, ngram_size, vocab_size = load_model()

    while True:
        line = input('Enter a sentence (EXIT to break):')

        if line == 'EXIT':
            break

        words = tokenize(line, ngram_size)

        probability = 1.
        for offset in range(0, len(words) - ngram_size + 1):
            history = tuple(words[offset:offset + ngram_size - 1])
            joint = tuple(words[offset:offset + ngram_size])

            history_count = counter[history]
            joint_count = counter[joint]

            logging.info(str(history) + '\t count = %d' % history_count)
            logging.info(str(joint) + '\t count = %d' % joint_count)

            # probability with additive smoothing
            probability *= (joint_count + 1) / (history_count + vocab_size)

        print()
        print('------------------------------------------')
        print('Probability: %.40f' % probability)
        print()
        print()
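The loop above applies additive (Laplace) smoothing: each joint count gets +1 and each history count gets +vocab_size, so an unseen n-gram still receives a small non-zero probability. A minimal worked illustration of a single factor, with made-up counts (all numbers below are assumptions, not taken from the original):

# one additive-smoothing factor with hypothetical counts
joint_count = 0       # assumed: this n-gram was never observed
history_count = 12    # assumed count of its (n-1)-gram history
vocab_size = 864      # vocabulary size as in Example #1
factor = (joint_count + 1) / (history_count + vocab_size)
print('%.6f' % factor)  # prints 0.001142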
Example #3
def line_reader(line, ngram_size, vocabulary):
    # tokenize
    words = tokenize(line, ngram_size)

    # update vocabulary
    vocabulary |= set(words)

    # skip lines that are too short to produce proper ngrams
    if len(words) < ngram_size - 1:
        return

    # yield ngrams of size ngram_size and (ngram_size - 1)
    for offset in range(0, len(words) - ngram_size + 1):
        yield tuple(words[offset:offset + ngram_size])
        yield tuple(words[offset:offset + ngram_size - 1])
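For every window, this generator yields both the full n-gram and its (ngram_size - 1)-gram history, which are exactly the two counts Example #2 looks up. A minimal sketch of how it might be consumed to build those counts (the corpus path, the use of collections.Counter, and the final vocab_size computation are assumptions, not part of the original source):

import collections

counter = collections.Counter()
vocabulary = set()
ngram_size = 3  # assumed n-gram order

with open('corpus.txt', encoding='utf-8') as fp:  # hypothetical corpus file
    for line in fp:
        # line_reader updates `vocabulary` in place and yields the tuples to count
        counter.update(line_reader(line, ngram_size, vocabulary))

vocab_size = len(vocabulary)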
Example #4
    def categorize(self, test_list_path, doc_path, outfile):
        '''
        Helper function to categorize one document and write the
        results to the outfile.
        '''
        # Generate list of tokens for the given document
        token_list = tokenize(os.path.join(test_list_path, doc_path))

        # Compute similarity metric for each of the categories
        similarities = {}
        for category in self.ii.category_count.keys():
            similarities[category] = self.similarity(token_list, category)

        # Pick the category with highest similarity and write results to
        # output file
        label = max(similarities, key=similarities.get)
        print(doc_path + ' ' + label, file=outfile)
Example #5
def test_tokenize():
    tokenizer = train.tokenize(TRAIN_PATH, VALID_PATH, delimiter='\t')
    test_data = [['容疑者', 'が']]
    assert tokenizer.texts_to_sequences(test_data) == [[265, 3]]
Example #6
from train import complete_prompt, tokenize
from flask import Flask, render_template
from tensorflow.keras.models import load_model

app = Flask(__name__)

model = load_model('./model')

tokenize_result = tokenize()


@app.route('/<name>')
def show_name_poem(name):
    poem = complete_prompt(model, tokenize_result, "Dear " + name)
    return render_template('output.html', name=name, poem=poem)


if __name__ == '__main__':
    # Run threaded so the server can handle multiple concurrent requests
    app.run(threaded=True, port=5000)
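With the Flask app above running, the route can be exercised by putting any name in the URL path. A minimal sketch using the standard library (the host and the name 'Alice' are assumptions; the port matches app.run above):

import urllib.request

# request the rendered poem page for a hypothetical name
with urllib.request.urlopen('http://localhost:5000/Alice') as resp:
    print(resp.read().decode('utf-8'))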
Example #7
def main():
    # verify arg
    if len(sys.argv) != 2:
        print("Mising argument 1: absolute path to text to label")
        return

    # get arg as name
    filePath = sys.argv[1]

    # try to load file
    if not os.path.isfile(filePath):
        print("File at {} does not exist".format(filePath))
        return

    # check extension
    if filePath[-4:] != '.txt':
        print("File exists, but must be .txt format")
        return

    # load file to label
    contents = ''
    with open(filePath, 'r') as fp:
        for line in fp.readlines():
            contents += line

    # tokenize input text
    tokens = tokenize(contents)

    # enumerate classes
    classFiles = next(os.walk('../result'))[2]
    if len(classFiles) == 0:
        print("No classes to label from")
        return
    classFiles.remove('.dummy')

    # get class names from filenames
    classNames = []
    for f in classFiles:
        classNames.append(f[:-4])

    # load classes
    classWords = []
    for f in classFiles:
        classContents = ''
        with open('../result/' + f, 'r') as fp:
            for line in fp.readlines():
                classContents += line
        thisClassWords = classContents.split('\n')
        classWords.append(thisClassWords)

    # enumerate evenly spaced highlight colors along the 1-D RGB range (256^3 values)
    colors = []
    floorColor = (7 * 16 + 7)**3
    # colorContrastMultiplier is assumed to be a module-level constant defined elsewhere
    colorDist = math.floor(
        (256**3 - floorColor) / (len(classNames) * colorContrastMultiplier))
    for i in range(0, len(classNames)):
        color = floorColor + i * colorDist
        hexString = hex(color)
        colors.append(hexString[2:])

    # try to apply each class
    for token in tokens:
        if token[1]:
            for i in range(0, len(classNames)):
                # iterate words to look for
                for w in classWords[i]:
                    if token[0] == w:
                        # wrap the matching token in a colored highlight span
                        token[0] = (
                            '<span style="background-color: #{}">{}</span>'
                            .format(colors[i], token[0]))

    # rebuild text with stylized tokens
    rebuilt = ''
    for token in tokens:
        rebuilt += token[0]

    # write rebuilt text to file
    with open(filePath[:-4] + '.md', 'w') as fp:
        # build legend
        fp.write('Legend:<br />')
        for i in range(0, len(classNames)):
            fp.write(
                '<span style="background-color: #{}">{}</span><br />'.format(
                    colors[i], classNames[i]))
        fp.write('<br />')

        # fix tabs and newlines
        rebuilt = rebuilt.replace('\n', '<br />')
        rebuilt = rebuilt.replace('\t', '&nbsp;' * 4)

        # write contents
        fp.write(rebuilt)
Example #8
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
# Sets the module in evaluation mode.
model.eval()

bot_name = "Bot"
print("Can I help you? (type 'quit' to exit)")
while True:
    # sentence = "do you use credit cards?"
    sentence = input("You: ")
    if sentence == "quit":
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]: