def main(args):
    argp = _argparser().parse_args(args[1:])
    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')

        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        # Keep a three-token window on either side of the focus position
        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]
        graph, nodes = prev_next_graph(toks)

        # Locate the focus node in the token graph
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False, 'focus dummy token not found'

        # Generate the requested feature sets for the focus node
        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            print >> stderr, 'WARNING: No features generated!'
            continue

        # Emit "label<TAB>name:value name:value ..." with features sorted by name
        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
                              for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
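# A hypothetical driver sketch (not part of the original code): main() expects
# five tab-separated fields per stdin line, of which only the 2nd (label),
# 3rd (pre-context) and 5th (post-context) are read, and it writes
# "label<TAB>feature:value ..." lines to stdout. Assuming the module is saved
# as featurise.py and exposes this main(), it could be driven as:
#
#   from sys import argv, exit
#   import featurise
#   # stdin line: "id\tPOS\tthe quick brown\t_\tfox jumps over"
#   exit(featurise.main(argv))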
def gen_word_list(term_dict):
    word_list = {}
    max_ip_count = 0

    # Accumulate per-word statistics over the tokenised term labels
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    #print len(word_list)

    # Derive magnitude (IDF-weighted median), colour (number of distinct terms)
    # and opacity (unique-IP count relative to the maximum) for each word
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])

    # Rank words by magnitude; the top-WC_THRES slice is computed here but the
    # full word list is what gets returned
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    sel_word_list = word_df[0:WC_THRES]
    return word_list
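# Illustrative call for gen_word_list() above; `idfs`, `min_alpha` and
# `WC_THRES` are module-level globals assumed to be initialised elsewhere,
# and the input values here are made up:
#
#   idfs = {"cancer": {"IDF": 6.2}}          # lower-cased word -> IDF record
#   min_alpha, WC_THRES = 0.3, 250           # hypothetical settings
#   terms = {
#       "t1": {"term_id": "T1", "label": "breast cancer",
#              "magnitude": 12.0, "unique_ips": 40},
#       "t2": {"term_id": "T2", "label": "lung cancer",
#              "magnitude": 8.0, "unique_ips": 25},
#   }
#   words = gen_word_list(terms)             # per-word stats, e.g. for a word cloud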
def gtb_token_boundary_gen(text):
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
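# Example use of the two helpers above (illustrative only; the exact tokens
# depend on the GTB tokeniser, and _token_boundaries_by_alignment is assumed
# to yield (start, end) character offsets into the input text):
#
#   text = "IL-2 activates T cells.\nSee Fig. 1."
#   print tokenize_multiline(text)                 # one tokenised line per input line
#   spans = list(gtb_token_boundary_gen("IL-2 binds."))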
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()

EMBEDDING_SIZE = 100
N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4

vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"

vocab_dict = {}
enc_vocab_dict = {}

stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by",
    "call", "can", "con", "could", "de", "do", "done", "eg", "etc", "even",
    "ever", "find", "for", "found", "from", "get", "give", "go", "had",
    "has", "have", "he", "her", "here", "hers", "herself", "him", "himself",
    "his", "how", "however", "if", "in", "inc", "into", "is", "it", "its",
    "itself", "keep",