def split(self, p_test=5, limit=0, min_quality=2, add_all=False):

    # split transcripts into train/test sets, keeping roughly p_test percent
    # for testing; entries below min_quality are skipped unless add_all is set

    ts_all   = {}
    ts_train = {}
    ts_test  = {}

    cnt = 0
    for cfn in self.ts:
        v = self.ts[cfn]

        cnt += 1
        if limit > 0 and cnt > limit:
            break

        if v['quality'] < min_quality:
            if (v['quality'] != 0) or (not add_all):
                continue

        if len(v['ts']) == 0:
            if add_all:
                v['ts'] = ' '.join(tokenize(v['prompt']))
            else:
                print "WARNING: %s transcript missing" % cfn
                continue

        ts_all[cfn] = v

        if len(ts_test) < (len(ts_all) * p_test / 100):
            ts_test[cfn] = v
        else:
            ts_train[cfn] = v

    return ts_all, ts_train, ts_test
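# Hedged usage sketch: assumes self.ts maps clip filenames to dicts with
# 'quality', 'ts' (transcript) and 'prompt' keys, as the loop above implies;
# the `corpus` name below is illustrative, not part of the original API.
#
#   ts_all, ts_train, ts_test = corpus.split(p_test=10, min_quality=2)
#   print "%d usable transcripts: %d train / %d test" % \
#         (len(ts_all), len(ts_train), len(ts_test))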
def compute_x(self, txt):

    tokens = tokenize(txt)

    return map(
        lambda token: self.input_dict[token] if token in self.input_dict else UNK_ID,
        tokens)
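# Hedged usage sketch: assumes input_dict and UNK_ID were set up by
# compute_dicts() below; the ids shown are illustrative.
#
#   x = data.compute_x(u'hello world')
#   # -> e.g. [17, UNK_ID] if 'hello' is in input_dict and 'world' is not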
def compute_input_dict(self):

    logging.info('Computing input dict...')

    self.dictionary   = {'': 0}  # id 0 doubles as padding / unknown
    self.max_len      = 0
    self.num_segments = 0

    for segments in (self.seg_test, self.seg_train):
        for segment in segments:
            tokens = tokenize(segment[0])
            # print segment.txt, '->', repr(tokens)

            l = len(tokens)
            if l > self.max_len:
                self.max_len = l

            for token in tokens:
                if not token in self.dictionary:
                    self.dictionary[token] = len(self.dictionary)

            self.num_segments += 1

    logging.info(
        'input dict done. %d entries, max segment len is %d tokens.' %
        (len(self.dictionary), self.max_len))

    return self.dictionary, self.max_len, self.num_segments
def compute_dicts(self):

    # build input and output dicts

    self.input_dict  = {_PAD: PAD_ID, _GO: GO_ID, _EOS: EOS_ID, _UNK: UNK_ID}
    self.output_dict = {_PAD: PAD_ID, _GO: GO_ID, _EOS: EOS_ID, _UNK: UNK_ID}

    self.input_max_len  = 0
    self.output_max_len = 0
    self.num_segments   = 0

    for dr in self.session.query(model.DiscourseRound).all():

        # input

        tokens = tokenize(dr.inp)

        l = len(tokens)
        if l > self.input_max_len:
            self.input_max_len = l

        for token in tokens:
            if not token in self.input_dict:
                self.input_dict[token] = len(self.input_dict)

        # output

        preds = dr.resp.split(';')

        l = len(preds) + 1  # +1 to account for _EOS token
        if l > self.output_max_len:
            self.output_max_len = l

        for pred in preds:
            if not pred in self.output_dict:
                self.output_dict[pred] = len(self.output_dict)

        self.num_segments += 1

    logging.info(
        'dicts done. input: %d entries, input_max_len=%d. output: %d entries, output_max_len=%d. num_segments: %d'
        % (len(self.input_dict), self.input_max_len,
           len(self.output_dict), self.output_max_len, self.num_segments))
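# Hedged usage sketch: once compute_dicts() has run, inputs and responses can
# be mapped to fixed id sequences for seq2seq training (the `data` name is an
# assumption):
#
#   data.compute_dicts()
#   print data.input_dict[_UNK]    # -> UNK_ID
#   print data.output_max_len      # longest response in predicates, +1 for _EOS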
def apply_punkt(text):

    global tokenizer, outf

    sentncs = tokenizer.tokenize(text, realign_boundaries=True)
    for sentence in sentncs:
        print "Sentence: %s" % sentence
        outf.write(u'%s\n' % ' '.join(tokenize(sentence)))
def compute_input_hist(self):

    hist = {}  # number of tokens -> number of discourse rounds

    for dr in self.session.query(model.DiscourseRound).all():
        tokens = tokenize(dr.inp)

        l = len(tokens)
        if not l in hist:
            hist[l] = 0
        hist[l] += 1

    return hist
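# Hedged usage sketch: print the histogram in ascending token-length order
# (the `data` name is illustrative):
#
#   hist = data.compute_input_hist()
#   for l in sorted(hist):
#       print "%3d tokens: %5d rounds" % (l, hist[l])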
def compute_x(self, txt):

    x = np.zeros(self.max_len, np.int32)

    tokens = tokenize(txt)

    # left-pad: token ids fill the last len(tokens) slots of the fixed-width
    # vector; unknown tokens map to id 0

    l = len(tokens)
    for i, token in enumerate(tokens):
        x[self.max_len - l + i] = self.dictionary[token] if token in self.dictionary else 0

    return x
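# Hedged usage sketch: unlike the map()-based compute_x() above, this variant
# returns a fixed-width, left-padded vector (values are illustrative):
#
#   x = data.compute_x(u'hello world')
#   # -> array([0, 0, ..., 12, 7], dtype=int32), padded to self.max_len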
def nlp_gen(src, clause):

    global nlp_macros, session

    args = clause.head.args

    lang = args[0].name

    # extract all macros used

    macro_names = set()

    argc = 1
    while argc < len(args):
        nlp   = args[argc  ].s
        preds = args[argc+1].s
        argc += 2

        for pos, char in enumerate(nlp):
            if char == '@':
                macro = re.match(r'@([A-Z]+):', nlp[pos:])
                # print "MACRO:", macro.group(1)
                macro_names.add(macro.group(1))

    # generate all macro-expansions (breadth-first over macro value combinations)

    macro_names = sorted(macro_names)
    todo = [ (0, {}) ]

    while True:
        if len(todo) == 0:
            break

        idx, mappings = todo.pop(0)

        if idx < len(macro_names):
            macro_name = macro_names[idx]
            for v in nlp_macros[macro_name]:
                nm = copy(mappings)
                # nm[macro_name] = (v, nlp_macros[macro_name][v])
                nm[macro_name] = v
                todo.append ( (idx+1, nm) )
        else:
            # generate discourse for this set of mappings
            # print repr(mappings)

            # create discourse in db
            discourse = model.Discourse(num_participants = 2, lang = lang, src = src)
            session.add(discourse)

            argc = 1
            round_num = 0
            while argc < len(args):
                s = args[argc  ].s
                p = args[argc+1].s
                argc += 2

                for k in mappings:
                    for v in mappings[k]:
                        s = s.replace('@'+k+':'+v, mappings[k][v])
                        p = p.replace('@'+k+':'+v, mappings[k][v])

                inp_raw = utils.compress_ws(s.lstrip().rstrip())
                p       = utils.compress_ws(p.lstrip().rstrip())

                # print s
                # print p

                # tokenize strings, wrap them into say() calls

                inp_tokenized = ' '.join(tokenize(inp_raw, lang))

                preds = p.split(';')

                np = ''
                for pr in preds:
                    if not pr.startswith('"'):
                        if len(np) > 0:
                            np += ';'
                        np += pr.strip()
                        continue

                    for word in tokenize(pr, lang):
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + word + '")'

                # preserve trailing sentence punctuation (char before closing quote)
                if len(p) > 2:
                    if p[len(p)-2] in ['.', '?', '!']:
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + p[len(p)-2] + '")'

                np += ';eou'

                dr = model.DiscourseRound(inp_raw       = inp_raw,
                                          inp_tokenized = inp_tokenized,
                                          response      = np,
                                          discourse     = discourse,
                                          round_num     = round_num)
                session.add(dr)

                round_num += 1
def nlp_test(clause):

    global nlp_test_engine, db

    args = clause.head.args

    lang = args[0].name

    # extract test rounds, look up matching discourses

    rounds        = []  # [ (in, out, actions), ...]
    round_num     = 0
    discourse_ids = set()

    for ivr in args[1:]:
        if ivr.name != 'ivr':
            raise PrologError ('nlp_test: ivr predicate args expected.')

        test_in      = ''
        test_out     = ''
        test_actions = set()

        for e in ivr.args:
            if e.name == 'in':
                test_in = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'out':
                test_out = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'action':
                test_actions.add(unicode(e))
            else:
                raise PrologError (u'nlp_test: ivr predicate: unexpected arg: ' + unicode(e))

        rounds.append((test_in, test_out, test_actions))

        # look up matching discourse_ids:

        d_ids = set()
        for dr in session.query(model.DiscourseRound).filter(model.DiscourseRound.inp_tokenized==test_in) \
                         .filter(model.DiscourseRound.round_num==round_num).all():
            d_ids.add(dr.discourse_id)

        if round_num == 0:
            discourse_ids = d_ids
        else:
            discourse_ids = discourse_ids & d_ids

        print 'discourse_ids:', repr(discourse_ids)

        round_num += 1

    if len(discourse_ids) == 0:
        raise PrologError ('nlp_test: no matching discourse found.')

    # run the test(s): look up reaction to input in db, execute it, check result

    for did in discourse_ids:

        nlp_test_engine.reset_context()

        round_num = 0
        for dr in session.query(model.DiscourseRound).filter(model.DiscourseRound.discourse_id==did) \
                         .order_by(model.DiscourseRound.round_num):

            prolog_s = ','.join(dr.response.split(';'))

            print
            print "Round:", round_num, dr.inp_tokenized, '=>', prolog_s

            c = parser.parse_line_clause_body(prolog_s)
            # logging.debug( "Parse result: %s" % c)

            # logging.debug( "Searching for c: %s" % c )

            nlp_test_engine.reset_utterances()
            solutions = nlp_test_engine.search(c)

            if len(solutions) == 0:
                raise PrologError ('nlp_test: no solution found.')

            print "round %d utterances: %s" % (round_num, repr(nlp_test_engine.get_utterances()))

            # check actual utterances vs expected one

            test_in, test_out, test_actions = rounds[round_num]

            found = False
            for utt in nlp_test_engine.get_utterances():
                actual_out = ' '.join(tokenize(utt['utterance'], utt['lang']))
                if actual_out == test_out:
                    found = True
                    break

            if found:
                print "***MATCHED!"
            else:
                raise PrologError ('nlp_test: actual utterance did not match.')

            # FIXME: check actions

            round_num += 1
print "looking for missing words..." missing = {} # word -> count num = len(transcripts) cnt = 0 for cfn in transcripts: ts = transcripts[cfn] cnt += 1 if ts['quality'] > 0: continue for word in tokenize(ts['prompt']): if word in lex: continue if word in missing: missing[word] += 1 else: missing[word] = 1 cnt = 0 for item in reversed(sorted(missing.items(), key=lambda x: x[1])): lex_base = item[0] ipas = sequitur_gen_ipa(lex_base)
def paint_main(stdscr, cur_ts):

    global edit_ts, prompt_tokens, prompt_token_idx

    ts = edit_ts[cur_ts]

    stdscr.clear()

    my, mx = stdscr.getmaxyx()

    for x in range(mx):
        stdscr.insstr(0, x, ' ', curses.A_REVERSE)
        stdscr.insstr(my - 2, x, ' ', curses.A_REVERSE)
        stdscr.insstr(my - 1, x, ' ', curses.A_REVERSE)

    # header

    s = u"%2d/%2d %-30s QTY: %d" % (cur_ts + 1, len(edit_ts), ts['cfn'], ts['quality'])

    stdscr.insstr(0, 0, s.encode('utf8'), curses.A_BOLD | curses.A_REVERSE)
    stdscr.insstr(0, mx - 13, 'Speech Editor', curses.A_REVERSE)

    # prompts file

    if prompt_token_idx < len(prompt_tokens):
        pstr = ' '.join(prompt_tokens[prompt_token_idx:prompt_token_idx + 8])
        stdscr.insstr(1, mx - len(pstr), pstr.encode('utf8'))

    # body / transcript

    stdscr.insstr(2, 0, 'Prompt:', curses.A_BOLD)
    stdscr.insstr(3, 0, ts['prompt'].encode('utf8'))

    if len(ts['ts']) == 0:
        ts['ts'] = ' '.join(tokenize(ts['prompt']))

    cy = 5
    cx = 0
    missing_token = None
    for token in ts['ts'].split(' '):

        if token in lex:
            s = ''
            m = lex.get_multi(token)
            for t in m:
                v = m[t]
                if len(s) > 0:
                    s += ', '
                if len(m) > 1 and t == token:
                    s += '**'
                s += t
                s += ' [' + m[t]['ipa']
                s += ']'
            stdscr.insstr(cy, cx, s.encode('utf8'))
        else:
            if not missing_token:
                missing_token = token
            stdscr.insstr(cy, cx, token.encode('utf8'), curses.A_REVERSE)

        cy += 1
        if cy > my - 2:
            break

    # footer

    stdscr.insstr(my - 2, 0, " P:Play E:Prompt T:Transcript ", curses.A_REVERSE)
    stdscr.insstr(my - 1, 0, " L:LexEdit Prompts File: A=add S=skip B=Back ", curses.A_REVERSE)
    stdscr.insstr(my - 2, mx - 40, " Accept: 1=Poor 2=Fair 3=Good ", curses.A_REVERSE)
    stdscr.insstr(my - 1, mx - 40, " Q:Quit ", curses.A_REVERSE)

    stdscr.refresh()

    return missing_token
# print "loading lexicon..." lex = Lexicon() print "loading lexicon...done." # # load prompts # prompt_tokens = [] prompt_token_idx = 0 if options.promptsfn: with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf: for line in promptsf: prompt_tokens.extend(tokenize(line)) print "%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)) # # curses # locale.setlocale(locale.LC_ALL, "") stdscr = curses.initscr() curses.noecho() curses.cbreak() stdscr.keypad(1) #
#

all_tokens = set()

with codecs.open('%s/hal.txt' % WORKDIR, 'w', 'utf8') as allf:

    for tsfn in os.listdir('data/dst'):
        if not tsfn.endswith('.ts'):
            continue

        with codecs.open('data/dst/%s' % tsfn, 'r', 'utf8') as tsf:
            for line in tsf:
                tokens = tokenize(line)

                for token in tokens:
                    all_tokens.add(token)

                allf.write (u'<s> %s </s>\n' % ' '.join(tokens))

#
# wlist, dictionary
#

with codecs.open('%s/hal.vocab' % WORKDIR, 'w', 'utf8') as vocabf, \
     codecs.open('%s/hal.dic' % WORKDIR, 'w', 'utf8') as dicf:

    vocabf.write('</s>\n')
    vocabf.write('<s>\n')
    params = punkt_trainer.get_params()
    # print "Params: %s" % repr(params)

    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)

    with open(PUNKT_PICKLEFN, mode='wb') as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    print '%s written.' % PUNKT_PICKLEFN

else:
    print "Loading %s ..." % PUNKT_PICKLEFN

    with open(PUNKT_PICKLEFN, mode='rb') as f:
        tokenizer = pickle.load(f)

    print "Loading %s ... done." % PUNKT_PICKLEFN

with codecs.open(SENTENCEFN, 'w', 'utf8') as outf:

    print "applying punkt to parole..."
    parole_crawl(parole, apply_punkt)

    print "adding sentences from europarl..."
    with codecs.open(europarl, 'r', 'utf8') as inf:
        for line in inf:
            outf.write(u'%s\n' % ' '.join(tokenize(line)))

print '%s written.' % SENTENCEFN
print
def macro_expand(self, lang, nlps, preds):

    logging.debug ('macro_expand: nlps=%s, preds=%s' % (repr(nlps), repr(preds)))

    # handle implicit macros, e.g. "(hi|hello) computer"

    implicit_macros = {}

    nlps2 = []
    for nlp in nlps:

        nlp2 = ''
        i = 0
        while i < len(nlp):

            if nlp[i] == '(':

                j = nlp[i+1:].find(')')
                if j < 0:
                    raise Exception (') missing')
                j += i

                # extract macro

                macro_s = nlp[i+1:j+1]
                # print "macro_s: %s" % macro_s

                macro_name = '__INTERNAL_MACRO_%06d__' % len(implicit_macros)

                implicit_macros[macro_name] = []
                for s in macro_s.split('|'):
                    implicit_macros[macro_name].append({'w': s.strip()})

                nlp2 += '@' + macro_name + ':w '

                i = j+2
            else:
                nlp2 += nlp[i]
                i += 1

        nlps2.append(nlp2)
        # print "after implicit macro handling: %s" % nlp2
        # print "implicit macros: %s" % repr(implicit_macros)

    # extract all macros used

    macro_names = set()

    for nlp in nlps2:
        # print nlp
        for pos, char in enumerate(nlp):
            if char == '@':
                macro = re.match(r'@([A-Z0-9_]+):', nlp[pos:])
                # print "MACRO:", macro.group(1)
                macro_names.add(macro.group(1))

    # print "macro names used: %s" % macro_names

    # generate all macro-expansions

    macro_names = sorted(macro_names)
    todo = [ (0, {}) ]

    discourses = []

    while True:
        if len(todo) == 0:
            break

        idx, mappings = todo.pop(0)

        if idx < len(macro_names):
            macro_name = macro_names[idx]
            macro_dict = implicit_macros if macro_name in implicit_macros else self.named_macros
            for v in macro_dict[macro_name]:
                new_mappings = copy(mappings)
                new_mappings[macro_name] = v
                todo.append ( (idx+1, new_mappings) )
        else:
            # generate discourse for this set of mappings
            # print 'mappings:', repr(mappings)

            discourse = []

            for s, p in zip(nlps2, preds):
                # print s,p

                for k in mappings:
                    for v in mappings[k]:
                        s = s.replace('@'+k+':'+v, mappings[k][v])
                        p = p.replace('@'+k+':'+v, mappings[k][v])

                inp_raw = utils.compress_ws(s.lstrip().rstrip())
                p       = utils.compress_ws(p.lstrip().rstrip())

                # print s
                # print p

                # tokenize strings, wrap them into say() calls

                inp_tokenized = ' '.join(tokenize(inp_raw, lang))

                ps = p.split(';')

                np = ''
                for pr in ps:
                    if not pr.startswith('"'):
                        if len(np) > 0:
                            np += ';'
                        np += pr.strip()
                        continue

                    for word in tokenize(pr, lang):
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + word + '")'

                # preserve trailing sentence punctuation (char before closing quote)
                if len(p) > 2:
                    if p[len(p)-2] in ['.', '?', '!']:
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + p[len(p)-2] + '")'

                np += ';eou'

                discourse.append((inp_tokenized, np))

            logging.debug ('macro_expand: discourse : %s' % (repr(discourse)))
            discourses.append(discourse)

    return discourses
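# Hedged usage sketch: implicit alternatives in parentheses become internal
# macros, so one NLP string expands into one discourse per alternative (the
# `kb` name is an assumption):
#
#   discourses = kb.macro_expand('en', [u'(hi|hello) computer'], [u'"hi there!"'])
#   # -> two discourses, one for "hi computer" and one for "hello computer",
#   #    each with the response wrapped into say(en, ...) calls ending in eou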
def nlp_test(self, clause):

    args = clause.head.args

    lang = args[0].name

    # extract test rounds, look up matching discourses

    rounds        = []  # [ (in, out, actions), ...]
    round_num     = 0
    discourse_ids = set()

    for ivr in args[1:]:
        if ivr.name != 'ivr':
            raise PrologError('nlp_test: ivr predicate args expected.')

        test_in      = ''
        test_out     = ''
        test_actions = []

        for e in ivr.args:
            if e.name == 'in':
                test_in = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'out':
                test_out = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'action':
                test_actions.append(e.args)
            else:
                raise PrologError(
                    u'nlp_test: ivr predicate: unexpected arg: ' + unicode(e))

        rounds.append((test_in, test_out, test_actions))

        # look up matching discourse_ids:

        d_ids = set()
        for dr in self.session.query(model.DiscourseRound).filter(model.DiscourseRound.inp_tokenized==test_in) \
                              .filter(model.DiscourseRound.round_num==round_num).all():
            d_ids.add(dr.discourse_id)

        if round_num == 0:
            discourse_ids = d_ids
        else:
            discourse_ids = discourse_ids & d_ids

        # print 'discourse_ids:', repr(discourse_ids)

        round_num += 1

    if len(discourse_ids) == 0:
        raise PrologError('nlp_test: %s: no matching discourse found.' % clause.location)

    nlp_test_parser = PrologParser()

    # run the test(s): look up reaction to input in db, execute it, check result

    for did in discourse_ids:

        self.nlp_test_engine.reset_context()

        round_num = 0
        for dr in self.session.query(model.DiscourseRound).filter(model.DiscourseRound.discourse_id==did) \
                              .order_by(model.DiscourseRound.round_num):

            prolog_s = ','.join(dr.resp.split(';'))

            logging.info(
                "nlp_test: %s round=%3d, %s => %s" %
                (clause.location, round_num, dr.inp_tokenized, prolog_s))

            c = nlp_test_parser.parse_line_clause_body(prolog_s)
            # logging.debug( "Parse result: %s" % c)

            # logging.debug( "Searching for c: %s" % c )

            self.nlp_test_engine.reset_utterances()
            self.nlp_test_engine.reset_actions()
            solutions = self.nlp_test_engine.search(c)

            if len(solutions) == 0:
                raise PrologError('nlp_test: %s: no solution found.' % clause.location)

            # print "round %d utterances: %s" % (round_num, repr(nlp_test_engine.get_utterances()))

            # check actual utterances vs expected one

            test_in, test_out, test_actions = rounds[round_num]

            utterance_matched = False
            actual_out        = ''

            utts = self.nlp_test_engine.get_utterances()
            if len(utts) > 0:
                for utt in utts:
                    actual_out = ' '.join(tokenize(utt['utterance'], utt['lang']))
                    if actual_out == test_out:
                        utterance_matched = True
                        break
            else:
                # no utterance produced: that is a match iff none was expected
                utterance_matched = len(test_out) == 0

            if utterance_matched:
                if len(utts) > 0:
                    logging.info(
                        "nlp_test: %s round=%3d *** UTTERANCE MATCHED!" %
                        (clause.location, round_num))
            else:
                raise PrologError(
                    u'nlp_test: %s round=%3d actual utterance \'%s\' did not match expected utterance \'%s\'.'
                    % (clause.location, round_num, actual_out, test_out))

            # check actions: every expected action must appear among the
            # actions the engine recorded

            if len(test_actions) > 0:
                # print repr(test_actions)

                acts = self.nlp_test_engine.get_actions()

                actions_matched = True
                for action in test_actions:
                    matched = False
                    for act in acts:
                        # print "    check action match: %s vs %s" % (repr(action), repr(act))
                        if action == act:
                            matched = True
                            break
                    if not matched:
                        actions_matched = False
                        break

                if actions_matched:
                    logging.info(
                        "nlp_test: %s round=%3d *** ACTIONS MATCHED!" %
                        (clause.location, round_num))
                else:
                    raise PrologError(
                        u'nlp_test: %s round=%3d ACTIONS MISMATCH.' %
                        (clause.location, round_num))

            round_num += 1