#!/usr/bin/env python

import sys, argparse
import ngram

PROGRESS = 500000

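# Load the existing word -> id table and continue numbering from its current
# maximum id (0 when the table is empty).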
WORD = ngram.load_table('word')
WORD_AI = max(WORD.itervalues()) if len(WORD) > 0 else 0
print >>sys.stderr, "Loaded %d words. Starting at word id %d" \
    % (len(WORD), WORD_AI)
def word_id(word, outfile):
    # Return the id for `word`, assigning the next auto-increment id and
    # writing a new "id<TAB>word" row when the word has not been seen before.
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v  # cache the new id so repeated words reuse it
        print >>outfile, '%d\t%s' % (v, word)
    return v

POS = ngram.load_table('pos')
POS_AI = max(POS.itervalues()) if len(POS) > 0 else 0
print >>sys.stderr, "Loaded %d POS. Starting at pos id %d" \
    % (len(POS), POS_AI)
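# The twelve part-of-speech tags that may appear in the n-gram data
# (the universal POS tag set).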
NGRAM_POS = set(['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'NUM', 
                 'CONJ', 'PRT', 'X', '.'])
def pos_id(tag, outfile):
    # Same lookup-or-assign pattern as word_id above, restricted to known POS tags.
    global POS, POS_AI
    if tag not in NGRAM_POS:
        raise ValueError("Not a POS tag")
    v = POS.get(tag, None)
    if v is None:
        POS_AI += 1
        v = POS_AI
        POS[tag] = v
        print >>outfile, '%d\t%s' % (v, tag)
    return v
Example #2
#!/usr/bin/env python

import sys, argparse
import ngram

PROGRESS = 1000000

WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d" \
    % (len(WORD), WORD_AI), file=sys.stderr)


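# Words are truncated to 45 characters (presumably the width of the word
# column) and escaped with ngram.db.escape_string before being written out.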
def word_id(word, outfile):
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


def cached_lookup(key, cache, outfile):
    # Generic version of word_id: look up `key` in `cache`, and on a miss
    # assign the next id (one past the current maximum, so the cache must not
    # be empty), write the new row, and remember it.
    v = cache.get(key, None)
    if v is None:
        v = max(cache.values()) + 1
        print('%d\t%s' % (v, ngram.db.escape_string(key)), file=outfile)
        cache[key] = v
    return v
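For illustration, a minimal self-contained sketch of the same cache-and-append
pattern, using a plain dict and an in-memory buffer instead of the ngram
tables; demo_cached_lookup, seen, and buf are hypothetical names for this
sketch and are not part of the original module:

import io

def demo_cached_lookup(key, cache, outfile):
    # Same logic as cached_lookup above, without the database escaping and
    # with a fallback so it also works when the cache starts out empty.
    v = cache.get(key)
    if v is None:
        v = max(cache.values()) + 1 if cache else 1
        print('%d\t%s' % (v, key), file=outfile)
        cache[key] = v
    return v

seen = {'the': 1, 'cat': 2}
buf = io.StringIO()
print(demo_cached_lookup('sat', seen, buf))  # -> 3, and "3\tsat" is appended to buf
print(demo_cached_lookup('cat', seen, buf))  # -> 2, cache hit, nothing written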
Example #3
#!/usr/bin/env python

import sys, argparse
import ngram

PROGRESS = 100000

WORD = ngram.load_table('word')
WORD_AI = max(WORD.values()) if len(WORD) > 0 else 0
print("Loaded %d words. Starting at word id %d" \
    % (len(WORD), WORD_AI), file=sys.stderr)


def word_id(word, outfile):
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print('%d\t%s' % (v, ngram.db.escape_string(word)), file=outfile)
    return v


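# The part-of-speech table is loaded and extended with the same
# auto-increment scheme as the word table.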
POS = ngram.load_table('pos')
POS_AI = max(POS.values()) if len(POS) > 0 else 0
print("Loaded %d POS. Starting at pos id %d" \
    % (len(POS), POS_AI), file=sys.stderr)
NGRAM_POS = [
    'NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'NUM', 'CONJ', 'PRT',
    'X', '.',
]
Example #4
#!/usr/bin/env python

import sys, argparse
import ngram

PROGRESS = 1000000

WORD = ngram.load_table('word')
WORD_AI = max(WORD.itervalues()) if len(WORD) > 0 else 0
print >>sys.stderr, "Loaded %d words. Starting at word id %d" \
    % (len(WORD), WORD_AI)
def word_id(word, outfile):
    global WORD, WORD_AI
    word = word[:45]
    v = WORD.get(word, None)
    if v is None:
        WORD_AI += 1
        v = WORD_AI
        WORD[word] = v
        print >>outfile, '%d\t%s' % (v, ngram.db.escape_string(word))
    return v
    
def cached_lookup(key, cache, outfile):
    v = cache.get(key, None)
    if v is None:
        v = max(cache.itervalues()) + 1
        print >>outfile, '%d\t%s' % (v, ngram.db.escape_string(key))
        cache[key] = v
    return v
    
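# The 'dep' table is cached in memory the same way as 'word' above.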
DEP = ngram.load_table('dep')