示例#1
0
# http://metaoptimize.com/projects/wordreprs/

from options import args
from configuration import Configuration
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# Model configuration; 'wsj' is the model name, which is also used for the
# generated C file.  args comes from the options module.
config = Configuration('wsj', args)

# Read tagset and tag lexicon from corpus.
# Field 0 is the token text and field 1 the tag (matching text_field /
# tag_field below); wsj_tags collects the tags seen, wsj_norm_tags maps
# normalized token forms to their possible tags.
wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
WSJ = Tagset(wsj_tags, config)

# Load a file with word classes (Brown word clusters for English).
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices of the fields in the training data.
text_field  = 0
tag_field   = 1

# Define tags (relative to the current position during a search):
# the tag at the current token plus the two preceding tags.
this_tag        = WSJ.tag(tag_field, 0)
last_tag        = WSJ.tag(tag_field, -1)
last_last_tag   = WSJ.tag(tag_field, -2)
示例#2
0
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# The NE training data must have exactly four tab-separated fields per
# token (text, lemma, full SUC tag, NE tag -- see the field indices below).
assert args.n_train_fields == 4

config = Configuration('suc_ne', args)

# If feature generation is skipped, only build the existing C code and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read the NE tagset and tag lexicon from the corpus: the lexicon is keyed
# on field 1 (the lemma) and collects tags from field 3 (the NE tag).
suc_ne_tags, suc_norm_ne_tags = read_dict('suc-data/suc-blogs-ne-train.tab', 1,
                                          3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices of the fields in the training data.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicon built from a list of names.
Names = WCLexicon.from_file('names', 'suc-data/names.txt', config)

# Word-class lexicon with Brown clusters for Swedish.
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
示例#3
0
# http://metaoptimize.com/projects/wordreprs/

from options import args
from configuration import Configuration
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# Model configuration; 'wsj' is the model name, which is also used for the
# generated C file.  args comes from the options module.
config = Configuration('wsj', args)

# Read tagset and tag lexicon from corpus.
# Field 0 is the token text and field 1 the tag (matching text_field /
# tag_field below); wsj_tags collects the tags seen, wsj_norm_tags maps
# normalized token forms to their possible tags.
wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
WSJ = Tagset(wsj_tags, config)

# Load a file with word classes (Brown word clusters for English).
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices of the fields in the training data.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search):
# the tag at the current token plus the two preceding tags.
this_tag = WSJ.tag(tag_field, 0)
last_tag = WSJ.tag(tag_field, -1)
last_last_tag = WSJ.tag(tag_field, -2)
示例#4
0
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

config = Configuration('suc', args)

# If feature generation is skipped, only build the existing C code and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon from corpus
# (field 0 = token text, field 1 = tag).
suc_tags, suc_norm_tags = read_dict('suc-data/suc-blogs.tab', 0, 1)

# Extend the lexicon with extra hand-added entries: two tab-separated
# columns, token and tag.  Lexicon keys are lower-cased token forms.
with open('suc-data/extra.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, tag = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Extend the lexicon with entries from saldo.txt: four tab-separated
# columns, of which only the first (token) and third (tag) are used.
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)
示例#5
0
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

config = Configuration('suc_dalin', args)

# If feature generation is skipped, only build the existing C code and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon from corpus
# (field 0 = token text, field 1 = tag).
suc_tags, suc_norm_tags = read_dict('suc-data/suc-train.tab', 0, 1)

# Extend the lexicon with extra hand-added entries: two tab-separated
# columns, token and tag.  Lexicon keys are lower-cased token forms.
with open('suc-data/extra.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, tag = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Extend the lexicon with entries from saldo.txt: four tab-separated
# columns, of which only the first (token) and third (tag) are used.
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

with open('suc-data/dalin.txt', 'r', encoding='utf-8') as f:
    for line in f:
示例#6
0
from wclexicon import WCLexicon
from tools import read_dict

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_en', args)
# For debugging purposes, you may want to disable optimizations:
#config = Configuration('udt_en', cflags=['-g', '-O0'])

# On 64-bit systems the following might be better, if the dictionaries are
# large enough to cause many collisions.
#config = Configuration('udt_en', partial_hash_bits=64, feat_hash_bits=64, lexicon_hash_bits=64)

# Read tagset and tag lexicon from corpus
# (field 0 = token text, field 1 = tag; cf. text_field/tag_field below).
udt_en_tags, udt_en_norm_tags = read_dict('data/udt-en-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
UDT_EN = Tagset(udt_en_tags, config)

# Load a file with word clusters
# This is taken from Turian et al.:
#   http://metaoptimize.com/projects/wordreprs/
# and has been converted using the brown2wcl.py script.
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices of the fields in the training data.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = UDT_EN.tag(tag_field, 0)
示例#7
0
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

# Derive the language code from the training file's basename: everything
# before the first '-' (e.g. "en-..." -> "en").
train_filename = args.train
lang = os.path.basename(train_filename).split('-')[0]

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_' + lang, args)
# For debugging purposes, you may want to disable optimizations:
#config = Configuration('udt_' + lang, cflags=['-g', '-O0'])

# Read tagset and tag lexicon from corpus (field 0 = token, field 1 = tag).
# NOTE: udt_tags is deliberately overwritten below with the fixed UD tag
# inventory; only udt_norm_tags (the lexicon) is kept from the corpus.
udt_tags, udt_norm_tags = read_dict(train_filename, 0, 1)
# UDv1
#udt_tags = set(('ADJ ADP PUNCT ADV AUX SYM INTJ CONJ X NOUN DET PROPN NUM ' +
#                'VERB PART PRON SCONJ').split())
# UDv2
udt_tags = set(('ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN '
                'PUNCT SCONJ SYM VERB X').split())

# Create a Tagset object from the tags we have read
UDT = Tagset(udt_tags, config)

# Column indices of the fields in the training data.
text_field  = 0
tag_field   = 1

# Define tags (relative to the current position during a search)
this_tag        = UDT.tag(tag_field, 0)
示例#8
0
# SUC-tagged data to UD tags, as part of the Swedish annotation pipeline.

from options import args
from configuration import Configuration
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from tools import read_dict

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_suc_sv', args)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus: the lexicon
# is keyed on field 1 (the coarse SUC tag) and collects UD tags (field 3).
udt_sv_tags, udt_sv_suc_tags = read_dict('data/sv-ud-train.tab', 1, 3)
# Make sure the UD 'X' tag is available even if unseen in the corpus.
udt_sv_tags.add('X')

# Create a Tagset object from the tags we have read
UDT_SV = Tagset(udt_sv_tags, config)

# Column indices of the fields in the training data.
lemma_field     = 0
suc_field       = 1
suc_full_field  = 2
tag_field       = 3

# UD tag (this is not really a sequence model, so we don't depend on history)
this_tag        = UDT_SV.tag(tag_field, 0)

# Word form features (lemmas)
this_word       = TextField(lemma_field, 0)
示例#9
0
# SUC-tagged data to UD tags, as part of the Swedish annotation pipeline.

from options import args
from configuration import Configuration
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from tools import read_dict

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_suc_sv', args)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus: the lexicon
# is keyed on field 1 (the coarse SUC tag) and collects UD tags (field 3).
udt_sv_tags, udt_sv_suc_tags = read_dict('data/sv-ud-train.tab', 1, 3)
# Make sure the UD 'X' tag is available even if unseen in the corpus.
udt_sv_tags.add('X')

# Create a Tagset object from the tags we have read
UDT_SV = Tagset(udt_sv_tags, config)

# Column indices of the fields in the training data.
lemma_field = 0
suc_field = 1
suc_full_field = 2
tag_field = 3

# UD tag (this is not really a sequence model, so we don't depend on history)
this_tag = UDT_SV.tag(tag_field, 0)

# Word form features (lemmas)
this_word = TextField(lemma_field, 0)
示例#10
0
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

config = Configuration('suc', args)

# If feature generation is skipped, only build the existing C code and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon from corpus
# (field 0 = token text, field 1 = tag).
suc_tags, suc_norm_tags = read_dict('suc-data/suc-blogs.tab', 0, 1)

# Augment the tag lexicon with hand-added entries from extra.txt: one
# tab-separated "token<TAB>tag" pair per line.  Lexicon keys are the
# lower-cased token forms.
with open('suc-data/extra.txt', 'r', encoding='utf-8') as extra_file:
    for row in extra_file:
        word, word_tag = row.rstrip('\n').split('\t')
        suc_norm_tags[word.lower()].add(word_tag)
        suc_tags.add(word_tag)

# Augment the tag lexicon with entries from saldo.txt: four tab-separated
# columns per line, of which only the first (token) and third (tag) are used.
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as saldo_file:
    for row in saldo_file:
        columns = row.rstrip('\n').split('\t')
        word, word_tag = columns[0], columns[2]
        suc_norm_tags[word.lower()].add(word_tag)
        suc_tags.add(word_tag)

# Create a Tagset object from the tags read from the corpus plus the extra
# lexicon files loaded above.
SUC = Tagset(suc_tags, config)
示例#11
0
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# The NE training data must have exactly four tab-separated fields per
# token (text, lemma, full SUC tag, NE tag -- see the field indices below).
assert args.n_train_fields == 4

config = Configuration("suc_ne", args)

# If feature generation is skipped, only build the existing C code and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read the NE tagset and tag lexicon from the corpus: the lexicon is keyed
# on field 1 (the lemma) and collects tags from field 3 (the NE tag).
suc_ne_tags, suc_norm_ne_tags = read_dict("suc-data/suc-blogs-ne-train.tab", 1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices of the fields in the training data.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicon built from a list of names.
Names = WCLexicon.from_file("names", "suc-data/names.txt", config)

# Word-class lexicon with Brown clusters for Swedish.
WC = WCLexicon.from_file("brown", "suc-data/swe-brown100.txt", config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
示例#12
0
from wclexicon import WCLexicon
from tools import read_dict

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_en', args)
# For debugging purposes, you may want to disable optimizations:
#config = Configuration('udt_en', cflags=['-g', '-O0'])

# On 64-bit systems the following might be better, if the dictionaries are
# large enough to cause many collisions.
#config = Configuration('udt_en', partial_hash_bits=64, feat_hash_bits=64, lexicon_hash_bits=64)

# Read tagset and tag lexicon from corpus
# (field 0 = token text, field 1 = tag; cf. text_field/tag_field below).
udt_en_tags, udt_en_norm_tags = read_dict('data/udt-en-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
UDT_EN = Tagset(udt_en_tags, config)

# Load a file with word clusters
# This is taken from Turian et al.:
#   http://metaoptimize.com/projects/wordreprs/
# and has been converted using the brown2wcl.py script.
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices of the fields in the training data.
text_field  = 0
tag_field   = 1

# Define tags (relative to the current position during a search)
this_tag        = UDT_EN.tag(tag_field, 0)