# http://metaoptimize.com/projects/wordreprs/ from options import args from configuration import Configuration from form import * from tagset import Tagset from taglexicon import TagLexicon from wclexicon import WCLexicon from tools import read_dict import sys config = Configuration('wsj', args) # Read tagset and tag lexicon from corpus wsj_tags, wsj_norm_tags = read_dict('data/wsj-train.tab', 0, 1) # Create a Tagset object from the tags we have read WSJ = Tagset(wsj_tags, config) # Load a file with word classes WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config) text_field = 0 tag_field = 1 # Define tags (relative to the current position during a search) this_tag = WSJ.tag(tag_field, 0) last_tag = WSJ.tag(tag_field, -1) last_last_tag = WSJ.tag(tag_field, -2)
# Configuration script for a named-entity model over SUC-annotated data.
# NOTE(review): `args` and `Configuration` are used but not imported in this
# excerpt; presumably `from options import args` and
# `from configuration import Configuration` appear earlier in the file --
# confirm.
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# Training data must have exactly 4 tab-separated fields (see the *_field
# indices below).
# NOTE(review): `assert` is stripped under `python -O`; if this check must
# always run, an explicit `if ... raise` would be safer.
assert args.n_train_fields == 4

config = Configuration('suc_ne', args)

# If feature generation is skipped, just build the model and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
# NOTE(review): the "(coarse SUC -> UD)" wording looks copy-pasted from the
# udt_suc_sv config; here columns 1 and 3 are the lemma and (presumably) the
# NE tag -- confirm.
suc_ne_tags, suc_norm_ne_tags = read_dict('suc-data/suc-blogs-ne-train.tab', 1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices of the 4 fields in the training data.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicons: a name lexicon and Brown clusters.
Names = WCLexicon.from_file('names', 'suc-data/names.txt', config)
WC = WCLexicon.from_file('brown', 'suc-data/swe-brown100.txt', config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)
# Configuration script for a SUC part-of-speech tagging model, with the tag
# lexicon extended from several external word lists.
# NOTE(review): `args` and `Configuration` are used but not imported in this
# excerpt; presumably imported earlier in the file -- confirm.
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

config = Configuration('suc', args)

# If feature generation is skipped, just build the model and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon from corpus
suc_tags, suc_norm_tags = read_dict('suc-data/suc-blogs.tab', 0, 1)

# Extend the lexicon with extra entries; each line is TOKEN<TAB>TAG.
# NOTE(review): `suc_norm_tags[token.lower()].add(tag)` implies read_dict
# returns a mapping of lower-cased token -> set of tags (defaultdict-like) --
# confirm against tools.py.
with open('suc-data/extra.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, tag = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Extend the lexicon from SALDO; each line has 4 tab-separated fields with
# the token in column 0 and the tag in column 2 (columns 1 and 3 unused).
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Create a Tagset object from the tags we have read
SUC = Tagset(suc_tags, config)
# Configuration script for a SUC part-of-speech tagging model ('suc_dalin'
# variant), with the tag lexicon extended from extra, SALDO and Dalin word
# lists.
# NOTE(review): `args` and `Configuration` are used but not imported in this
# excerpt; presumably imported earlier in the file -- confirm.
from form import *
from tagset import Tagset
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

config = Configuration('suc_dalin', args)

# If feature generation is skipped, just build the model and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon from corpus
suc_tags, suc_norm_tags = read_dict('suc-data/suc-train.tab', 0, 1)

# Extend the lexicon with extra entries; each line is TOKEN<TAB>TAG.
with open('suc-data/extra.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, tag = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Extend the lexicon from SALDO; 4 tab-separated fields with the token in
# column 0 and the tag in column 2.
with open('suc-data/saldo.txt', 'r', encoding='utf-8') as f:
    for line in f:
        token, _, tag, _ = line.rstrip('\n').split('\t')
        suc_norm_tags[token.lower()].add(tag)
        suc_tags.add(tag)

# Extend the lexicon from Dalin (historical Swedish dictionary).
# NOTE(review): this excerpt is truncated here -- the body of the loop below
# continues past this chunk and is not visible.
with open('suc-data/dalin.txt', 'r', encoding='utf-8') as f:
    for line in f:
# Configuration script for an English Universal Dependencies Treebank
# part-of-speech tagging model.
# NOTE(review): `args`, `Configuration` and `Tagset` are used but not
# imported in this excerpt; presumably imported earlier in the file --
# confirm.
from wclexicon import WCLexicon
from tools import read_dict

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_en', args)

# For debugging purposes, you may want to disable optimizations:
#config = Configuration('udt_en', cflags=['-g', '-O0'])

# On 64-bit systems the following might be better, if the dictionaries are
# large enough to cause many collisions.
#config = Configuration('udt_en', partial_hash_bits=64, feat_hash_bits=64, lexicon_hash_bits=64)

# Read tagset and tag lexicon from corpus
udt_en_tags, udt_en_norm_tags = read_dict('data/udt-en-train.tab', 0, 1)

# Create a Tagset object from the tags we have read
UDT_EN = Tagset(udt_en_tags, config)

# Load a file with word clusters
# This is taken from Turian et al.:
# http://metaoptimize.com/projects/wordreprs/
# and has been converted using the brown2wcl.py script.
WC = WCLexicon.from_file('brown', 'data/en-brown320.txt', config)

# Column indices of the fields in the training data.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = UDT_EN.tag(tag_field, 0)
# Generic configuration script for Universal Dependencies Treebank
# part-of-speech tagging models; the language is derived from the training
# file name.
# NOTE(review): `os`, `args`, `Configuration` and `Tagset` are used but not
# imported in this excerpt; presumably imported earlier in the file --
# confirm.
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

# The language code is the part of the training file's base name before the
# first '-' (e.g. "sv-train.tab" -> "sv").
train_filename = args.train
lang = os.path.basename(train_filename).split('-')[0]

# There are plenty of other configuration options (see configuration.py), the
# only mandatory one is the name of the model, which will be used for the C
# file generated.
config = Configuration('udt_' + lang, args)

# For debugging purposes, you may want to disable optimizations:
#config = Configuration('udt_' + lang, cflags=['-g', '-O0'])

# Read tagset and tag lexicon from corpus
udt_tags, udt_norm_tags = read_dict(train_filename, 0, 1)

# UDv1
#udt_tags = set(('ADJ ADP PUNCT ADV AUX SYM INTJ CONJ X NOUN DET PROPN NUM ' +
#                'VERB PART PRON SCONJ').split())
# UDv2
# The corpus-derived tagset above is deliberately overwritten with the full
# fixed UDv2 UPOS inventory, presumably so the model covers tags that happen
# not to occur in this particular training corpus (the lexicon in
# udt_norm_tags is kept).
udt_tags = set(('ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN '
                'PUNCT SCONJ SYM VERB X').split())

# Create a Tagset object from the tags we have read
UDT = Tagset(udt_tags, config)

# Column indices of the fields in the training data.
text_field = 0
tag_field = 1

# Define tags (relative to the current position during a search)
this_tag = UDT.tag(tag_field, 0)
# SUC-tagged data to UD tags, as part of the Swedish annotation pipeline. from options import args from configuration import Configuration from form import * from tagset import Tagset from taglexicon import TagLexicon from tools import read_dict # There are plenty of other configuration options (see configuration.py), the # only mandatory one is the name of the model, which will be used for the C # file generated. config = Configuration('udt_suc_sv', args) # Read tagset and tag lexicon (coarse SUC -> UD) from corpus udt_sv_tags, udt_sv_suc_tags = read_dict('data/sv-ud-train.tab', 1, 3) udt_sv_tags.add('X') # Create a Tagset object from the tags we have read UDT_SV = Tagset(udt_sv_tags, config) lemma_field = 0 suc_field = 1 suc_full_field = 2 tag_field = 3 # UD tag (this is not really a sequence model, so we don't depend on history) this_tag = UDT_SV.tag(tag_field, 0) # Word form features (lemmas) this_word = TextField(lemma_field, 0)
# Configuration script for a named-entity model over SUC-annotated data.
# NOTE(review): this excerpt appears to duplicate the other suc_ne config in
# this source, differing only in quote style -- confirm whether both files
# should exist.
# NOTE(review): `args` and `Configuration` are used but not imported in this
# excerpt; presumably imported earlier in the file -- confirm.
from taglexicon import TagLexicon
from wclexicon import WCLexicon
from tools import read_dict

import sys

# Training data must have exactly 4 tab-separated fields (see the *_field
# indices below).
# NOTE(review): `assert` is stripped under `python -O`; if this check must
# always run, an explicit `if ... raise` would be safer.
assert args.n_train_fields == 4

config = Configuration("suc_ne", args)

# If feature generation is skipped, just build the model and exit.
if config.skip_generate:
    config.build()
    sys.exit(0)

# Read tagset and tag lexicon (coarse SUC -> UD) from corpus
# NOTE(review): the "(coarse SUC -> UD)" wording looks copy-pasted from the
# udt_suc_sv config; here columns 1 and 3 are the lemma and (presumably) the
# NE tag -- confirm.
suc_ne_tags, suc_norm_ne_tags = read_dict("suc-data/suc-blogs-ne-train.tab", 1, 3)

# Create a Tagset object from the tags we have read
SUC_NE = Tagset(suc_ne_tags, config)

# Column indices of the 4 fields in the training data.
text_field = 0
lemma_field = 1
suc_full_field = 2
tag_field = 3

# Word-class lexicons: a name lexicon and Brown clusters.
Names = WCLexicon.from_file("names", "suc-data/names.txt", config)
WC = WCLexicon.from_file("brown", "suc-data/swe-brown100.txt", config)

# Define tags (relative to the current position during a search)
this_tag = SUC_NE.tag(tag_field, 0)