Exemplo n.º 1
0
#!/usr/bin/python

"""
Build feature matrices for a portion of the hlines data set.
"""

# NB I ran a variant of this on the Translate server

import sys

sys.path.append("./lib/python")

import takelab.simpfeats as tl


# The word-frequency table drives the information-content (IC) weighting;
# the smallest observed weight acts as the floor for unseen words.
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq_hlines.txt")
tl.minwweight = min(tl.wweight.values())

# Each command-line argument names an input file; its features are
# written next to it as a compressed numpy archive (LSA disabled).
for src_path in sys.argv[1:]:
    dest_path = src_path + ".npz"
    sys.stderr.write("creating {}\n".format(dest_path))
    tl.generate_features(
        src_path, outf=dest_path, out_format="numpy", with_lsa=False)
Exemplo n.º 2
0
#!/usr/bin/env python

"""
make features for STS training and test data for use with NTNU system
"""

import sys
from os.path import join, exists
from os import makedirs

import takelab.simpfeats as tl

from sts import sts12, sts13

# load word counts for IC weighting; the minimum observed weight is kept
# as the fallback for words missing from the frequency table
tl.wweight = tl.load_wweight_table("../_data/wordfreq/wordfreq-STS.txt")
tl.minwweight = min(tl.wweight.values())
   

# load vector spaces (NYT and Wikipedia LSA models) used by the
# LSA-based similarity features; each Sim pairs a word list with its matrix
tl.nyt_sim = tl.Sim('../_data/lsa-matrices/nyt-words-sts.txt', 
                    '../_data/lsa-matrices/nyt-matrix-sts.txt')
tl.wiki_sim = tl.Sim('../_data/lsa-matrices/wiki-words-sts.txt', 
                     '../_data/lsa-matrices/wiki-matrix-sts.txt')

# enable LSA features in the generation loop below
with_lsa = True

# output directory for the STS 2012 training features
dest_dir = "../out/STS2012-train"
for data_id in sts12.train_ids:
Exemplo n.º 3
0
#!/usr/bin/python

"""
Generate the feature files for part of hlines.
"""

# NB I ran a variant of this on the Translate server

import sys

sys.path.append("./lib/python")

import takelab.simpfeats as tl


# Load the word-count table used for information-content weighting and
# record the minimum weight as the out-of-vocabulary fallback.
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq_hlines.txt")
tl.minwweight = min(tl.wweight.values())

# Process every input file named on the command line, saving the
# extracted features beside it in numpy .npz format without LSA.
for fname in sys.argv[1:]:
    npz_fname = "{}.npz".format(fname)
    sys.stderr.write("creating {}\n".format(npz_fname))
    tl.generate_features(fname,
                         outf=npz_fname,
                         out_format="numpy",
                         with_lsa=False)
Exemplo n.º 4
0
#!/usr/bin/env python

"""
make features for STS12 training and test data
"""

import sys

import takelab.simpfeats as tl


# load word counts for IC weighting; keep the smallest weight as the
# fallback value for words absent from the table
tl.wweight = tl.load_wweight_table("../wordfreq/_wordfreq-STS2012.txt")
tl.minwweight = min(tl.wweight.values())

# LSA features are switched off for this run
with_lsa = False    

# load vector spaces (only needed when LSA features are enabled)
if with_lsa:
    tl.nyt_sim = tl.Sim('_vsm_data/nyt_words.txt', '_vsm_data/nyt_word_vectors.txt')
    tl.wiki_sim = tl.Sim('_vsm_data/wikipedia_words.txt', '_vsm_data/wikipedia_word_vectors.txt')

# create training instances
train_dir = "../../data/STS2012-train"

# one .npz output per STS2012 training subset, built from the paired
# input and gold-standard files
# NOTE(review): the generate_features call below appears truncated in
# this snippet — remaining keyword arguments are not visible here
for data in "MSRpar", "MSRvid", "SMTeuroparl":
    out_fname = "_npz_data/_STS2012.train.{}.npz".format(data)
    sys.stderr.write("creating {}\n".format(out_fname))
    tl.generate_features("{}/STS.input.{}.txt".format(train_dir, data),
                         "{}/STS.gs.{}.txt".format(train_dir, data),
                         outf=out_fname, 
Exemplo n.º 5
0
"""
make Takelab's features for STS training and test data
"""
import os
from os.path import join, exists
from os import makedirs

import sts
import takelab.simpfeats as tl


# requires Takelab LSA models; all model files live under this directory
TL_DATA_DIR = "_data"
# toggle loading of the LSA vector spaces below
with_lsa = True

# load word counts for IC weighting; the minimum weight is the fallback
# for words not present in the frequency table
tl.wweight = tl.load_wweight_table(os.path.join(TL_DATA_DIR, "wordfreq/wordfreq-STS.txt"))
tl.minwweight = min(tl.wweight.values())

if with_lsa:
    # load vector spaces: each Sim pairs a word list with its LSA matrix
    tl.nyt_sim = tl.Sim(
        os.path.join(TL_DATA_DIR, "lsa-matrices/nyt-words.txt"),
        os.path.join(TL_DATA_DIR, "lsa-matrices/nyt-matrix.txt"),
    )
    tl.wiki_sim = tl.Sim(
        os.path.join(TL_DATA_DIR, "lsa-matrices/wiki-words.txt"),
        os.path.join(TL_DATA_DIR, "lsa-matrices/wiki-matrix.txt"),
    )


def make_feats(ids2fnames, dest_dir, with_lsa=True):