import sys
import msgpackutil
import vectorizer as ve

# Build a term dictionary from tokenized records and save it.
#
# Usage: <script> infile outfile
#   infile  - read with msgpackutil.load(); every record must unpack as
#             (category, title, tokens)
#   outfile - destination handed to dictionary.save()
if len(sys.argv) <= 2:
    print("Usage: " + sys.argv[0] + " infile outfile", file=sys.stderr)
    # Use sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S` or in
    # frozen executables.
    sys.exit(1)

infile = sys.argv[1]
outfile = sys.argv[2]
print("infile  = " + infile)
print("outfile = " + outfile)

print("loading...")
in_records = msgpackutil.load(infile)

print("processing...")
# Alternative generator kept from the original for reference:
# generator = ve.TermDictionaryGenerator()
# NOTE(review): `minimum=5` is presumably a minimum term frequency threshold;
# confirm against vectorizer.TermFrequencyDictionaryGenerator.
generator = ve.TermFrequencyDictionaryGenerator(minimum=5)

# The record count does not change while iterating, so format it once.
total = str(len(in_records))
for index, (_category, _title, tokens) in enumerate(in_records):
    # Progress report every 1000 records.
    if index % 1000 == 0:
        print(str(index) + "/" + total)
    # Only the tokens feed the dictionary; category and title are unused here.
    generator.update(tokens)

dictionary = generator.to_dictionary()
print("dictionary.terms = " + str(len(dictionary.terms)))

print("dumping...")
dictionary.save(outfile)
Пример #2
0
import sys
import msgpackutil

sys.path.append(".")
import context as ctx

# Tokenize the title of every input record and write the augmented records.
#
# Usage: <script> infile outfile
#   infile  - read with msgpackutil.load(); every record must unpack as
#             (category, title)
#   outfile - written with msgpackutil.dump() as [category, title, tokens] rows
if len(sys.argv) <= 2:
    print("Usage: " + sys.argv[0] + " infile outfile", file=sys.stderr)
    # Use sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S` or in
    # frozen executables.
    sys.exit(1)

infile = sys.argv[1]
outfile = sys.argv[2]
print("infile   = " + infile)
print("outfile  = " + outfile)

print("loading...")
in_records = msgpackutil.load(infile)

print("tokenize...")
# The record count does not change while iterating, so format it once.
total = str(len(in_records))
out_records = []
for index, (category, title) in enumerate(in_records):
    # Progress report every 100 records.
    if index % 100 == 0:
        print(str(index) + "/" + total)
    tokens = ctx.tokenizer.tokenize(title)
    out_records.append([category, title, tokens])

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
Пример #3
0
 def load(cls, filename):
     """Create a TermDictionary from the terms stored in *filename*.

     The file is read with ``msgpackutil.load`` and the loaded value is
     passed unchanged to the ``TermDictionary`` constructor.
     """
     # NOTE(review): `cls` is not used; the result is always a TermDictionary.
     return TermDictionary(msgpackutil.load(filename))
Пример #4
0
# `sys` is needed for argv/stderr/exit below; it was used without being
# imported in this snippet.
import sys

import msgpackutil

# Concatenate the records of two msgpack files into a single output file.
#
# Usage: <script> infile1 infile2 outfile
if len(sys.argv) <= 3:
    print("Usage: " + sys.argv[0] + " infile1 infile2 outfile",
          file=sys.stderr)
    # Use sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S` or in
    # frozen executables.
    sys.exit(1)

infile1 = sys.argv[1]
infile2 = sys.argv[2]
outfile = sys.argv[3]
print("infile1 = " + infile1)
print("infile2 = " + infile2)
print("outfile = " + outfile)

print("loading...")
in_records1 = msgpackutil.load(infile1)
in_records2 = msgpackutil.load(infile2)

print("processing...")
# Records from infile1 come first, followed by those from infile2.
out_records = []
out_records.extend(in_records1)
out_records.extend(in_records2)
print("in_records1.len = " + str(len(in_records1)))
print("in_records2.len = " + str(len(in_records2)))
print("out_records.len = " + str(len(out_records)))

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
Пример #5
0
import sys
import msgpackutil

# Concatenate the records of two msgpack files into a single output file.
#
# Usage: <script> infile1 infile2 outfile
if len(sys.argv) <= 3:
    print("Usage: " + sys.argv[0] + " infile1 infile2 outfile", file=sys.stderr)
    # Use sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is missing under `python -S` or in
    # frozen executables.
    sys.exit(1)

infile1 = sys.argv[1]
infile2 = sys.argv[2]
outfile = sys.argv[3]
print("infile1 = " + infile1)
print("infile2 = " + infile2)
print("outfile = " + outfile)

print("loading...")
in_records1 = msgpackutil.load(infile1)
in_records2 = msgpackutil.load(infile2)

print("processing...")
# Records from infile1 come first, followed by those from infile2.
out_records = []
out_records.extend(in_records1)
out_records.extend(in_records2)
print("in_records1.len = " + str(len(in_records1)))
print("in_records2.len = " + str(len(in_records2)))
print("out_records.len = " + str(len(out_records)))

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
Пример #6
0
    exit(1)

# Positional command-line arguments, echoed back for the run log.
dictfile = sys.argv[1]
testfile = sys.argv[2]
trainfile = sys.argv[3]
modelfile = sys.argv[4]
datadir = sys.argv[5]
print("dictfile  = " + dictfile)
print("testfile  = " + testfile)
print("trainfile = " + trainfile)
print("modelfile = " + modelfile)
print("datadir   = " + datadir)

print("loading...")
dictionary = ve.TermDictionary.load(dictfile)
test_records = msgpackutil.load(testfile)
train_records = msgpackutil.load(trainfile)

# Split the training records by their first field. Records whose first field
# is neither "rail" nor "other" appear in neither list.
train_rail_records = list(
    filter(lambda rec: rec[0] == "rail", train_records))
train_other_records = list(
    filter(lambda rec: rec[0] == "other", train_records))

print("test_records.len        = " + str(len(test_records)))
print("train_records.len       = " + str(len(train_records)))
print("train_rail_records.len  = " + str(len(train_rail_records)))
print("train_other_records.len = " + str(len(train_other_records)))

# Fix the random seeds (Python, NumPy and TensorFlow).
# NOTE(review): tf.set_random_seed() applies to the graph that is the default
# at call time; the model further down is built under a fresh tf.Graph(), so
# confirm that the graph-level seed actually reaches it.
random.seed(0)
np.random.seed(0)
tf.set_random_seed(0)

with tf.Graph().as_default():
    model = ctx.make_model(num_of_terms=len(dictionary.terms))