import sys

import msgpackutil
import vectorizer as ve

if len(sys.argv) <= 2:
    print("Usage: " + sys.argv[0] + " infile outfile", file=sys.stderr)
    exit(1)

infile = sys.argv[1]
outfile = sys.argv[2]
print("infile = " + infile)
print("outfile = " + outfile)

print("loading...")
in_records = msgpackutil.load(infile)

print("processing...")
# generator = ve.TermDictionaryGenerator()
generator = ve.TermFrequencyDictionaryGenerator(minimum=5)
for index, (category, title, tokens) in enumerate(in_records):
    if index % 1000 == 0:
        print(str(index) + "/" + str(len(in_records)))
    generator.update(tokens)
dictionary = generator.to_dictionary()
print("dictionary.terms = " + str(len(dictionary.terms)))

print("dumping...")
dictionary.save(outfile)
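# msgpackutil is not shown in this section and is not a published package.
# Below is a minimal sketch of the load/dump interface the scripts rely on,
# assuming it is a thin wrapper around msgpack-python; the real module may
# differ.
import msgpack


def load(filename):
    # Read one msgpack-encoded object from the file, decoding bytes to str.
    with open(filename, "rb") as f:
        return msgpack.unpack(f, raw=False)


def dump(filename, obj):
    # Write one msgpack-encoded object to the file.
    with open(filename, "wb") as f:
        msgpack.pack(obj, f, use_bin_type=True)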
import sys

import msgpackutil

sys.path.append(".")
import context as ctx

if len(sys.argv) <= 2:
    print("Usage: " + sys.argv[0] + " infile outfile", file=sys.stderr)
    exit(1)

infile = sys.argv[1]
outfile = sys.argv[2]
print("infile = " + infile)
print("outfile = " + outfile)

print("loading...")
in_records = msgpackutil.load(infile)

print("tokenize...")
out_records = []
for index, (category, title) in enumerate(in_records):
    if index % 100 == 0:
        print(str(index) + "/" + str(len(in_records)))
    tokens = ctx.tokenizer.tokenize(title)
    out_records.append([category, title, tokens])

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
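# context.py is not shown either. The scripts only rely on a module-level
# tokenizer whose tokenize(text) returns a list of token strings. A minimal
# sketch, assuming the janome morphological analyzer; the original may use
# MeCab or another tokenizer entirely.
from janome.tokenizer import Tokenizer as _JanomeTokenizer


class _Tokenizer:
    def __init__(self):
        # wakati=True makes janome yield surface-form strings only.
        self._tokenizer = _JanomeTokenizer(wakati=True)

    def tokenize(self, text):
        # Materialize the generator so the result serializes with msgpack.
        return list(self._tokenizer.tokenize(text))


tokenizer = _Tokenizer()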
# Called on the class itself (ve.TermDictionary.load(...)), hence a classmethod.
@classmethod
def load(cls, filename):
    terms = msgpackutil.load(filename)
    return TermDictionary(terms)
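# The load method above appears to belong to a TermDictionary class in
# vectorizer.py. A minimal sketch of that class and of the
# TermFrequencyDictionaryGenerator used by the dictionary script, inferred
# only from how the scripts call them (terms, save/load, update/to_dictionary,
# minimum=...); the real implementation may differ.
from collections import Counter

import msgpackutil


class TermDictionary:
    def __init__(self, terms):
        self.terms = terms  # list of term strings

    def save(self, filename):
        msgpackutil.dump(filename, self.terms)

    @classmethod
    def load(cls, filename):
        terms = msgpackutil.load(filename)
        return TermDictionary(terms)


class TermFrequencyDictionaryGenerator:
    def __init__(self, minimum=1):
        self.minimum = minimum  # discard terms seen fewer times than this
        self.counter = Counter()

    def update(self, tokens):
        self.counter.update(tokens)

    def to_dictionary(self):
        # Keep only terms whose frequency reaches the minimum.
        terms = sorted(term for term, count in self.counter.items()
                       if count >= self.minimum)
        return TermDictionary(terms)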
import sys

import msgpackutil

if len(sys.argv) <= 3:
    print("Usage: " + sys.argv[0] + " infile1 infile2 outfile", file=sys.stderr)
    exit(1)

infile1 = sys.argv[1]
infile2 = sys.argv[2]
outfile = sys.argv[3]
print("infile1 = " + infile1)
print("infile2 = " + infile2)
print("outfile = " + outfile)

print("loading...")
in_records1 = msgpackutil.load(infile1)
in_records2 = msgpackutil.load(infile2)

print("processing...")
out_records = []
out_records.extend(in_records1)
out_records.extend(in_records2)
print("in_records1.len = " + str(len(in_records1)))
print("in_records2.len = " + str(len(in_records2)))
print("out_records.len = " + str(len(out_records)))

print("dumping...")
msgpackutil.dump(outfile, out_records)

print("ok")
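# A plausible way these scripts chain together (the script file names below
# are illustrative assumptions, not taken from the original; a test/train
# split step is implied by the training script but not shown here):
#
#   python merge.py rail_records.msgpack other_records.msgpack merged.msgpack
#   python tokenize.py merged.msgpack tokenized.msgpack
#   python make_dictionary.py tokenized.msgpack dictionary.msgpack
#   python train.py dictionary.msgpack test.msgpack train.msgpack model data/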
import random
import sys

import numpy as np
import tensorflow as tf

import msgpackutil
import vectorizer as ve

sys.path.append(".")
import context as ctx

if len(sys.argv) <= 5:
    print("Usage: " + sys.argv[0] + " dictfile testfile trainfile modelfile datadir", file=sys.stderr)
    exit(1)

dictfile = sys.argv[1]
testfile = sys.argv[2]
trainfile = sys.argv[3]
modelfile = sys.argv[4]
datadir = sys.argv[5]
print("dictfile = " + dictfile)
print("testfile = " + testfile)
print("trainfile = " + trainfile)
print("modelfile = " + modelfile)
print("datadir = " + datadir)

print("loading...")
dictionary = ve.TermDictionary.load(dictfile)
test_records = msgpackutil.load(testfile)
train_records = msgpackutil.load(trainfile)
train_rail_records = [record for record in train_records if record[0] == "rail"]
train_other_records = [record for record in train_records if record[0] == "other"]
print("test_records.len = " + str(len(test_records)))
print("train_records.len = " + str(len(train_records)))
print("train_rail_records.len = " + str(len(train_rail_records)))
print("train_other_records.len = " + str(len(train_other_records)))

# Fix the random seeds so that runs are reproducible.
random.seed(0)
np.random.seed(0)
tf.set_random_seed(0)

with tf.Graph().as_default():
    model = ctx.make_model(num_of_terms=len(dictionary.terms))
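# context.make_model is not shown in this section. A minimal sketch of a
# compatible factory, assuming a bag-of-words input of num_of_terms
# dimensions and a binary rail/other label; the architecture, layer sizes,
# and the returned namedtuple are illustrative assumptions, not the author's
# model.
import collections

import tensorflow as tf

Model = collections.namedtuple(
    "Model", ("inputs", "labels", "train_op", "loss", "predictions"))


def make_model(num_of_terms):
    # Bag-of-words vector in, probability that the title is "rail" out.
    inputs = tf.placeholder(tf.float32, shape=(None, num_of_terms), name="inputs")
    labels = tf.placeholder(tf.float32, shape=(None, 1), name="labels")

    hidden = tf.layers.dense(inputs, 64, activation=tf.nn.relu)
    logits = tf.layers.dense(hidden, 1)

    loss = tf.losses.sigmoid_cross_entropy(labels, logits)
    train_op = tf.train.AdamOptimizer().minimize(loss)
    predictions = tf.nn.sigmoid(logits)

    return Model(inputs, labels, train_op, loss, predictions)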