示例#1
0
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

# Set up logging and the experiment runner (baseline model with embeddings).
logger = Logger()
runner = Runner(logger=logger, ternary=TERNARY,
                model_type='baseline', use_embeddings=True)

# Record whether a text preprocessor is active for this run.
# bool(x) renders as 'True'/'False' under %s, same as the old ternary.
logger.write('preprocessing: %s' % bool(preprocessor))

# Load the main train/test split, plus extra expert-annotated training data.
data_loader = DataLoader(preprocessor=preprocessor)
train, test = data_loader.get_train_test(ternary=TERNARY)
extra_train = data_loader.get_train(
    ternary=TERNARY,
    paths=['data/ydata-ynacc-v1_0_expert_annotations_filt.tsv'])

# Separate loader/extractor pipeline for engineered features
# (manual + automatic, unscaled).
# NOTE(review): train_feats/test_feats are not consumed in this chunk —
# presumably used further down the file; confirm before removing.
feature_data_loader = DataLoader(preprocessor=feature_preprocessor)
feature_extractor = FeatureExtractor(data_loader=feature_data_loader,
                                     logger=logger)

train_feats, test_feats = feature_extractor.get_train_test_features(
    ternary=TERNARY,
    manual=True,
    auto=True,
    scaled=False)

runner.run(train, test, extra_train=extra_train)
示例#2
0
                    help="conf_file containes sample files and labels")

# Path to the pre-trained word2vec vectors used for the embedding layer.
parser.add_argument(
    "--w2v_path", type=str,
    default="/mnt/hgfs/share/pornCensor/query.skip.vec.win3",
    help="w2v file which provide w2v")

FLAGS, unparsed = parser.parse_known_args()
print("unparsed: ", unparsed)

# Configure the loader: 20% test split, 15-token sequences, 100-dim embeddings.
params = dict(ratio=0.2, max_len=15, embedding_size=100)
loader = DataLoader()
loader.set_params(params)
loader.set_w2v(FLAGS.w2v_path)
loader.build(FLAGS.conf_file)
# loader.save_dict("data/title_dict.json")  # optional: persist the vocabulary
train_data, test_data, train_label, test_label = loader.get_train_test()

# Model hyper-parameters, derived from what the loader computed from the
# data and the embedding file.
conf = dict(
    embedding_size=loader.word_vec_len,
    vocab_size=len(loader.weights),
    sequence_len=loader.max_len,
    epochs=100,
    classes=loader.classes,
)

# Alternative architectures previously tried here (all take the same conf):
# Lr, Fasttext, TextRnn, AttentiveTextRnn.
model = TextCnn(conf)
model.set_embedding(loader.get_weights())
model.set_categories(loader.get_categories())