fix_html=True, segmenter='twitter', corrector='twitter', unpack_contractions=True, spell_correct_elong=False, tokenizer=SocialTokenizer(lowercase=True).tokenize, dicts=[emoticons]) logger = Logger() runner = Runner(logger=logger, ternary=TERNARY, model_type='baseline', use_embeddings=True) logger.write('preprocessing: %s' % (True if preprocessor else False)) data_loader = DataLoader(preprocessor=preprocessor) train, test = data_loader.get_train_test(ternary=TERNARY) extra_train = data_loader.get_train(ternary=TERNARY, \ paths=['data/ydata-ynacc-v1_0_expert_annotations_filt.tsv']) feature_data_loader = DataLoader(preprocessor=feature_preprocessor) feature_extractor = FeatureExtractor(data_loader=feature_data_loader, logger=logger) train_feats, test_feats = feature_extractor.get_train_test_features( ternary=TERNARY, manual=True, auto=True, scaled=False) runner.run(train, test, extra_train=extra_train)
help="conf_file containes sample files and labels") parser.add_argument("--w2v_path", type=str, default="/mnt/hgfs/share/pornCensor/query.skip.vec.win3", help="w2v file which provide w2v") FLAGS, unparsed = parser.parse_known_args() print("unparsed: ", unparsed) params = {"ratio": 0.2, "max_len": 15, "embedding_size": 100} loader = DataLoader() loader.set_params(params) loader.set_w2v(FLAGS.w2v_path) loader.build(FLAGS.conf_file) #loader.save_dict("data/title_dict.json") train_data, test_data, train_label, test_label = loader.get_train_test() conf = { "embedding_size": loader.word_vec_len, "vocab_size": len(loader.weights), "sequence_len": loader.max_len, "epochs": 100, "classes": loader.classes } #model = Lr(conf) model = TextCnn(conf) #model = Fasttext(conf) #model = TextRnn(conf) #model = AttentiveTextRnn(conf) model.set_embedding(loader.get_weights()) model.set_categories(loader.get_categories())