def test_update_empty_vocab(self):
    # A vocabulary built with no initial counts reports length zero and is
    # falsy, yet membership of its unknown label still holds.
    vocab = Vocabulary(unk_cutoff=2)
    self.assertEqual(len(vocab), 0)
    self.assertFalse(vocab)
    self.assertIn(vocab.unk_label, vocab)

    # Each symbol is added exactly once; the unknown label must still be
    # reported as a member after the update.
    vocab.update(["a", "b", "c", "d", "e"])
    self.assertIn(vocab.unk_label, vocab)
parser.add_argument('-test', default=False, action='store_true',
                    help='true/false: run on test/dev set')
args = parser.parse_args()

# Every line of the training file is decoded as one JSON document.
# NOTE(review): whatever open_file returns is never explicitly closed here;
# confirm whether it is a file handle that should be wrapped in `with`.
tagged_data = [json.loads(line) for line in open_file("twt.train.json")]
tagset = Twitter_Tagset()

# observables - words
# states - part-of-speech tags
words, tags = unzip_tagged_sents(tagged_data)

# vocabulary set
vocab = Vocabulary(unk_cutoff=args.oov)
# A plain loop: update() is called only for its side effect, so there is no
# reason to build (and throw away) a list of its return values.
for sent in words:
    vocab.update(sent)

# bigram and unigram transition model for interpolating smoothing
hmm_model = HW2ProbDist(
    labeled_sequence=tagged_data,
    states=tagset,
    transform=handle_lowfreq_words(vocab),
    alpha1=args.a1,
    alpha2=args.a2,
    gammaPrior=args.gp,
    gammaEmission=args.ge,
)
init_model, emission_model, transition_model = hmm_model.train()

# Labeled sequences use the MLE model for training; unlabelled sequences use
# Baum-Welch expectation-maximization for training. Calling the transform
# from within the model is wacky, so we do the transform of the labeled and
# unlabeled datasets outside and just use identity in the model.
class CHARACTERISTIC_TRAINER:
    """Train one classifier per author trait from per-user XML documents.

    Constructing an instance does all the work: both dataset splits are
    loaded, a bag-of-words feature template is built from the vocabulary,
    one classifier per trait is trained via ``NB.train`` and, if requested,
    the whole instance is pickled.

    Attributes are documented where they are assigned in ``__init__``.
    """

    # Trait names; each is a key of the per-user records and of
    # ``self.classifier`` once training has run.
    _TRAITS = (
        'gender', 'age_group', 'extroverted', 'stable', 'agreeable',
        'conscientious', 'openness',
    )

    def __init__(self, savedir=None):
        """Load the data, train every trait classifier, optionally pickle.

        Args:
            savedir: Path handed straight to ``open(..., 'wb')``; despite the
                name it must be a file path. When ``None`` nothing is saved.
        """
        self.train = {}       # userid -> record dict for the train split
        self.test = {}        # userid -> record dict for the test split
        self.classifier = {}  # trait name -> trained classifier
        self.vocab = Vocabulary(unk_cutoff=1)

        # Only mode='train' feeds self.vocab (see preprocess_text), so the
        # feature template below is made of training words alone.
        self.prepare_dataset(mode='train')
        self.prepare_dataset(mode="test")

        # Zero-count template copied for every document in get_feature_dict;
        # 'UNK' collects every out-of-vocabulary token.
        self.vocab_words = {
            w: 0 for w in self.vocab.counts if w in self.vocab
        }
        self.vocab_words['UNK'] = 0

        for mode in self._TRAITS:
            self.run_train(mode)

        if savedir is not None:
            with open(savedir, 'wb') as f:
                pickle.dump(self, f)

    def prepare_dataset(self, mode="train"):
        """Read one split's truth file and the XML file of every listed user.

        Each line of the truth file encodes the following information:
        userid:::gender:::age_group:::extroverted:::stable:::agreeable:::conscientious:::openness

        For every user, ``<dir><userid>.xml`` is parsed and the text of each
        child of the root element is tokenized with ``preprocess_text``.

        Args:
            mode: ``"train"`` reads from ``CHAR_TRAIN_DIR`` into
                ``self.train``; ``"test"`` reads from ``CHAR_TEST_DIR`` into
                ``self.test``.

        Raises:
            ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
        """
        print(f"prepare_dataset: {mode} START")
        if mode == "train":
            dir_path = CHAR_TRAIN_DIR
            saved = self.train
        elif mode == "test":
            dir_path = CHAR_TEST_DIR
            saved = self.test
        else:
            raise ValueError(
                "Directory name should be one of 'train' or 'test'")

        # Skip blank lines instead of blindly dropping the last element, so a
        # truth file without a trailing newline does not lose its last user.
        with open(dir_path + "truth.txt", "r") as f:
            truths = [line.rstrip('\n') for line in f if line.strip()]

        for truth in truths:
            (userid, gender, age_group, extroverted, stable, agreeable,
             conscientious, openness) = truth.split(":::")
            root = ET.parse(f"{dir_path}{userid}.xml").getroot()
            words = [
                self.preprocess_text(child.text, mode=mode) for child in root
            ]
            saved[userid] = {
                "gender": gender,
                "age_group": age_group,
                "extroverted": float(extroverted),
                "stable": float(stable),
                "agreeable": float(agreeable),
                "conscientious": float(conscientious),
                "openness": float(openness),
                "text": words
            }
        print(f"prepare_dataset: {mode} DONE")

    def preprocess_text(self, text, mode='train'):
        """Clean and tokenize one document, growing the vocabulary in training.

        Everything from the first ``http`` onward is cut off, every character
        other than ASCII letters, space, ``?``, ``!`` and ``-`` is removed,
        and the rest is tokenized with ``word_tokenize`` and lower-cased.
        NOTE: ``@username`` mentions are not rewritten; the ``@`` is simply
        removed by the character filter.

        Args:
            text: Raw document text. ``None`` (an XML element without text)
                and the empty string are accepted.
            mode: Only ``'train'`` adds the tokens to ``self.vocab``.

        Returns:
            The list of lower-cased tokens; ``[]`` for ``None``/empty text.
        """
        # ElementTree reports an empty element's text as None; there is
        # nothing to tokenize or to add to the vocabulary in that case.
        if not text:
            return []

        url_start = text.find('http')
        if url_start != -1:
            text = text[:url_start]
        text = re.sub(r"[^A-Z a-z?!-]+", '', text)
        words = [w.lower() for w in word_tokenize(text)]
        if mode == 'train':
            self.vocab.update(words)
        return words

    def get_feature_dict(self, words):
        """Count tokens into a fresh copy of the vocabulary template.

        Args:
            words: Iterable of tokens.

        Returns:
            A new dict with the keys of ``self.vocab_words``; every token that
            is in ``self.vocab`` increments its own entry, every other token
            increments ``'UNK'``. ``self.vocab_words`` is left untouched.
        """
        feature_dict = self.vocab_words.copy()
        for word in words:
            key = word if word in self.vocab else 'UNK'
            feature_dict[key] += 1
        return feature_dict

    def run_train(self, mode='agreeable'):
        """Train and store the classifier for one trait.

        Every document of every training user becomes one example labelled
        with that user's value for ``mode``.

        Args:
            mode: One of 'gender', 'age_group', 'extroverted', 'stable',
                'agreeable', 'conscientious', 'openness'.
        """
        print(f"making train_input: {mode}")
        # All documents of a user share that user's label.
        train_input = [
            (self.get_feature_dict(tokens), user[mode])
            for user in tqdm(self.train.values())
            for tokens in user['text']
        ]
        print(f"running trainer... {mode}")
        self.classifier[mode] = NB.train(train_input)
        print("running trainer done")

    def predict(self, text, mode='gender'):
        """Classify raw text with the classifier trained for ``mode``.

        Args:
            text: Raw document text.
            mode: Has to be one of ``self.classifier``'s keys.

        Returns:
            Whatever the stored classifier's ``classify`` returns.
        """
        # Any mode other than 'train' keeps the vocabulary unchanged.
        preprocessed_words = self.preprocess_text(text, mode='predict')
        feature_dict = self.get_feature_dict(preprocessed_words)
        return self.classifier[mode].classify(feature_dict)