Example #1
    def test_update_empty_vocab(self):
        empty = Vocabulary(unk_cutoff=2)
        self.assertEqual(len(empty), 0)
        self.assertFalse(empty)
        self.assertIn(empty.unk_label, empty)

        empty.update(list("abcde"))
        self.assertIn(empty.unk_label, empty)
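These assertions rely on how nltk.lm.Vocabulary (assumed to be the Vocabulary used in these examples) treats the cutoff: the unk label always counts as a member, while any other word only becomes a member once its count reaches unk_cutoff. A minimal sketch of that behaviour with the default '<UNK>' label:

from nltk.lm import Vocabulary

vocab = Vocabulary(unk_cutoff=2)
vocab.update(list("abcde") + ["a"])   # "a" is seen twice, the rest only once

print(vocab.unk_label in vocab)   # True: the unk label is always a member
print("a" in vocab)               # True: count 2 >= cutoff 2
print("b" in vocab)               # False: count 1 < cutoff 2
print(vocab.lookup("b"))          # '<UNK>': below-cutoff words map to the unk label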
Example #2
    parser.add_argument('-test',
                        default=False,
                        action='store_true',
                        help='true/false: run on test/dev set')
    args = parser.parse_args()

    tagged_data = [json.loads(line) for line in open_file("twt.train.json")]
    tagset = Twitter_Tagset()

    # observables: words
    # states: part-of-speech tags
    words, tags = unzip_tagged_sents(tagged_data)

    # vocabulary set
    vocab = Vocabulary(unk_cutoff=args.oov)
    for sent in words:
        vocab.update(sent)

    # bigram and unigram transition models for interpolated smoothing
    hmm_model = HW2ProbDist(labeled_sequence=tagged_data,
                            states=tagset,
                            transform=handle_lowfreq_words(vocab),
                            alpha1=args.a1,
                            alpha2=args.a2,
                            gammaPrior=args.gp,
                            gammaEmission=args.ge)
    init_model, emission_model, transition_model = hmm_model.train()

    # Labeled sequences are trained with the MLE model; unlabeled sequences are
    # trained with Baum-Welch expectation-maximization. Calling the transform
    # from within the model is awkward, so we transform the labeled and
    # unlabeled datasets outside and just use an identity transform in the model.
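The comment above is about mapping rare words to the unknown token before training. One possible shape for that transform, assuming handle_lowfreq_words(vocab) returns a per-sentence callable built on vocab.lookup and that each tagged sentence is a list of (word, tag) pairs; this is a hypothetical sketch, not the project's actual helper:

def handle_lowfreq_words(vocab):
    """Hypothetical sketch: replace words below the vocabulary cutoff with the
    unk label while leaving the part-of-speech tags untouched."""
    def transform(tagged_sent):
        return [(vocab.lookup(word), tag) for word, tag in tagged_sent]
    return transform

# Applied to the data up front, so the model itself can use an identity transform:
# tagged_data = [handle_lowfreq_words(vocab)(sent) for sent in tagged_data]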
Example #3
class CHARACTERISTIC_TRAINER:
    def __init__(self, savedir=None):
        self.train = {}
        self.test = {}
        self.classifier = {}
        self.vocab = Vocabulary(unk_cutoff=1)
        self.prepare_dataset(mode='train')
        self.prepare_dataset(mode="test")
        self.vocab_words = {
            w: 0
            for w in self.vocab.counts.keys() if w in self.vocab
        }
        self.vocab_words['UNK'] = 0  # add an 'UNK' slot for out-of-vocabulary words
        # vocab size is currently 20124
        # train a classifier for every characteristic; for a quicker run, comment
        # out this loop and use the single run_train('gender') call below instead
        for mode in [
                'gender', 'age_group', 'extroverted', 'stable', 'agreeable',
                'conscientious', 'openness'
        ]:
            self.run_train(mode)

        if savedir is not None:
            with open(savedir, 'wb') as f:
                pickle.dump(self, f)
        # self.run_train('gender')

    def prepare_dataset(self, mode="train"):  # mode = ["train", "test"]
        """
        Each line of the truth files encodes the following information:
        userid:::gender:::age_group:::extroverted:::stable:::agreeable:::conscientious:::openness
        """
        print(f"prepare_dataset: {mode} START")
        if mode == "train":
            dir_path = CHAR_TRAIN_DIR
            saved = self.train
        elif mode == "test":
            dir_path = CHAR_TEST_DIR
            saved = self.test
        else:
            raise ValueError(
                "mode should be one of 'train' or 'test'")

        with open(dir_path + "truth.txt", "r") as f:
            truths = f.read().split('\n')[:-1]
        for truth in truths:
            userid, gender, age_group, extroverted, stable, agreeable, conscientious, openness = truth.split(
                ":::")
            root = ET.parse(f"{dir_path}{userid}.xml").getroot()
            words = [
                self.preprocess_text(child.text, mode=mode) for child in root
            ]
            saved[userid] = {
                "gender": gender,
                "age_group": age_group,
                "extroverted": float(extroverted),
                "stable": float(stable),
                "agreeable": float(agreeable),
                "conscientious": float(conscientious),
                "openness": float(openness),
                "text": words
            }

        print(f"prepare_dataset: {mode} DONE")

    def preprocess_text(self,
                        text,
                        mode='train'):  # clean up and tokenize text
        processed_text = []
        # truncate the text at the first URL, then strip everything except
        # letters, spaces, and the characters ?!-
        if 'http' in text:
            text = text[:text.index('http')]
        text = re.sub(r"[^A-Z a-z?!-]+", '', text)
        words = [w.lower() for w in word_tokenize(text)]
        if mode == 'train':
            self.vocab.update(words)  # add corresponding word to vocab
        return words

    def get_feature_dict(self, words):
        feature_dict = self.vocab_words.copy()
        for word in words:
            if word in self.vocab:
                feature_dict[word] += 1
            else:
                feature_dict['UNK'] += 1
        return feature_dict

    def run_train(self, mode='agreeable'):
        # mode in ['gender', 'age_group', 'extroverted', 'stable',
        # 'agreeable', 'conscientious', 'openness']
        train_input = []
        print(f"making train_input: {mode}")
        for infos in tqdm(self.train.values()):
            for info in infos['text']:  # each of the user's 100 texts shares the same label
                train_input.append((self.get_feature_dict(info), infos[mode]))
        print(f"running trainer... {mode}")
        self.classifier[mode] = NB.train(train_input)
        print("running trainer done")

    def predict(self,
                text,
                mode='gender'):  # mode has to be one of classifier.keys()
        preprocessed_words = self.preprocess_text(text, mode='predict')
        feature_dict = self.get_feature_dict(preprocessed_words)
        classified = self.classifier[mode].classify(feature_dict)
        # print(f"Predicted output: {classified}")
        return classified
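A usage sketch for the class above, assuming the truth/XML directories expected by prepare_dataset are in place and that NB is an alias for nltk.classify.NaiveBayesClassifier (the imports are not shown in this excerpt); the pickle path is hypothetical:

if __name__ == '__main__':
    # train classifiers for all characteristics and pickle the trainer
    trainer = CHARACTERISTIC_TRAINER(savedir='characteristic_trainer.pkl')

    # classify a new text against any trained characteristic
    sample = "I love hiking and meeting new people!"
    print(trainer.predict(sample, mode='gender'))
    print(trainer.predict(sample, mode='age_group'))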