def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # For evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    # Model filename is randomized above, so clean it up once predictions are obtained
    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {"feature.possible_states": False,
                        "feature.possible_transitions": False,
                        "c2": 2.0}

        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        # Delete the model file now that predictions are obtained.
        # Note: we are randomizing the name above, so we need to clean up here.
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
        model.train(td, model_filename)

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        # Delete the model file now that predictions are obtained.
        # Note: we are randomizing the name above, so we need to clean up here.
        os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, training_opt):
    # Start Training
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "power_set", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # For evaluation - binary tags
    # YS (ACTUAL)
    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    # Start Training
    print("Fold %i Training code" % fold)

    # Important - only compute code frequencies from the training data (NO CHEATING)
    code_freq = tally_code_frequencies(essays_TD)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False)
    model.train(td_sents, model_filename)

    td_predictions = model.tag_sents(to_sentences(td_sents))
    vd_predictions = model.tag_sents(to_sentences(vd_sents))

    # For evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    os.remove(model_filename)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
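# --- Usage sketch (assumption, not part of the original code) ---------------
# The train_classifer_on_fold variants above share one contract: train a
# CRFTagger on the training-fold essays and return, for both folds, the actual
# and predicted binary tags keyed by code. A hypothetical driver for the
# four-argument variant could aggregate results across folds like this;
# `folds` (a list of (essays_TD, essays_VD) pairs) and `regular_tags` are
# assumed to be built elsewhere.
from collections import defaultdict

cv_td_ys, cv_vd_ys = defaultdict(list), defaultdict(list)
cv_td_preds, cv_vd_preds = defaultdict(list), defaultdict(list)

for fold, (essays_TD, essays_VD) in enumerate(folds):
    wd_td_ys, wd_vd_ys, td_preds, vd_preds = train_classifer_on_fold(
        essays_TD, essays_VD, regular_tags, fold)

    # Accumulate per-code gold labels and predictions across folds
    for code in regular_tags:
        cv_td_ys[code].extend(wd_td_ys[code])
        cv_vd_ys[code].extend(wd_vd_ys[code])
        cv_td_preds[code].extend(td_preds[code])
        cv_vd_preds[code].extend(vd_preds[code])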
import codecs

import numpy as np
from nltk.tag import CRFTagger


class CRF:
    def __init__(self, config):
        self.word_ftrs = config['word_ftrs']
        for ftr in self.word_ftrs:
            if ftr not in CRF.WORD_FTRS:
                raise Exception('Unknown feature {}. See CRF.WORD_FTRS for supported ones.'.format(ftr))
        self.stc_ftrs = config['stc_ftrs']
        for ftr in self.stc_ftrs:
            if ftr not in CRF.STC_FTRS:
                raise Exception('Unknown feature {}. See CRF.STC_FTRS for supported ones.'.format(ftr))
        self.words_ids = config['extr_word_idx']

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        self.data = []
        with codecs.open(file, encoding='utf-8') as f:
            lines = f.readlines()
        self.data.append([])
        for line in lines:
            line = line.rstrip()
            # Skip comment lines
            if line.startswith(u'#'):
                continue
            # Blank line: start a new sentence
            if len(line) == 0:
                if len(self.data[-1]) > 0:
                    self.data.append([])
                continue
            # Append word to last sentence
            w = line.split(u' ')[3]
            w = w.replace(u'-', u'')
            self.data[-1].append(w)
        # If the last sentence is empty - remove it
        if len(self.data[-1]) == 0:
            self.data.remove(self.data[-1])
        return self

    def shuffle(self, seed=None):
        # Shuffle based on seed
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        # Split into train and validation sets based on ratio.
        # If ratio is 0, use all data for training.
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        train_set = CRF._fin_data_prep(self.train_set)
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr, verbose=False,
                               training_opt={"num_memories": 500, "delta": 1e-8})
        self.model.train(train_set, 'stc_crf_model')
        return self

    def eval(self):
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_set = CRF._fin_data_prep(self.valid_set)
        valid_stc_cons = [[x[0] for x in w] for w in valid_set]
        valid_stc_vowel = [[x[1] for x in w] for w in valid_set]
        predicted = self.model.tag_sents(valid_stc_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for w_ind in range(len(predicted)):
            for vow_ind, pred_vow in enumerate(predicted[w_ind]):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[valid_stc_vowel[w_ind][vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        data = []
        for sent in pred_set:
            sent_cons = u' '.join(sent)
            for i, w in enumerate(sent):
                w_cons = list(w)
                w_pos = [i] * len(w)
                unif_sent = [sent_cons] * len(w)
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(d)
        pred = self.model.tag_sents(data)
        result = []
        word_idx = 0
        for sent in pred_set:
            result.append([])
            for word in sent:
                pred_smpl = pred[word_idx]
                w = ''.join([entry[0][0] + entry[-1] for entry in pred_smpl])
                result[-1].append(w)
                word_idx += 1
        return result

    @staticmethod
    def _fin_data_prep(data_set):
        data = []
        for sent in data_set:
            sent_cons = u' '.join([x[::2] for x in sent])
            for i, w in enumerate(sent):
                w_cons = list(w[::2])
                w_pos = [i] * len(w[::2])
                unif_sent = [sent_cons] * len(w[::2])
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(list(zip(d, list(w[1::2]))))
        return data

    @staticmethod
    def _len(x):
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    WORD_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL',
                 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN']
    STC_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX']

    def _gen_ftr_func(self):
        # Closure
        def _extract_ftr(tokens, i):
            def _extract_wrd_ftr(tokens, i, suff):
                feature_list = []
                # Position-dependent features are only extracted when a character
                # index is given (i.e. for the word currently being tagged)
                if i is not None:
                    if 'IS_FIRST' in self.word_ftrs:
                        feature_list.append("is_first{}={}".format(suff, 1 if i == 0 else 0))
                    if 'IS_LAST' in self.word_ftrs:
                        feature_list.append("is_last{}={}".format(suff, 1 if i == (len(tokens) - 1) else 0))
                    if 'IDX' in self.word_ftrs:
                        feature_list.append("pos{}={}".format(suff, i))
                    if 'VAL' in self.word_ftrs:
                        feature_list.append("cur{}={}".format(suff, tokens[i]))
                    if 'PRV_VAL' in self.word_ftrs:
                        if i > 0:
                            feature_list.append("prev{}={}".format(suff, tokens[i - 1]))
                    if 'NXT_VAL' in self.word_ftrs:
                        if i < (len(tokens) - 1):
                            feature_list.append("next{}={}".format(suff, tokens[i + 1]))
                if 'FRST_VAL' in self.word_ftrs:
                    feature_list.append("first{}={}".format(suff, tokens[0]))
                if 'LST_VAL' in self.word_ftrs:
                    feature_list.append("last{}={}".format(suff, tokens[-1]))
                if 'LEN' in self.word_ftrs:
                    feature_list.append("len{}={}".format(suff, len(tokens)))
                if 'SCND_VAL' in self.word_ftrs:
                    if len(tokens) > 1:
                        feature_list.append("scnd{}={}".format(suff, tokens[1]))
                if 'SCND_LST_VAL' in self.word_ftrs:
                    if len(tokens) > 1:
                        feature_list.append("scnd_last{}={}".format(suff, tokens[-2]))
                return feature_list

            feature_list = []
            word_pos = tokens[0][1]
            sent = tokens[0][2].split(' ')

            # Sentence features
            if 'IS_FIRST' in self.stc_ftrs:
                if word_pos == 0:
                    feature_list.append('FIRST_WORD')
            if 'IS_LAST' in self.stc_ftrs:
                if word_pos == (len(sent) - 1):
                    feature_list.append('LAST_WORD')
            if 'IDX' in self.stc_ftrs:
                feature_list.append("idx=" + str(word_pos))

            # Word features
            for rel_pos in self.words_ids:
                word_pos = tokens[0][1] + rel_pos
                if word_pos >= 0 and word_pos < len(sent):
                    word = sent[word_pos]
                    feature_list += _extract_wrd_ftr(word, i if rel_pos == 0 else None,
                                                     '_w{}'.format(rel_pos))
            return feature_list

        return _extract_ftr
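# --- Usage sketch (assumption, not part of the original code) ---------------
# Illustrative driver for the sentence-level CRF class above. The feature
# choices, seed and split ratio are arbitrary; the corpus path is the default
# taken from prep_data().
config = {
    'word_ftrs': ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL'],
    'stc_ftrs': ['IS_FIRST', 'IS_LAST', 'IDX'],
    'extr_word_idx': [-1, 0, 1],  # word features for the previous, current and next word
}

stc_crf = CRF(config)
conf_mat = (stc_crf.prep_data('data/HaaretzOrnan_annotated.txt')
                   .shuffle(seed=7)
                   .split(valid_ratio=0.1)
                   .train()
                   .eval())

# Rows are predicted vowels, columns are gold vowels (ordering given by CRF.VOWELS)
print(conf_mat)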
import codecs

import numpy as np
from nltk.tag import CRFTagger


class CRF:
    def __init__(self, config):
        self.ftrs = config['ftrs']
        for ftr in self.ftrs:
            if ftr not in CRF.WORD_FTRS:
                raise Exception('Unknown feature {}. See CRF.WORD_FTRS for supported ones.'.format(ftr))

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        self.data = []
        with codecs.open(file, encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            line = line.rstrip()
            if line.startswith(u'#') or len(line) == 0:
                continue
            w = line.split(u' ')[3]
            w = w.replace(u'-', u'')
            # Pair each consonant (even positions) with its vowel (odd positions)
            self.data.append(list(zip(list(w[::2]), list(w[1::2]))))
        return self

    def shuffle(self, seed=None):
        # Shuffle based on seed
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        # Split into train and validation sets based on ratio.
        # If ratio is 0, use all data for training.
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr, verbose=False,
                               training_opt={"num_memories": 500, "delta": 1e-8})
        self.model.train(self.train_set, 'word_crf_model')
        return self

    def eval(self):
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_word_cons = [[x[0] for x in w] for w in self.valid_set]
        valid_word_vowel = [[x[1] for x in w] for w in self.valid_set]
        predicted = self.model.tag_sents(valid_word_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for w_ind in range(len(predicted)):
            for vow_ind, pred_vow in enumerate(predicted[w_ind]):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[valid_word_vowel[w_ind][vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        result = []
        for sent in pred_set:
            pred_sent = []
            predicted = self.model.tag_sents(sent)
            for i, w_cons in enumerate(predicted):
                pred_sent.append(''.join(x + y for x, y in w_cons))
            result.append(pred_sent)
        return result

    @staticmethod
    def _len(x):
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    WORD_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL',
                 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN']

    def _gen_ftr_func(self):
        # Closure
        def _extract_ftr(tokens, i):
            feature_list = []
            if 'IS_FIRST' in self.ftrs:
                feature_list.append("is_first=" + str(1 if i == 0 else 0))
            if 'IS_LAST' in self.ftrs:
                feature_list.append("is_last=" + str(1 if i == (len(tokens) - 1) else 0))
            if 'IDX' in self.ftrs:
                feature_list.append("pos=" + str(i))
            if 'VAL' in self.ftrs:
                feature_list.append("cur=" + tokens[i])
            if 'PRV_VAL' in self.ftrs:
                if i > 0:
                    feature_list.append("prev=" + tokens[i - 1])
            if 'NXT_VAL' in self.ftrs:
                if i < (len(tokens) - 1):
                    feature_list.append("next=" + tokens[i + 1])
            if 'FRST_VAL' in self.ftrs:
                feature_list.append("first=" + tokens[0])
            if 'LST_VAL' in self.ftrs:
                feature_list.append("last=" + tokens[-1])
            if 'LEN' in self.ftrs:
                feature_list.append("len=" + str(len(tokens)))
            if 'SCND_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd=" + tokens[1])
            if 'SCND_LST_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd_last=" + tokens[-2])
            return feature_list

        return _extract_ftr
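# --- Usage sketch (assumption, not part of the original code) ---------------
# Illustrative driver for the word-level CRF class above; the feature list,
# seed and split ratio are arbitrary choices from CRF.WORD_FTRS.
word_crf = CRF({'ftrs': ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'LEN']})
conf_mat = (word_crf.prep_data()
                    .shuffle(seed=7)
                    .split(valid_ratio=0.1)
                    .train()
                    .eval())

# Per-vowel confusion matrix; overall accuracy is the trace over the total count
print(conf_mat)
print(np.trace(conf_mat) / conf_mat.sum())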
import csv

import jsonlines
import numpy as np
import pandas as pd
import preprocessor as p  # tweet-preprocessor
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import CRFTagger
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


def main(positive, death):
    ############# Compile the dataset ###############
    ## Load the dataset
    text = list()
    response = list()
    file_path = [positive, death]
    for path in file_path:
        input_file = jsonlines.open(path)
        for obj in input_file:
            text.append(obj['text'])
            response.append(obj['annotation']['part1.Response'])

    ## Tweet preprocessing
    prep_text = list()
    for i in text:
        prep_text.append(p.clean(i))

    ## Tag keywords and create labels
    ### Focus on verbs - therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    n_corpus = len(prep_text)
    token_data = ["test"] * n_corpus
    n = 0
    for sent in prep_text:
        token_data[n] = [
            wnl.lemmatize(i, j[0].lower()) if j[0].lower() in ['a', 'n', 'v']
            else wnl.lemmatize(i)
            for i, j in pos_tag(word_tokenize(sent))
        ]
        n = n + 1

    ### Create labels
    death_list = ["die", "dead", "death", "pass", "away"]
    n = 0
    for sent in token_data:
        for idx, token in enumerate(sent):
            if (token.lower() in ["test", "positive", "result"]) and (response[n] == ["yes"]):
                sent[idx] = [sent[idx], "P-Yes"]
            elif (token.lower() in ["test", "positive", "result"]) and (response[n] == ["no"]):
                sent[idx] = [sent[idx], "P-No"]
            elif (token.lower() in death_list) and (response[n] == ["yes"]):
                sent[idx] = [sent[idx], "D-Yes"]
            elif (token.lower() in death_list) and (response[n] == ["no"]):
                sent[idx] = [sent[idx], "D-No"]
            else:
                sent[idx] = [sent[idx], "Irr"]
        n = n + 1

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data, test_size=0.3, random_state=616)
    print(f"The number of sentences in training data: {len(train_data)}; "
          f"The number of sentences in dev data: {len(dev_data)};")

    ############# Fit a CRF model and predict ###############
    # my_features / neighbor_features are feature-extraction functions defined elsewhere
    condition_to_func = {"base": my_features, "include_neighbors": neighbor_features}
    for cond, func in condition_to_func.items():
        # Initialize and train
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')

        # Quick smoke test of the feature function and tagger
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])

        # Output
        filename = cond + "_final_output.tsv"
        with open(filename, 'w') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]
                with_tags = crf.tag(sent_words)
                for i, output in enumerate(with_tags):
                    original_word, tag_prediction = output
                    line_as_str = f"{original_word}\t{gold_tags[i]}\t{tag_prediction}\n"
                    pred_file.write(line_as_str)
                # Add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    ## Extract data with meaningful labels
    cond_list = ['base', 'include_neighbors']
    for cond in cond_list:
        filename = cond + "_final_output.tsv"
        with open(filename) as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            D_data = []
            P_data = []
            for row in rd:
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)
        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        Total_df = pd.concat([P_df, D_df])

        # Accuracy
        ## Overall accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, precision, and recall for the two events
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            precision.append(
                sum(1 for item in range(len(df))
                    if ('Yes' in df['label'][item] and 'Yes' in df['prediction'][item]))
                / sum(1 for item in range(len(df)) if ('Yes' in df['prediction'][item])))
            recall.append(
                sum(1 for item in range(len(df))
                    if ('Yes' in df['label'][item] and 'Yes' in df['prediction'][item]))
                / sum(1 for item in range(len(df)) if ('Yes' in df['label'][item])))

        ## F1
        f1 = []
        for num in [0, 1]:
            f1.append((2 * precision[num] * recall[num]) / (precision[num] + recall[num]))

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']
        num = 0
        for event in covid_event:
            print(f"Scores for {event} : \taccuracy {accuracy[num]:0.03}"
                  f"\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}"
                  f"\tF1 {f1[num]:0.03}")
            num = num + 1

        ## Baseline performance / confusion matrix
        print("Confusion Matrix:")
        print(pd.crosstab(Total_df['label'], Total_df['prediction']))

    ## Label counts in the training data
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    for label in labels:
        train_data2 = np.concatenate(train_data).flat
        n_label = sum(1 for item in train_data2 if item == label)
        print(f"Number of {label}: {n_label}")
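# --- Usage sketch (assumption, not part of the original code) ---------------
# Hypothetical entry point. The two arguments are paths to jsonlines files of
# annotated tweets; main() expects each record to provide obj['text'] and
# obj['annotation']['part1.Response']. The paths below are placeholders.
if __name__ == '__main__':
    main('data/positive_annotated.jsonl', 'data/death_annotated.jsonl')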