def train_classifier(df, labels, label_term_dict, label_adult_dict, label_actor_dict, label_actress_dict,
                     label_producer_dict, label_writer_dict, label_director_dict, label_composer_dict,
                     label_cinematographer_dict, label_editor_dict, label_prod_designer_dict, label_dir_adult_dict,
                     label_dir_actor_dict, label_dir_actress_dict, label_dir_producer_dict, label_dir_writer_dict,
                     label_dir_composer_dict, label_dir_cinematographer_dict, label_dir_editor_dict,
                     label_dir_prod_designer_dict, label_actor_actress_dict, label_to_index, index_to_label,
                     model_name, soft=False):
    """Train a HAN on IMDb documents pseudo-labeled from seed terms and cast/crew metadata dictionaries."""
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "imdb/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    X, y, y_true = get_train_data(df, labels, label_term_dict, label_adult_dict, label_actor_dict, label_actress_dict,
                                  label_producer_dict, label_writer_dict, label_director_dict, label_composer_dict,
                                  label_cinematographer_dict, label_editor_dict, label_prod_designer_dict,
                                  label_dir_adult_dict, label_dir_actor_dict, label_dir_actress_dict,
                                  label_dir_producer_dict, label_dir_writer_dict, label_dir_composer_dict,
                                  label_dir_cinematographer_dict, label_dir_editor_dict, label_dir_prod_designer_dict,
                                  label_actor_actress_dict, tokenizer, label_to_index, soft=soft)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                            max_sentences=max_sentences,
                                                            max_sentence_length=max_sentence_length,
                                                            max_words=max_words, val=False)
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    if not soft:
        # Hard pseudo-labels: standard cross-entropy on one-hot targets.
        model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    else:
        # Soft pseudo-labels: match the predicted distribution to the target distribution via KL divergence.
        model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])
    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
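# --- Illustrative sketch (not the repository's actual helpers) -----------------------------
# The trainers in this file assume make_one_hot / get_from_one_hot convert between string
# labels and one-hot / probability rows. The hypothetical *_sketch versions below only
# illustrate that assumed contract; the real implementations live elsewhere in this repo.
import numpy as np


def make_one_hot_sketch(y, label_to_index):
    # y: list of string labels; returns a (len(y), num_labels) one-hot matrix.
    one_hot = np.zeros((len(y), len(label_to_index)), dtype=np.float32)
    for i, lbl in enumerate(y):
        one_hot[i, label_to_index[lbl]] = 1.0
    return one_hot


def get_from_one_hot_sketch(pred, index_to_label):
    # pred: (num_docs, num_labels) scores; returns the argmax label string per row.
    return [index_to_label[int(i)] for i in np.argmax(pred, axis=-1)]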
def train_classifier(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                     label_author_pub_dict, label_pub_year_dict, label_author_year_dict, label_to_index,
                     index_to_label, model_name, clf, use_gpu, old=True, soft=False):
    """Train a HAN, BERT, or CNN classifier on pseudo-labeled book documents (seed terms plus
    author/publisher/year metadata) and report predictions on all documents."""
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "books/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    if old:
        X, y, y_true = get_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                                      label_author_pub_dict, label_pub_year_dict, label_author_year_dict, tokenizer,
                                      label_to_index, soft=soft, clf=clf)
        if clf == "BERT":
            # For BERT, get_train_data returns document indices; map them back to raw text.
            df_orig = pickle.load(open(basepath + dataset + "df.pkl", "rb"))
            X = list(df_orig.iloc[X]["text"])
    else:
        X, y, y_true = get_confident_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict,
                                                label_year_dict, label_author_pub_dict, label_pub_year_dict,
                                                label_author_year_dict, tokenizer)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    if clf == "HAN":
        print("Splitting into train, dev...")
        X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                                max_sentences=max_sentences,
                                                                max_sentence_length=max_sentence_length,
                                                                max_words=max_words, val=False)
        print("Getting Embedding matrix...")
        embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
        print("Initializing model...")
        model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                    embedding_matrix=embedding_matrix)
        print("Compiling model...")
        model.summary()
        if not soft:
            model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
        else:
            model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])
        print("model fitting - Hierarchical attention network...")
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
        mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                             verbose=1, save_weights_only=True, save_best_only=True)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
        # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
        # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
        #                         tokenizer=tokenizer)
        # pred = model.predict(X_label_all)
        # pred_labels = get_from_one_hot(pred, index_to_label)
        # print(classification_report(y_true, pred_labels))
        print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
        X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                          tokenizer=tokenizer)
        y_true_all = df["label"]
        pred = model.predict(X_all)
        pred_labels = get_from_one_hot(pred, index_to_label)
        print("Dumping the model...")
        model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
        model.save(dump_dir + "model_" + model_name + ".h5")
    elif clf == "BERT":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])
        model = train_bert(X, y_vec, use_gpu)
        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])
        # Note: df_orig is only loaded above when old=True; the BERT branch relies on that path.
        predictions = test(model, df_orig["text"], y_true_all, use_gpu)
        for i, p in enumerate(predictions):
            # Concatenate per-batch prediction arrays into one matrix.
            if i == 0:
                pred = p
            else:
                pred = np.concatenate((pred, p))
        pred_labels = []
        for p in pred:
            pred_labels.append(index_to_label[p.argmax(axis=-1)])
        y_true_all = df["label"]
    elif clf == "CNN":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])
        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])
        pred_idxs, pred, true_idxs = train_cnn(X, y_vec, df["text"], y_true_all, use_gpu)
        pred_labels = []
        for p in pred_idxs:
            pred_labels.append(index_to_label[p])
        y_true_all = []
        for p in true_idxs:
            y_true_all.append(index_to_label[p])
    else:
        raise ValueError("clf can only be HAN or BERT or CNN")
    print(classification_report(y_true_all, pred_labels))
    return pred_labels, pred
                  metrics=['acc'])
    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
    print("****************** CLASSIFICATION REPORT ON TEST DATA ********************")
    pred = model.predict(X_test)
    pred_labels = get_from_one_hot(pred, index_to_label)
    true_labels = get_from_one_hot(y_test, index_to_label)
    print(classification_report(true_labels, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
def train_classifier(df, labels, label_term_dict, label_to_index, index_to_label, dataset_path):
    """Train a HAN on seed-word pseudo-labels (ConWea-style) and report predictions on all documents."""
    print("Going to train classifier..")
    basepath = dataset_path
    model_name = "conwea"
    dump_dir = basepath + "models/" + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    tokenizer = pickle.load(open(dataset_path + "tokenizer.pkl", "rb"))
    X, y, y_true = generate_pseudo_labels(df, labels, label_term_dict, tokenizer)
    y_one_hot = make_one_hot(y, label_to_index)
    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val = create_train_dev(X, labels=y_one_hot, tokenizer=tokenizer,
                                                      max_sentences=max_sentences,
                                                      max_sentence_length=max_sentence_length,
                                                      max_words=max_words)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(dataset_path + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["sentence"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels
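# --- Illustrative sketch (not the actual generate_pseudo_labels used above) ----------------
# The ConWea-style trainer relies on generate_pseudo_labels(df, labels, label_term_dict, tokenizer)
# returning (X, y, y_true). A minimal seed-word-count version, assuming df has "sentence" and
# "label" columns as used above, might look like this; the real helper may weight or filter
# matches differently.
def generate_pseudo_labels_sketch(df, labels, label_term_dict, tokenizer):
    X, y, y_true = [], [], []
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    for _, row in df.iterrows():
        # Tokenize with the shared tokenizer so matching is done on in-vocabulary words only.
        words = set(index_word[tok] for tok in tokenizer.texts_to_sequences([row["sentence"]])[0])
        counts = {}
        for l in labels:
            hits = words.intersection(label_term_dict.get(l, {}))
            if hits:
                counts[l] = len(hits)
        if counts:
            # Keep only documents that match at least one seed word; label by majority count.
            X.append(row["sentence"])
            y.append(max(counts, key=counts.get))
            y_true.append(row["label"])
    return X, y, y_true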
def train_weight_classifier(df, labels, label_term_dict, label_author_dict, label_conf_dict, label_to_index,
                            index_to_label, model_name, AND=True):
    """Train a HAN on weighted pseudo-labeled DBLP abstracts, passing per-document confidence
    weights to Keras as sample weights."""
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "dblp/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    X, y, y_true, weights = get_weighted_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict,
                                                    tokenizer, label_to_index, AND=AND)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    y_vec = make_one_hot(y, label_to_index)
    print(classification_report(y_true, y))
    # y = np.array(y)
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, weights_train, _ = create_train_dev_weights(X, labels=y_vec, weights=weights,
                                                                                tokenizer=tokenizer,
                                                                                max_sentences=max_sentences,
                                                                                max_sentence_length=max_sentence_length,
                                                                                max_words=max_words)
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    # Per-document confidence weights scale each sample's contribution to the loss.
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc],
              sample_weight=np.array(weights_train))
    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["abstract"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
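# --- Illustrative sketch (not the repository's create_train_dev_weights) -------------------
# train_weight_classifier assumes create_train_dev_weights splits documents, one-hot labels, and
# per-document confidence weights with the same shuffle, so that sample_weight stays aligned with
# X_train in model.fit(..., sample_weight=np.array(weights_train)). A hypothetical version built
# on prep_data (the padding helper used above, assumed to return a NumPy array) might look like
# this; the real split logic may differ.
import numpy as np


def create_train_dev_weights_sketch(texts, labels, weights, tokenizer, max_sentences,
                                    max_sentence_length, max_words, dev_frac=0.1):
    X = prep_data(texts=texts, max_sentences=max_sentences,
                  max_sentence_length=max_sentence_length, tokenizer=tokenizer)
    y = np.asarray(labels)
    w = np.asarray(weights)
    idx = np.random.permutation(len(X))  # one shuffle shared by X, y, and w keeps them aligned
    cut = int(len(X) * (1 - dev_frac))
    train, dev = idx[:cut], idx[cut:]
    return X[train], y[train], X[dev], y[dev], w[train], w[dev]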
def train_classifier(df, tokenizer, embedding_matrix, labels, motpat_label_motifs_dict, label_to_index,
                     index_to_label, index_word, dataset_path, config):
    """Train a HAN on documents pseudo-labeled by counting label-indicative motifs
    (seed phrases and metadata entity patterns)."""

    def generate_pseudo_labels(df, labels, motpat_label_motifs_dict, tokenizer, index_word, config):
        y = []
        X = []
        for index, row in df.iterrows():
            count_dict = {}
            flag = 0
            for mot_pat in motpat_label_motifs_dict:
                label_motifs_dict = motpat_label_motifs_dict[mot_pat]
                if len(label_motifs_dict) == 0:
                    continue
                if mot_pat == "phrase":
                    # Count matches of label-indicative phrases in the document text.
                    tokens = tokenizer.texts_to_sequences([row["text"]])[0]
                    words = []
                    for tok in tokens:
                        words.append(index_word[tok])
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_words = set(label_motifs_dict[l].keys())
                        int_words = list(set(words).intersection(seed_words))
                        for word in int_words:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][word]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][word]
                else:
                    # Count matches of label-indicative metadata entities or entity pairs.
                    size = len(mot_pat)
                    if size == 1:
                        first = mot_pat[0]
                        entities = get_entity_from_col(row[first], first, config)
                    elif size == 2:
                        first = mot_pat[0]
                        second = mot_pat[1]
                        first_ents = get_entity_from_col(row[first], first, config)
                        second_ents = get_entity_from_col(row[second], second, config)
                        if first == second:
                            entities = set(itertools.combinations(first_ents, 2))
                        else:
                            entities = set(itertools.product(first_ents, second_ents))
                    else:
                        raise Exception("Motif patterns of size more than 2 not yet handled but can be easily extended.")
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_entities = set(label_motifs_dict[l].keys())
                        int_ents = list(entities.intersection(seed_entities))
                        for ent in int_ents:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][ent]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][ent]
            if flag:
                # Assign the label with the highest accumulated motif count.
                lbl = max(count_dict, key=count_dict.get)
                if not lbl:
                    continue
                y.append(lbl)
                X.append(row["text"])
        return X, y

    basepath = dataset_path
    model_name = "meta"
    dump_dir = basepath + "models/" + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    print("Generating pseudo-labels", flush=True)
    X, y = generate_pseudo_labels(df, labels, motpat_label_motifs_dict, tokenizer, index_word, config)
    y_vec = make_one_hot(y, label_to_index)
    print("Splitting into train, dev...", flush=True)
    X_train, y_train, X_val, y_val = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                      max_sentences=max_sentences,
                                                      max_sentence_length=max_sentence_length,
                                                      max_words=max_words)
    print("Initializing model...", flush=True)
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...", flush=True)
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    print("model fitting - Hierarchical attention network...", flush=True)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************", flush=True)
    X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels), flush=True)
    print("Dumping the model...", flush=True)
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
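# --- Illustrative sketch (not the repository's actual prep_data) ---------------------------
# Every trainer in this file feeds the HAN a 3-D tensor of shape
# (num_docs, max_sentences, max_sentence_length). A hypothetical prep_data that splits each
# document into sentences, tokenizes them, and zero-pads both axes could look like this; the
# real helper (and its sentence splitter) may differ.
import numpy as np
from nltk import tokenize


def prep_data_sketch(texts, max_sentences, max_sentence_length, tokenizer):
    data = np.zeros((len(texts), max_sentences, max_sentence_length), dtype=np.int32)
    for i, text in enumerate(texts):
        sentences = tokenize.sent_tokenize(text)[:max_sentences]
        seqs = tokenizer.texts_to_sequences(sentences)
        for j, seq in enumerate(seqs):
            seq = seq[:max_sentence_length]
            data[i, j, :len(seq)] = seq  # left-aligned word ids, zero-padded to fixed length
    return data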