def binarize_data(data, class_mapping=None, binarizer=None):
    """Fit *binarizer* on *data* and return ``(fitted_binarizer, transformed)``.

    Parameters
    ----------
    data : iterable of label collections to binarize.
    class_mapping : optional mapping of class name -> id; defaults to
        ``lrf_config.get_class_map()`` when not supplied.
    binarizer : optional pre-built binarizer; defaults to a
        ``MultiLabelBinarizer`` over the sorted ``class_mapping`` keys.

    Raises
    ------
    Exception
        Whatever ``fit``/``transform`` raises, re-raised after logging.
    """
    if class_mapping is None:
        class_mapping = lrf_config.get_class_map()
    if binarizer is None:
        binarizer = MultiLabelBinarizer(classes=sorted(class_mapping.keys()))
    try:
        fitted_binarizer = binarizer.fit(data)
        data_transformed = fitted_binarizer.transform(data)
    except Exception as e:
        # Log and re-raise. Previously execution fell through to the
        # return statement and hit a NameError on the undefined locals,
        # masking the real failure.
        print('See EXCEPTION BELOW... ')
        print(e)
        raise
    return fitted_binarizer, data_transformed
def keyword_driver(classifier_type, X_train, Y_train, num_of_keywords=50):
    """Train a classifier of *classifier_type* on the training corpus and
    return the top keywords per class.

    Parameters
    ----------
    classifier_type : identifier passed through to
        ``classifier.get_classification_model``.
    X_train : documents; lists of tokens are joined into strings first.
    Y_train : labels, binarized via ``utility.binarize_data``.
    num_of_keywords : how many keywords to extract per class.
    """
    # Join token lists into plain strings when every document is a list.
    if all(isinstance(doc, list) for doc in X_train):
        X_train = utility.get_str_from_list(X_train)

    class_map = lrf_config.get_class_map()
    binarizer = utility.get_multilabel_binarizer(class_map)
    Y_train_binary = utility.binarize_data(Y_train)

    vectorizer = utility.get_tf_idf_vectorizer(X_train)
    features = vectorizer.transform(X_train)

    # binarize_data returns (fitted_binarizer, transformed); index 1 holds
    # the binarized labels.
    model = classifier.get_classification_model(
        classifier_type, features, Y_train_binary[1])
    return get_keywords(X_train, model, binarizer,
                        num_of_keywords=num_of_keywords)
def get_keywords(train_data, classifier, binarizer, class_mapping=None, num_of_keywords=20):
    """Return a dict mapping each class key (except ``'skip'``) to its top
    keywords extracted from *classifier* via the tf-idf vocabulary.

    Parameters
    ----------
    train_data : training documents, used to (re)build the tf-idf vocabulary.
    classifier : fitted model passed through to ``get_keywords_for_tag``.
    binarizer : label binarizer passed through to ``get_keywords_for_tag``.
    class_mapping : optional mapping of class name -> id; defaults to
        ``lrf_config.get_class_map()``.
    num_of_keywords : keywords per class.
    """
    if class_mapping is None:
        # Resolved at call time: the old def-time default ran
        # get_class_map() once at import and froze (and shared) that
        # object across every call.
        class_mapping = lrf_config.get_class_map()
    tfidf_vocab = utility.get_tf_idf_vectorizer(train_data).vocabulary_
    tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()}
    top_keywords = {}
    for key in class_mapping:
        if key != 'skip':
            top_keywords[key] = get_keywords_for_tag(
                binarizer, classifier, key, tfidf_reversed_vocab, num_of_keywords)
    return top_keywords
def list_prepare(lists, lst_type):
    """Parse string-encoded lists into Python lists.

    Parameters
    ----------
    lists : iterable of strings, each a Python list literal (e.g. "[1, 2]").
    lst_type : ``str`` to return the parsed lists as-is; ``int`` to map each
        numeric item through the inverted class map from ``lc.get_class_map()``.

    Returns
    -------
    list
        Parsed lists; the string ``'None'`` is appended for any entry that
        fails to parse (errors are printed, matching the original behavior).
    """
    import ast  # local: safe literal parsing replaces eval() on data strings

    processed_lists = []
    if lst_type == str:
        for one_list in lists:
            try:
                # literal_eval only accepts Python literals — unlike eval it
                # cannot execute arbitrary code embedded in the data.
                processed_lists.append(ast.literal_eval(one_list))
            except Exception as e:
                print(one_list)
                print(e)
                processed_lists.append('None')
    elif lst_type == int:
        class_map = lc.get_class_map()
        inv_class_map = {v: k for k, v in class_map.items()}
        for one_list in lists:
            try:
                items = ast.literal_eval(one_list)
                processed_lists.append(
                    [inv_class_map.get(int(item)) for item in items])
            except Exception as e:
                print(e)
                processed_lists.append('None')
    return processed_lists
def main():
    """Driver: load the news keyword dataset, train an SVC on tf-idf
    features, and print the top keywords per class."""
    print("main_code goes here")
    classifier_type = 'svc'
    news_data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    train_data, test_data = utility.split_data(news_data)
    X_train = train_data['text']
    Y_train = train_data['category']
    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    Y_train_binary = utility.binarize_data(Y_train)
    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)
    # binarize_data returns (fitted_binarizer, transformed); pass the
    # transformed labels (index 1), consistent with keyword_driver —
    # the original passed the whole tuple.
    model = classifier.get_classification_model(classifier_type, X_tfidf, Y_train_binary[1])
    h = get_keywords(X_train, model, binarizer)
    print(h)
# NOTE(review): the lines replaced here were unresolved git merge-conflict
# residue (`<<<<<<< HEAD`, `=======`, `>>>>>>> 9af9d4b...` markers, plus
# two bodyless duplicate signatures of get_keywords). Both conflict sides
# of main() were identical apart from whitespace; resolved by keeping a
# single clean copy. The duplicate get_keywords fragments had no bodies
# and are dropped — get_keywords is already defined above.
def main():
    """Driver: load the news keyword dataset, train an SVC on tf-idf
    features, and print the top keywords per class."""
    print("main_code goes here")
    classifier_type = 'svc'
    news_data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    train_data, test_data = utility.split_data(news_data)
    X_train = train_data['text']
    Y_train = train_data['category']
    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    Y_train_binary = utility.binarize_data(Y_train)
    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)
    # binarize_data returns (fitted_binarizer, transformed); pass the
    # transformed labels (index 1), consistent with keyword_driver.
    model = classifier.get_classification_model(classifier_type, X_tfidf, Y_train_binary[1])
    h = get_keywords(X_train, model, binarizer)
    print(h)
#############################