示例#1
0
def binarize_data(data, class_mapping=None, binarizer=None):
    """Fit a label binarizer on ``data`` and return it with the encoded data.

    Args:
        data: Label collections; passed unchanged to ``binarizer.fit`` and
            then to ``transform`` on the object that ``fit`` returns.
        class_mapping: Mapping whose sorted keys are used as the ``classes``
            of the default binarizer. Defaults to
            ``lrf_config.get_class_map()``.
        binarizer: Object exposing ``fit(data)`` (returning the fitted
            object) and ``transform(data)``. Defaults to a
            ``MultiLabelBinarizer`` built from ``class_mapping``.

    Returns:
        A ``(fitted_binarizer, data_transformed)`` tuple.

    Raises:
        Exception: Whatever ``fit`` or ``transform`` raised. The error is
            printed first and then propagated unchanged.
    """
    if class_mapping is None:
        class_mapping = lrf_config.get_class_map()

    if binarizer is None:
        binarizer = MultiLabelBinarizer(classes=sorted(class_mapping))

    try:
        fitted_binarizer = binarizer.fit(data)
        data_transformed = fitted_binarizer.transform(data)
    except Exception as e:
        # Keep the diagnostic output, but propagate the real failure.
        # Swallowing it here left both result names unbound, so the return
        # statement failed with an unrelated UnboundLocalError instead.
        print('See EXCEPTION BELOW... ')
        print(e)
        raise

    return fitted_binarizer, data_transformed
示例#2
0
def keyword_driver(classifier_type,X_train,Y_train,num_of_keywords=50):
    """Train a classifier on TF-IDF features and return keywords per class.

    Args:
        classifier_type: Forwarded to ``classifier.get_classification_model``.
        X_train: Training documents. When every element is a ``list`` the
            collection is first passed through ``utility.get_str_from_list``.
        Y_train: Training labels, encoded with ``utility.binarize_data``.
        num_of_keywords: Forwarded to ``get_keywords``.

    Returns:
        Whatever ``get_keywords`` returns for the trained model.
    """
    documents = X_train
    if all(isinstance(doc, list) for doc in documents):
        documents = utility.get_str_from_list(documents)

    label_binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())

    binarized = utility.binarize_data(Y_train)

    features = utility.get_tf_idf_vectorizer(documents).transform(documents)

    # Element 1 of the binarize_data result is handed to the model as the
    # label data (presumably the transformed label matrix - confirm against
    # utility.binarize_data).
    trained_model = classifier.get_classification_model(
        classifier_type, features, binarized[1])

    return get_keywords(documents, trained_model, label_binarizer,
                        num_of_keywords=num_of_keywords)
示例#3
0
def get_keywords(train_data, classifier, binarizer, class_mapping=None, num_of_keywords=20):
    """Collect the top keywords for every class except ``'skip'``.

    Args:
        train_data: Documents passed to ``utility.get_tf_idf_vectorizer`` to
            obtain the vocabulary.
        classifier: Trained model, forwarded to ``get_keywords_for_tag``.
        binarizer: Label binarizer, forwarded to ``get_keywords_for_tag``.
        class_mapping: Mapping whose keys are the class names to process.
            Defaults to ``lrf_config.get_class_map()``, looked up on each
            call rather than once at import time.
        num_of_keywords: Forwarded to ``get_keywords_for_tag``.

    Returns:
        Dict mapping each class key (other than ``'skip'``) to the result of
        ``get_keywords_for_tag`` for that key.
    """
    if class_mapping is None:
        class_mapping = lrf_config.get_class_map()

    tfidf_vocab = utility.get_tf_idf_vectorizer(train_data).vocabulary_

    # vocabulary_ maps word -> column index; invert it to index -> word.
    tfidf_reversed_vocab = {index: word for word, index in tfidf_vocab.items()}

    return {
        key: get_keywords_for_tag(binarizer, classifier, key,
                                  tfidf_reversed_vocab, num_of_keywords)
        for key in class_mapping
        if key != 'skip'
    }
示例#4
0
文件: datasets.py 项目: goswaank/lrfi
def list_prepare(lists,lst_type):
    """Parse stringified lists, optionally mapping class ids back to keys.

    Args:
        lists: Iterable of strings, each evaluated as a Python expression.
        lst_type: ``str`` keeps each evaluated value as is. ``int`` converts
            every item of the evaluated value with ``int()`` and looks it up
            in the inverse of ``lc.get_class_map()``; ids without an entry
            become ``None``. Any other value yields an empty result.

    Returns:
        A list with one entry per input. Entries that fail to parse or
        convert are replaced by the string ``'None'`` after the error is
        printed.
    """
    # NOTE(security): eval() executes arbitrary expressions found in the
    # input. This is only safe for trusted data; ast.literal_eval would be
    # the safer choice if the inputs are always plain literals - confirm.
    prepared = []

    if lst_type == str:
        for raw in lists:
            try:
                parsed = eval(raw)
            except Exception as err:
                print(raw)
                print(err)
                prepared.append('None')
            else:
                prepared.append(parsed)

    elif lst_type == int:
        # Invert the class map so values can be used to look up their keys.
        key_by_id = dict((value, key) for key, value in lc.get_class_map().items())

        for raw in lists:
            try:
                prepared.append([key_by_id.get(int(item)) for item in eval(raw)])
            except Exception as err:
                print(err)
                prepared.append('None')

    return prepared
示例#5
0
def main():
    """Train an SVC keyword model on the news data and print its keywords.

    Loads the data through ``datasets.get_news_data``, splits it with
    ``utility.split_data``, fits a classifier on TF-IDF features of the
    ``'text'`` column against the binarized ``'category'`` column, and prints
    the result of ``get_keywords``.
    """
    print("main_code goes here")

    classifier_type = 'svc'

    news_data = datasets.get_news_data('keyword_data',
                                       'annotator_data_dump_with_text')
    # Only the training part of the split is used here.
    train_data, _ = utility.split_data(news_data)

    X_train = train_data['text']
    Y_train = train_data['category']

    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    Y_train_binary = utility.binarize_data(Y_train)

    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)

    # binarize_data returns a (binarizer, transformed_data) pair; the model
    # needs only the transformed labels, as in keyword_driver.
    model = classifier.get_classification_model(classifier_type, X_tfidf,
                                                Y_train_binary[1])

    h = get_keywords(X_train, model, binarizer)
    print(h)
def main():
    """Train an SVC keyword model on the news data and print its keywords.

    Loads the data through ``datasets.get_news_data``, splits it with
    ``utility.split_data``, fits a classifier on TF-IDF features of the
    ``'text'`` column against the binarized ``'category'`` column, and prints
    the result of ``get_keywords``.
    """
    print("main_code goes here")

    classifier_type = 'svc'

    news_data = datasets.get_news_data('keyword_data', 'annotator_data_dump_with_text')
    # Only the training part of the split is used here.
    train_data, _ = utility.split_data(news_data)

    X_train = train_data['text']
    Y_train = train_data['category']

    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    Y_train_binary = utility.binarize_data(Y_train)

    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)

    # binarize_data returns a (binarizer, transformed_data) pair; the model
    # needs only the transformed labels, as in keyword_driver.
    model = classifier.get_classification_model(classifier_type, X_tfidf, Y_train_binary[1])

    h = get_keywords(X_train, model, binarizer)
    print(h)


#############################