示例#1
0
文件: synth.py 项目: daureg/magnet
def evaluate_solution(users, urecovered, observed_index, xs=None, E=None,
                      hidden_edges=None):
    """Evaluate the quality of the recovered user profile.

    Returns a tuple ``(mse, f1)``: the mean squared error between the true
    and recovered profiles on the observed rows, plus a sample-averaged F1
    score over predicted directions for the hidden edges (``None`` when no
    hidden edges are given).
    """
    mse = mean_squared_error(users[observed_index, :],
                             urecovered[observed_index, :])
    if not hidden_edges:
        return mse, None
    edges = sorted(hidden_edges)
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in edges])
    heads, tails = zip(*edges)
    profile_scores = np.dot(urecovered, xs.T)
    gaps = np.abs(profile_scores[heads, :] - profile_scores[tails, :])
    # TODO prediction here could be better: instead of predict the k best
    # directions all the time, look at revealed edge to compute threshold of
    # similarity (i.e replace 0.05)
    top_two = np.argsort(gaps, 1).astype(int)[:, :2]
    pred = []
    for gap_row, candidates in zip(gaps, top_two):
        # always keep the closest direction; keep the runner-up only when
        # its gap is below the (hard-coded) similarity threshold
        chosen = [candidates[0]]
        if gap_row[candidates[1]] < 0.05:
            chosen.append(candidates[1])
        pred.append(chosen)
    return mse, f1_score(gold, labeler.fit_transform(pred), average='samples')
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
	"""Train a one-vs-rest linear SVM on TF-IDF features and, for each test
	document, write an XML file whose <Sent> elements carry the sentence text
	and the comma-joined predicted class names."""
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	from sklearn.preprocessing import MultiLabelBinarizer
	binarizer = MultiLabelBinarizer()
	label_matrix = binarizer.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.svm import LinearSVC
	classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)

	for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
		test_sentences = doc2sentences([test_doc])
		sentence_matrix = tfidf.transform(test_sentences)
		print("Shape of sentence matrix : ", sentence_matrix.shape)
		predictions = classifier.predict(sentence_matrix)

		from lxml import etree
		root = etree.Element('doc')
		tree = etree.ElementTree(root)
		for idx, sentence in enumerate(test_sentences):
			tagged = [binarizer.classes_[col]
					  for col in range(predictions.shape[1])
					  if predictions[idx][col] == 1]
			node = etree.SubElement(root, "Sent", classes=", ".join(tagged))
			node.text = sentence
		tree.write(output_file_path)
示例#3
0
def generateTrainFeatures(L):
    """
    Generate the training feature matrix and its binarized target labels.

    Input:  L - the number of training samples to read from stdin.
    Output: (trainX, trainY) where trainX is an (L x 2000) TF-IDF matrix of
            the most frequent words and trainY is an (L x n_classes) binary
            indicator matrix produced by MultiLabelBinarizer.

    For each sample, one line of space-separated class ids is read first
    (its leading token is a count, not a label, and is discarded), then the
    preprocessed text is read via readInput(). The module-level `vectorizer`
    builds the TF-IDF features and the fitted class order is stored in the
    global `classOrder`.
    """
    global classOrder
    texts, targets = [], []
    for _ in range(L):
        header = raw_input()
        # drop the leading token: it is a count, not a class label
        labels = [int(tok) for tok in header.split(" ")][1:]
        words = readInput()
        targets.append(labels)
        texts.append(words)
    # TF-IDF features for the collected documents
    trainX = vectorizer.fit_transform(texts).toarray()
    # list-of-label-lists -> (n_samples x n_classes) indicator matrix
    binarizer = MultiLabelBinarizer()
    trainY = binarizer.fit_transform(targets)
    # remember the column order of the classes for later decoding
    classOrder = binarizer.classes_
    return (trainX, trainY)
示例#4
0
def read_all_data(p):
    """Load the pickled product frame and the image blobs, keeping only
    products that have an image.

    NOTE(review): the parameter `p` is unused — kept for interface
    compatibility; confirm with callers whether it was meant to be a path.

    Returns:
        (x_images, x_desc, y_total): numpy array of image data, numpy array
        of descriptions, and the MultiLabelBinarizer-encoded category matrix.
    """
    img_src = "images/"

    df = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images(img_src)
    print("Finished reading images")

    x_images = []
    x_desc = []
    y_category = []

    # keep only rows whose ASIN has a matching image
    for asin in df.index.values:
        if asin in images:
            x_images.append(images[asin])
            item = df.loc[asin]
            x_desc.append(item.description)
            y_category.append(item.categories)
            # (removed dead code: an `all_categories` set was accumulated
            # here but never used anywhere)

    print("Finished reading dataframe")
    # list-of-category-lists -> binary indicator matrix
    mlb = MultiLabelBinarizer()
    y_total = mlb.fit_transform(y_category)
    x_images = np.array(x_images)
    x_desc = np.array(x_desc)

    return x_images, x_desc, y_total
示例#5
0
def load_data(config=None):
    """
    Load the Reuters dataset.

    Parameters
    ----------
    config : dict, optional
        Currently unused; accepted for interface compatibility.
        (FIX: the default was a mutable ``{}`` — replaced by ``None``.)

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    # fit the vectorizer on the training corpus only, then reuse it
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    # same for the label binarizer: fit on train, transform test
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
def get_training_data(window_size_ms, train_time_sec=30):
	#loop until empty input is detected
	X = []
	y = []

	print "Training time for each key is {} seconds".format(train_time_sec)
	i = 0
	while True:
		s = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
		if s: break

		j = 0
		while j < train_time_sec:
			j += (window_size_ms / float(1000))
			freq_spect = read_spectral_data_for_time(window_size_ms)
			X.append(freq_spect)
			y.append([i])

		#increment key counter
		i += 1

	mb = MultiLabelBinarizer()
	y = mb.fit_transform(y)

	X = np.asarray(X)
	y = np.asarray(y)
	return X, y
def main():
    #Explore the data for how many class labels
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews Dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.iteritems():
        if value not in usefulCountDict:
            usefulCountDict[value] = 1
        else:
            usefulCountDict[value] = usefulCountDict[value]+1
    pprint(usefulCountDict)
    '''
    corpus, target = DictToList(reviewsDict)
    
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    
    print "Dumped featrue vectors .... "
def run_classifier(sentences, labels, test_docs):
	"""Train a one-vs-rest linear SVM on TF-IDF features of `sentences`,
	predict classes for every sentence of `test_docs`, and write
	(sentence, predicted-labels) rows to classified.csv."""
	import numpy as np

	train_matrix, tfidf = tf_idf_fit_transform(sentences)

	test_sentences = doc2sentences(test_docs)
	sentence_matrix = tfidf.transform(test_sentences)
	print("Shape of sentence matrix : ", sentence_matrix.shape)

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer()
	label_matrix = mlb.fit_transform(labels)

	from sklearn.multiclass import OneVsRestClassifier
	# BUG FIX: the class is `LinearSVC` — `linearSVC` raised ImportError.
	from sklearn.svm import LinearSVC
	# estimator = SVC(kernel='linear')
	estimator = LinearSVC()
	classifier = OneVsRestClassifier(estimator, n_jobs=-1)
	classifier.fit(train_matrix, label_matrix)
	predictions = classifier.predict(sentence_matrix)

	import csv
	with open("classified.csv", "w") as fl:
		writer = csv.writer(fl)
		for i in range(len(test_sentences)):
			# map the indicator columns back to readable label names
			curr_pred = [mlb.classes_[x] for x in range(predictions.shape[1]) if predictions[i][x]==1]
			writer.writerow((test_sentences[i], curr_pred))
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                                        train_size=ds.DEFAULT_TRAININGSET_SIZE):

    """
    Get all document ids of the given database (capped at 10001 documents)
    together with their tag lists, and split them into training and test
    sets, stratified by the binarized tags.

    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists
                    [[DEFAULT_DATASET_LIST_INDEX_TRAINING],
                    [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """

    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()

    doc_ids_list = []
    all_tag_list = []

    for i, row in enumerate(all_docs.rows):

        document = row.doc
        # append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])

        tag_list = []

        # if document has tags then split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():

            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]

            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)

            for tag in tags_list:
                # remove the closing tag character (last item)
                tag_list.append(tag[:-1])

        # append the list of document tags to all_tag_list
        all_tag_list.append(tag_list)

        # hard cap so very large databases stay manageable
        if i >= 10000:
            break

    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)

    print(len(doc_ids_list))

    # BUG FIX: honour the train_size parameter (it was hard-coded to 0.8)
    # and actually return the split (the original returned None).
    splitted_dataset = cross_validation.train_test_split(
        doc_ids_list, tags_encoded,
        train_size=train_size, random_state=42,
        stratify=tags_encoded)
    return splitted_dataset
示例#10
0
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        # Per-question feature dicts and label sets (helpers defined elsewhere
        # in this module).
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        # Fit a fresh vectorizer when none is given (training set); otherwise
        # reuse the caller's fitted vectorizer (test/validation set).
        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        # Same pattern for the label binarizer.
        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted) """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # number of questions where all correct paths have been recalled
        # (row sums agree because Ypred * self.Y keeps only true positives)
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        # ((Ypred + Y) == 2 marks cells where both prediction and gold are 1)
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    """Assemble the sparse feature matrix and sentiment targets for tweets.

    Stacks n-gram/char-gram features with hand-made surface features,
    POS features, word-cluster memberships and several lexicon scores,
    then maps the category strings to {1, -1, 0} targets.
    Returns (ffeatures, y) with ffeatures row-normalized.
    """
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        # surface-level punctuation/emoticon features per tweet
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        # lower-cased tokens, negation suffix stripped, for cluster lookup
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))


    # lexicon-based feature blocks (each returns one row per tweet)
    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    # binarize cluster memberships over the fixed set of cluster ids
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
#    sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
#    sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, 
#                             sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
#    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    # horizontally stack all feature blocks into a single sparse matrix
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)

#     print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape, 
#     sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    # map sentiment categories to numeric targets; anything unexpected is
    # printed (and silently skipped -- y may end up shorter than ffeatures)
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
#     ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
def xval(clf, x, y, train_index, test_index):
    """Run one cross-validation fold: fit `clf` on the train split and
    return (mse, acc, evals) on the test split.

    mse  - mean squared error between the one-hot encoded true labels and
           the predicted class probabilities.
    acc  - accuracy of the argmax prediction.
    evals - number of evaluations reported by the classifier.
    """
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)
    # BUG FIX: label_binarize already yields the binary indicator matrix;
    # re-encoding it with MultiLabelBinarizer().fit_transform() treated each
    # 0/1 row as a *collection of labels* {0, 1}, collapsing the target to
    # two columns and corrupting the MSE.
    mse = mean_squared_error(label_binarize(y_test, clf.classes_), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
示例#13
0
    def test_BRKnna_no_labels_take_closest(self):
        """Fit a 2-NN classifier (mode 'a', threshold 0.6) and check the
        single-row prediction against the expected indicator vector."""
        features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
        label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
        binarizer = MultiLabelBinarizer(sparse_output=True)
        targets = binarizer.fit_transform(label_lists)
        classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
        classifier.fit(features, targets)

        prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
        print(prediction)
        np.testing.assert_array_equal([[1, 0, 0, 0, 0]], prediction)
示例#14
0
    def test_BRKnna_predict_dense(self):
        """Fit a 3-NN classifier (mode 'a', threshold 0.5) on dense-ish data
        and verify the predicted indicator row."""
        features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
        label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
        binarizer = MultiLabelBinarizer()
        targets = binarizer.fit_transform(label_lists)

        classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
        classifier.fit(features, targets)

        prediction = classifier.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
        np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
示例#15
0
    def test_BRKnnb_predict_two_samples(self):
        """Predict two query rows at once with a mode 'b' classifier and
        compare against the expected two-row indicator matrix."""
        features = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
        label_lists = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid5'], ['lid4', 'lid5']]
        binarizer = MultiLabelBinarizer(sparse_output=True)
        targets = binarizer.fit_transform(label_lists)

        classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
        classifier.fit(features, targets)

        prediction = classifier.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], prediction)
示例#16
0
def main():
    """Train a one-vs-rest linear SVM on tagged document sets, report a
    classification report on the test split, and compute per-class score
    thresholds capturing the top 30% of test documents."""
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #rint(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set

    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    # reuse the fitted vectorizer/transformer (transform, not fit_transform)
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    # decision_function scores are used below for per-class thresholding
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        # sort this class's scores descending; the threshold is the score of
        # the document at the top-30% cut-off
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)


    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
示例#17
0
def get_data(train_file, test_file):
    """Load train/test files, keep the second tab-separated field of each
    line as the text, binarize the underscore-separated label strings
    (dropping the 'None' marker), and return texts, targets and classes."""
    X_train, Y_train = load_data(train_file)
    X_train = [line.split('\t')[1] for line in X_train]
    X_test, Y_test = load_data(test_file)
    X_test = [line.split('\t')[1] for line in X_test]

    binarizer = MultiLabelBinarizer()
    train_label_sets = [set(s.split('_')) - {'None'} for s in Y_train]
    test_label_sets = [set(s.split('_')) - {'None'} for s in Y_test]
    # fit on train labels only; reuse the mapping for the test labels
    Y_train = binarizer.fit_transform(train_label_sets)
    Y_test = binarizer.transform(test_label_sets)

    return X_train, X_test, Y_train, Y_test, binarizer.classes_
示例#18
0
    def test_BRKnnb_auto_optimize_k(self):
        """With a fixed validation split, auto-optimization over the
        candidate ks [1, 3] must settle on k=3 and predict accordingly.

        NOTE(review): this patches BRKNeighborsClassifier._get_split at class
        level, which leaks into later tests — confirm that is intended.
        """
        data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
        train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid2', 'lid3'], ['lid0', 'lid1']]
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_ids)

        knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3], auto_optimize_k=True)

        # noinspection PyUnusedLocal
        def fun(s, X, y_):
            # deterministic split: train on rows 1-3, validate on row 0
            return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

        BRKNeighborsClassifier._get_split = fun
        knn.fit(data, y)
        # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(3, knn.n_neighbors)
        pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
        np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)
示例#19
0
文件: classify.py 项目: vlasy/skool
def get_all_data():
    ''' Get data for educational subjects classifier'''
    print "Gathering data"
    data_train = []
    labels_train_normal = []
    for page in [x for x in Page.objects if x.cze]:
        data_train.append(page.cleannostops.encode('utf-8'))
        labs = [x.id for x in page.labels_all]
        labs = set(labs)
        labels_train_normal.append(labs)
    mlb = MultiLabelBinarizer()
    labels_train = mlb.fit_transform(labels_train_normal)
    print "Saving MLB"
    path = os.path.join(makepath('model'), DEFAULT_FILENAMES['MultiLabelBinarizer'] + '.pkl')
    joblib.dump(mlb, path)
    print "Saved"
    return (data_train, labels_train)
示例#20
0
    def _get_coursera_corpus(self):
        """collect coursera course text and metadata

        Reads ./data/coursera/coursera_courses.json, keeps English courses,
        and returns (course_list, course_text_list, course_id_to_index).
        When self.categorizer is set, additionally binarizes course
        categories (keeping only categories with > 40 courses) and stores
        the results on self.
        """
        with open('./data/coursera/coursera_courses.json') as c_file:
            coursera_courses = json.load(c_file)

        course_id_to_index = {} # dict to allow reverse searching from id
        course_text_list = []
        course_list = []
        course_categories = []

        i = 0
        for course in coursera_courses['elements']:
            # only English-language courses are kept
            if course['language'] == 'en':
                course_id_to_index[course['id']] = i
                course_text_list.append(self.concatenate_coursera_text_data(course))
                course_list.append(course)
                if self.categorizer:
                    # [-1] is the placeholder for courses with no category
                    course_categories.append(course['links'].get('categories', [-1]))
                i += 1


        if self.categorizer:
            # get category list
            cat_info_list = coursera_courses['linked']['categories']
            self.cat_id_to_name = {cat['id']:
                {'name':cat['name'], 'shortName':cat['shortName']} for cat in cat_info_list}

            # binarize labels and discard low-count categories    
            mlb = MultiLabelBinarizer()
            course_cats_binarized = mlb.fit_transform(course_categories)

            # filter to only tags with > 40 courses
            mask = course_cats_binarized.sum(axis=0) > 40
            course_cats_binarized = course_cats_binarized[:, mask]
            self.course_cats_binarized = course_cats_binarized

            # create dict to get back from masked index, to index, to id
            label_arr_to_cat_id = {}
            for i, k in enumerate(mask.nonzero()[0].tolist()):
                label_arr_to_cat_id[i] = mlb.classes_[k]

            self.label_arr_to_cat_id = label_arr_to_cat_id


        return course_list, course_text_list, course_id_to_index
def load_movie_data():
    """Parse labels_summary.txt ('--'-separated fields: ... -- labels -- summary)
    into summaries and encoded labels.

    Returns (x_data, y_data, mlb, lb): summaries, the binarized label matrix,
    and the two fitted encoders. Lines whose labels all fall outside the
    known label set are skipped.

    NOTE(review): each per-line `lb.transform(labels)` result (a 2-D array)
    is appended to y_data and then re-encoded by MultiLabelBinarizer —
    preserved as-is, but confirm this double encoding is intended.
    """
    x_data, y_data = [], []
    lb = createLB()
    mlb = MultiLabelBinarizer()
    label_set = get_labels_set()
    # FIX: use a context manager — the original leaked the file handle.
    with open("labels_summary.txt", "r") as fr:
        for line in fr.readlines():
            line = line.rstrip()
            line_datas = line.split("--")
            summary = line_datas[-1]
            labels = line_datas[-2].split(' ')
            # keep only known labels; skip lines with none left
            labels = [item for item in labels if item in label_set]
            if len(labels) == 0:
                continue
            labels = lb.transform(labels)
            x_data.append(summary)
            y_data.append(labels)
    y_data = mlb.fit_transform(y_data)
    return x_data, y_data, mlb, lb
def create_model(key, answers, tags):
    """Fit a CountVectorizer -> TF-IDF -> one-vs-rest linear SVM pipeline on
    the answers and pickle the (binarizer, pipeline) pair to <folder>/<key>.pickle."""
    filename = '%s/%s.pickle' % (folder, key)

    binarizer = MultiLabelBinarizer()
    target = binarizer.fit_transform(tags)

    model = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    model.fit(np.array(answers), target)

    # Serialize both the pipeline and binarizer to disk.
    with open(filename, 'wb') as f:
        pickle.dump((binarizer, model), f)
示例#23
0
class predictor():
    """One-vs-rest genre predictor over movie colour and subtitle features.

    FIX: the original `predict` body was tab-indented inside a space-indented
    method (TabError under Python 3); indentation is now consistent spaces.
    """
    # In this perhaps pass in customizable parameters so
    # __init__(self, loss="hinge", penalty="l2")
    # This way, we can try out different loss functions easily
    def __init__(self):
        self.trainExamples = ['exodus_gods_and_kings.p', 'how_to_train_your_dragon_2.p', 'bears.p', 'see_no_evil_2.p', 'addicted.p', "the_internet's_own_boy_the_story_of_aaron_swartz.p", 'the_salt_of_the_earth.p', 'the_other_woman.p', 'project_almanac.p', 'edge_of_tomorrow.p', 'maya_the_bee_movie.p', 'cowspiracy_the_sustainability_secret.p', "let's_be_cops.p", "winter's_tale.p", 'the_trip_to_italy.p', 'yellowbird.p', 'alexander_and_the_terrible_horrible_no_good_very_bad_day.p', 'rosewater.p', 'the_hero_of_color_city.p', 'endless_love.p', 'dracula_untold.p', 'dumb_and_dumber_to.p', 'tomorrowland.p', 'the_hunger_games_mockingjay_part_1.p', 'tammy.p', 'hot_tub_time_machine_2.p', 'lucy.p', 'the_lego_movie.p', 'the_judge.p', 'cake.p', 'st_vincent.p', 'black_or_white.p', 'american_sniper.p', 'mr_peabody_&_sherman.p', 'this_is_where_i_leave_you.p', 'x-men_days_of_future_past.p', 'non-stop.p', 'get_on_up.p', 'the_fault_in_our_stars.p', 'song_one.p', 'robocop.p', 'into_the_storm.p', 'a_most_wanted_man.p', 'the_good_lie.p', 'wild.p', 'the_maze_runner.p', 'beyond_the_lights.p', 'divergent.p', 'spring.p', 'as_above_so_below.p', 'noble.p', 'hercules.p', 'i-lived&y=2015.p', 'night_at_the_museum_secret_of_the_tomb.p', 'planes:fire_&_rescue.p', 'old_fashioned.p', 'the_identical.p', 'dawn_of_the_planet_of_the_apes.p', 'cabin_fever_patient_zero.p', 'ride_along.p', 'dear_white_people.p', 'if_i_stay.p', 'red_army.p', 'the_boxtrolls.p', 'captain_america_the_winter_soldier.p', 'virunga.p', 'the_interview.p', 'earth_to_echo.p', 'a_walk_among_the_tombstones.p', 'persecuted.p', 'the_book_of_life.p', 'unbroken.p', 'the_drop.p', 'need_for_speed.p', 'brick_mansions.p', 'maleficent.p', 'blended.p', "devil's_due.p", 'jessabelle.p', 'fear_clinic.p', 'gone_girl.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'kill_the_messenger.p', 'my_little_pony_equestria_girls.p', 'rio_2.p', 'big_hero_6.p', 'guardians_of_the_galaxy.p', 'noah.p', 'the_hobbit_the_battle_of_the_five_armies.p', 
'i_frankenstein.p', 'the_november_man.p', 'the_pyramid.p', 'and_so_it_goes.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'inherent_vice.p', 'merchants_of_doubt.p', 'iris.p', 'lambert,_stamp.p']
        # test set: every movie not used for training
        self.testExamples = [x for x in util2.getMovieDataset() if x not in self.trainExamples]
        # Standard DictVectorizer fitted with all colors as the features.
        self.dVec = DictVectorizer(sparse=False)
        self.dVec.fit([dict((feature,0) for feature in util2.getColors())])
        # Standard MultiLabelBinarizer with all genre names 
        self.mlb = MultiLabelBinarizer()
        # feature union: colour histogram features + subtitle TF-IDF,
        # weighted equally, feeding an SGD perceptron-loss classifier
        self.pipeline = Pipeline([
            ('organizeData', Movie_Data_Aggregator()),
            ('union', FeatureUnion(
                transformer_list = [
                ('colors', Pipeline([
                    ('selector', Data_Selector(key='colors')),
                    ('dVec', self.dVec),
                    ])),
                ('subs', Pipeline([
                    ('selector', Data_Selector(key='subs')),
                    ('tfidf', TfidfVectorizer(strip_accents='ascii', max_features=15)),
                    ])),
                ],
                transformer_weights={
                'colors': 0.5,
                'subs': 0.5,
                },
                )),
            ('sgd', SGDClassifier(alpha= 1e-06, loss="perceptron", n_iter= 150, penalty="l2")),
            ])
        # OneVsRestClassifier used for prediction
        self.classif = OneVsRestClassifier(self.pipeline)

    def learnPredictor(self, numbers=False):
        """Fit the one-vs-rest classifier on the training movies and return
        the binarized training genres."""
        train_genres = self.mlb.fit_transform(util2.getCorrectGenres(self.trainExamples))
        self.classif.fit(self.trainExamples, train_genres)
        return train_genres

    def predict(self, numbers=False):
        """Predict genre indicator rows for the held-out test movies."""
        return self.classif.predict(self.testExamples)
class MultiLabelDataset(Dataset):
    """Dataset yielding (RGB image, multi-hot tag vector) pairs.

    The CSV must provide an 'image_name' column and a space-separated
    'tags' column; the tags are binarized once at construction time.
    """

    def __init__(self, csv_path, img_path, transform=None):
        frame = pd.read_csv(csv_path)

        self.mlb = MultiLabelBinarizer()
        self.img_path = img_path
        self.transform = transform

        self.X_train = frame['image_name']
        # binarize the space-separated tag strings into float32 rows
        self.y_train = self.mlb.fit_transform(
            frame['tags'].str.split()).astype(np.float32)

    def __getitem__(self, index):
        image = Image.open(os.path.join(self.img_path, self.X_train[index]))
        image = image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        target = torch.from_numpy(self.y_train[index])
        return image, target

    def __len__(self):
        return len(self.X_train.index)
def run_classifierAccuracy(terms, labels, testSentences, testLabels):
	"""Predict disaster categories for test sentences via TF-IDF cosine
	similarity against per-class term lists, then print precision/recall/F1.

	NOTE(review): the incoming ``labels`` argument is immediately replaced
	by the fixed category list below (matching the original behaviour).
	"""
	labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane", \
			"Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations", \
			"donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
			"injured_or_dead_people", "missing_trapped_or_found_people"]
	import numpy as np
	class_terms_matrix, tfidf = tf_idf_fit_transform(terms)
	sentence_matrix = tfidf.transform(testSentences)
	print("Shape of sentence matrix : ", sentence_matrix.shape)

	from sklearn.metrics.pairwise import cosine_similarity
	# Binary relevance: each sentence receives every class whose
	# similarity survives binary_rel.
	similarity_matrix = binary_rel(
		cosine_similarity(sentence_matrix, class_terms_matrix))

	predictions = [
		[labels[col] for col in range(similarity_matrix.shape[1])
			if similarity_matrix[row][col] == 1]
		for row in range(len(testSentences))
	]

	from sklearn.preprocessing import MultiLabelBinarizer
	mlb = MultiLabelBinarizer(classes=labels)
	test_label_matrix = mlb.fit_transform(testLabels)
	predictions = mlb.transform(predictions)
	print("Shape of label matrix : ", test_label_matrix.shape)
	print("Labels : ", mlb.classes_)

	from sklearn.metrics import f1_score, precision_score, recall_score
	# Micro then macro averages, then per-class scores (average=None); the
	# "Macro-" prefix on the per-class rows mirrors the original output.
	for prefix, avg in (("Micro", 'micro'), ("Macro", 'macro'), ("Macro", None)):
		print(prefix + "-Precision", precision_score(test_label_matrix, predictions, average=avg))
		print(prefix + "-Recall", recall_score(test_label_matrix, predictions, average=avg))
		print(prefix + "-F1", f1_score(test_label_matrix, predictions, average=avg))
示例#26
0
def _create_classifier():
    """Build and fit the multi-label text classifier.

    Returns a pair ``(clf, mlb)``: a fitted OneVsRestClassifier wrapping a
    grid-searched TF-IDF + LinearSVC pipeline, and the fitted
    MultiLabelBinarizer used to encode the labels.
    """
    # Each training line is "<text> <label1_label2_...>"; use a context
    # manager so the file handle is closed (the original leaked it).
    with open(_download()) as train_file:
        data_train = [ln.rsplit(None, 1) for ln in train_file]
    X_train, Y_train = zip(*data_train)
    del data_train

    mlb = MultiLabelBinarizer()
    # Labels are '_'-joined sets; 'None' is a placeholder, not a real label.
    Y_train = [set(s.split('_')) - {'None'} for s in Y_train]
    Y_train = mlb.fit_transform(Y_train)

    clf = make_pipeline(TfidfVectorizer(sublinear_tf=True, use_idf=False),
                            LinearSVC(dual=False))
    # XXX class_weight="auto" causes a lot of deprecation warnings, but it
    # still fares better than the new class_weight="balanced" heuristic.
    # n_jobs=-1 causes nosetests to hang so that is disabled for now.
    params = {'tfidfvectorizer__use_idf': [True, False],
              'tfidfvectorizer__sublinear_tf': [True, False],
              'linearsvc__class_weight': ["auto", None],
              'linearsvc__C': [.01, .1, 1, 10, 100, 1000],
              'linearsvc__penalty': ['l1', 'l2'],
             }
    clf = OneVsRestClassifier(_GridSearch(clf, params, scoring='f1',
                                              verbose=1, cv=5))
    return clf.fit(X_train, Y_train), mlb
示例#27
0
def train(window_size_ms, train_time_sec=30, clf = OneVsRestClassifier(DecisionTreeClassifier()), n_keys=2):
	# Interactively record spectral samples for each of n_keys+1 labels and
	# fit ``clf`` on them; returns the fitted (classifier, binarizer) pair.
	# NOTE(review): Python 2 source (print statement, raw_input).
	# NOTE(review): ``clf`` is a mutable default argument -- the same
	# classifier instance is reused and refit across calls that omit it.
	#loop until empty input is detected
	X = []
	y = []
	# One single-element label tuple per key (index n_keys is presumably the
	# "no key pressed" class -- confirm with callers).
	labels = [(i,) for i in range(n_keys+1)]

	mb = MultiLabelBinarizer()
	labels = mb.fit_transform(labels)

	print "Training time for each key is {} seconds".format(train_time_sec)
	for label_num, label in enumerate(labels):
		raw_input('Press <enter> to begin training key {}'.format(label_num))
		i = 0
		# Keep sampling fixed-size windows until train_time_sec of audio
		# has been captured for this key.
		while i < train_time_sec:
			i += (window_size_ms / float(1000))
			freq_spect = read_spectral_data_for_time(window_size_ms)
			X.append(freq_spect)
			y.append(label)

	X = np.asarray(X)
	y = np.asarray(y)
	clf.fit(X, y)
	return (clf, mb)
示例#28
0
def get_labels(csv_path):
    """Read a "business_id,labels" CSV and return a boolean DataFrame.

    Each row is indexed by business id with one multi-hot column per label
    class (the nine restaurant attributes listed below).
    """
    mlb = MultiLabelBinarizer()

    labels_file = os.path.join(csv_path)
    # dtype=str replaces the legacy 'string' alias, which modern NumPy
    # rejects with a TypeError.
    l = np.loadtxt(labels_file, dtype=str, delimiter=",", skiprows=1)

    biz2labels = dict((x[0], x[1].split()) for x in l)

    bin_labels = mlb.fit_transform(biz2labels.values())

    # keys() and values() of the same dict iterate in a consistent order,
    # so row i of bin_labels corresponds to the i-th key.
    new_dict = {bid: bin_labels[i] for i, bid in enumerate(biz2labels.keys())}

    biz_labels_df = pandas.DataFrame.from_dict(new_dict, orient='index', dtype='bool')
    biz_labels_df.columns = ['good_for_lunch','good_for_dinner'
    ,'takes_reservations','outdoor_seating'
    ,'restaurant_is_expensive','has_alcohol'
    ,'has_table_service','ambience_is_classy'
    ,'good_for_kids']

    return biz_labels_df
示例#29
0
def get_classify():
    """Train a multi-label text classifier and write per-file predictions
    to result2.txt, one "<file name>   <labels>" line per test document."""
    X_train, Y_train = load_data()

    # Define the classifier pipeline
    classifier = Pipeline([
        ('counter', CountVectorizer(tokenizer=jieba_tokenizer)),  # tokenize and count: feature extraction / vectorization
        ('tfidf', TfidfTransformer()),                            # TF-IDF weighting
        ('clf', OneVsRestClassifier(LinearSVC())),                # one-vs-rest multi-class (multi-label)
    ])
    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(Y_train)                          # binarize the class labels

    classifier.fit(X_train, Y_train)

    # X_test = ["数据分析"]
    # Collect every test document into one list.
    test_list = []
    test_name = []
    filelist2 = os.listdir(base_path + "data_test/")
    for files in filelist2:
        # print (files)
        test_name.append(files)
        # Close each handle deterministically (the original leaked them).
        with open(base_path + "data_test/" + files, 'r') as doc_file:
            test_list.append(doc_file.read())

    prediction = classifier.predict(test_list)
    result = mlb.inverse_transform(prediction)

    with open('result2.txt', 'w') as out:
        for i in range(len(test_name)):
            out.write(str(test_name[i]) + '   ' + str(result[i]) + '\n')

    print (result, len(result))
    num_dict = Counter(result)
    print (len(num_dict))
    # Divide by a float so this is not integer division (Python 2 semantics).
    print ((num_dict[('1',)] + num_dict[('2',)] + num_dict[('3',)]) / float(len(result)))
def featurize():
    """Load the corpus, multi-hot encode labels, build text sequences, split
    into holdin/holdout partitions, and write/return the feature dictionary."""
    instances, labels = datahandler.load_corpus(settings.DATA_DIR)
    # when code testing limit the instances
    if settings.TEST:
        instances, labels = instances[:settings.
                                      N_TEST_INSTANCES], labels[:settings.
                                                                N_TEST_INSTANCES]

    all_stats = datahandler.get_label_stats(labels)

    # Totals over multi-label entries (keys containing a comma are
    # presumably combined-label keys -- confirm against datahandler).
    print(
        sum([v for k, v in all_stats["multilabel_distr"].items() if "," in k]))
    print(
        sum([v for k, v in all_stats["multilabel_count"].items() if "," in k]))

    # Encode labels
    labels_orig = np.array(labels, dtype=object)
    # -1 marks an unlabeled instance; map it to the empty label set.
    labels = [
        l if l != -1 else [] for l in labels
    ]  # TODO test with negative label instances as a label in learning
    label_encoder = MultiLabelBinarizer()  # makes multihot label encodings
    y = label_encoder.fit_transform(labels)

    # Make sequence data from text
    # # Load predetermined holdout split
    with open(settings.EXPERIMENT_DATA, "rt") as exp_in_test:
        experiment = json.load(exp_in_test)

    # when code testing limit the instances by using the slicing done by predetermined holdout split indices
    if settings.TEST:
        idc_in, idc_out = train_test_split(np.arange(
            settings.N_TEST_INSTANCES),
                                           test_size=0.2)
    else:
        idc_in, idc_out = experiment["meta_holdin_indices"], experiment[
            "meta_holdout_indices"]

    x, word_index, max_sequence_length = make_sequences(instances)

    # Slice features, encoded labels, raw text, and raw labels into the
    # holdin/holdout partitions using the same index lists.
    x_in = x[idc_in]
    x_out = x[idc_out]
    y_in = y[idc_in]
    y_out = y[idc_out]
    instances_in = np.array(instances)[idc_in]
    instances_out = np.array(instances)[idc_out]
    labels_in = labels_orig[idc_in]
    labels_out = labels_orig[idc_out]

    logging.info("Train class category counts: \n{}\n---------\n"
                 "Test class category counts: \n{}.".format(
                     datahandler.get_label_info(labels_in),
                     datahandler.get_label_info(labels_out)))

    # +1 presumably reserves index 0 for padding -- confirm against
    # make_sequences and the embedding layer that consumes this.
    emb_input_dim = len(word_index) + 1
    output_units = len(label_encoder.classes_)

    # write the featurized data
    feature_data = {
        "x_in": x_in.tolist(),
        "x_out": x_out.tolist(),
        "y_in": y_in.tolist(),
        "y_out": y_out.tolist(),
        "instances_in": instances_in.tolist(),
        "instances_out": instances_out.tolist(),
        "labels_in": labels_in.tolist(),
        "labels_out": labels_out.tolist(),
        "max_sequence_length": max_sequence_length,
        "emb_input_dim": emb_input_dim,
        "output_units": output_units,
        "word_index": word_index,
        "all_stats": all_stats,
        "classes": label_encoder.classes_.tolist(),
    }

    util.write_features(feature_data)

    return feature_data
示例#31
0
    def eval(self,
             model,
             return_preds_and_labels=False,
             calibrate_conf_scores=False):
        """
        Performs evaluation on a given model.

        :param model: The model on which to perform evaluation
        :type model: AdaptiveModel
        :param return_preds_and_labels: Whether to add preds and labels to the returned per-head result dicts
        :type return_preds_and_labels: bool
        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
        :type calibrate_conf_scores: bool
        :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                             and reports generated during evaluation.
        :rtype all_results: list of dicts
        """
        model.eval()

        # init empty lists per prediction head
        loss_all = [0 for _ in model.prediction_heads]
        preds_all = [[] for _ in model.prediction_heads]
        label_all = [[] for _ in model.prediction_heads]
        ids_all = [[] for _ in model.prediction_heads]
        passage_start_t_all = [[] for _ in model.prediction_heads]
        logits_all = [[] for _ in model.prediction_heads]

        for step, batch in enumerate(
                tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
            batch = {key: batch[key].to(self.device) for key in batch}

            with torch.no_grad():

                logits = model.forward(**batch)
                losses_per_head = model.logits_to_loss_per_head(logits=logits,
                                                                **batch)
                preds = model.logits_to_preds(logits=logits, **batch)
                labels = model.prepare_labels(**batch)

            # stack results of all batches per prediction head
            for head_num, head in enumerate(model.prediction_heads):
                loss_all[head_num] += np.sum(
                    to_numpy(losses_per_head[head_num]))
                preds_all[head_num] += list(to_numpy(preds[head_num]))
                label_all[head_num] += list(to_numpy(labels[head_num]))
                if head.model_type == "span_classification":
                    ids_all[head_num] += list(to_numpy(batch["id"]))
                    passage_start_t_all[head_num] += list(
                        to_numpy(batch["passage_start_t"]))
                    if calibrate_conf_scores:
                        logits_all[head_num] += list(to_numpy(logits))

        # Evaluate per prediction head
        all_results = []
        for head_num, head in enumerate(model.prediction_heads):
            if head.model_type == "multilabel_text_classification":
                # converting from string preds back to multi-hot encoding
                from sklearn.preprocessing import MultiLabelBinarizer
                mlb = MultiLabelBinarizer(classes=head.label_list)
                # TODO check why .fit() should be called on predictions, rather than on labels
                preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
                label_all[head_num] = mlb.transform(label_all[head_num])
            if head.model_type == "span_classification" and calibrate_conf_scores:
                temperature_previous = head.temperature_for_confidence.item()
                logger.info(
                    f"temperature used for confidence scores before calibration: {temperature_previous}"
                )
                head.calibrate_conf(logits_all[head_num], label_all[head_num])
                temperature_current = head.temperature_for_confidence.item()
                logger.info(
                    f"temperature used for confidence scores after calibration: {temperature_current}"
                )
                temperature_change = (
                    abs(temperature_current - temperature_previous) /
                    temperature_previous) * 100.0
                if temperature_change > 50:
                    logger.warning(
                        f"temperature used for calibration of confidence scores changed by more than {temperature_change} percent"
                    )
            if hasattr(head, 'aggregate_preds'):
                # Needed to convert NQ ids from np arrays to strings
                ids_all_str = [x.astype(str) for x in ids_all[head_num]]
                ids_all_list = [list(x) for x in ids_all_str]
                head_ids = ["-".join(x) for x in ids_all_list]
                preds_all[head_num], label_all[
                    head_num] = head.aggregate_preds(
                        preds=preds_all[head_num],
                        labels=label_all[head_num],
                        passage_start_t=passage_start_t_all[head_num],
                        ids=head_ids)

            result = {
                "loss": loss_all[head_num] / len(self.data_loader.dataset),
                "task_name": head.task_name
            }
            result.update(
                compute_metrics(metric=head.metric,
                                preds=preds_all[head_num],
                                labels=label_all[head_num]))

            # Select type of report depending on prediction head output type
            if self.report:
                try:
                    result["report"] = compute_report_metrics(
                        head, preds_all[head_num], label_all[head_num])
                except Exception:
                    # Fixed: a bare ``except:`` also swallowed SystemExit and
                    # KeyboardInterrupt; catch Exception and log the failure.
                    logger.exception(
                        f"Couldn't create eval report for head {head_num}")
                    result["report"] = "Error"

            if return_preds_and_labels:
                result["preds"] = preds_all[head_num]
                result["labels"] = label_all[head_num]

            all_results.append(result)

        return all_results
示例#32
0
def binarize_dataset(df, classes):
    """Multi-hot encode ``df`` against the fixed ``classes`` vocabulary,
    returning a dense indicator matrix."""
    encoder = MultiLabelBinarizer(classes=classes, sparse_output=False)
    return encoder.fit_transform(df)
示例#33
0
# Parse the stringified genre lists back into Python lists.
train['genre_list'] = train['genre_list'].apply(lambda x: ast.literal_eval(x))
test['genre_list'] = test['genre_list'].apply(lambda x: ast.literal_eval(x))
val['genre_list'] = val['genre_list'].apply(lambda x: ast.literal_eval(x))

# Histogram: how many test titles carry 1, 2, ... genres.
labels = {}

for genre in test['genre_list']:
    labels[len(genre)] = labels.get(len(genre), 0) + 1

mlb = MultiLabelBinarizer()
# fit_transform both fits the binarizer and encodes; the original also
# called mlb.fit(...) on the same data first, which was redundant.
transformed_labels = mlb.fit_transform(dataset['genre_list'].tolist())

# Encode the splits with the vocabulary learned from the full dataset.
train_labels = mlb.transform(train['genre_list'].tolist())

test_labels = mlb.transform(test['genre_list'].tolist())

val_labels = mlb.transform(val['genre_list'].tolist())

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.lower().strip()
    text = ' '.join([
示例#34
0
import time
from code.lib.projectlib import make_train_set,metriccalculation, scorer, sort_by_frequency, last_index_of_freq
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from skmultilearn.problem_transform import ClassifierChain

x_train, y_train = make_train_set("../../data", "training_dict_f_91.json", "labels.csv", second_features_videos=None, weighted=False)

# Multi-hot encode the label lists and sort label columns by frequency.
mlb = MultiLabelBinarizer(sparse_output=False)
original_binlabels = mlb.fit_transform(y_train[:])
original_classes = list(mlb.classes_)
binlabels, classes, class_frequencies = sort_by_frequency(original_binlabels, original_classes)

# If a label has more than 5 occurrences, it is considered
number_of_labels = last_index_of_freq(class_frequencies, 5)
# Keep only the columns of the most frequent labels.
y_train = binlabels[:, :number_of_labels]

# create classifier: bagged decision trees wrapped in a classifier chain
algorithm = DecisionTreeClassifier()
ensemble = BaggingClassifier(algorithm, random_state=10)
classifier = ClassifierChain(ensemble)

# run cross validation to evaluate the classifier (10-fold), timing it
start_ex = time.time()
m=cross_val_score(classifier,x_train,y_train,scoring=scorer(metric=metriccalculation),cv=10)
end_ex = time.time()

# Wall-clock duration of the whole cross-validation run.
train_time = end_ex - start_ex
示例#35
0
def setup(train_files, test_files, specific):
    """Train a multi-output genre classifier on acoustic features, persist
    the fitted artifacts (binarizer, scaler, data), and write predictions
    for the test files."""
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    for f in train_files.keys():
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:
                                                                      2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        # Skip songs whose feature vector is not the expected length.
        if len(feat) != 391:
            continue
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    # Persist the fitted binarizer, scaler, and prepared training data.
    path = constants.path + specific + '_mlb.pkl'
    dump(mlb, path)

    path = constants.path + specific + '_scalar.pkl'
    dump(scalar, path)

    path = constants.path + specific + '_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)

    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=20, class_weight='balanced'),
                                       n_jobs=4)
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    # Drop references so the training data can be garbage collected.
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        # NOTE(review): test features are read from the *train* directory --
        # confirm this path is intentional.
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:
                                                                      2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        # Pad short feature vectors with the training means for the
        # missing trailing columns.
        if len(feat) < 391:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
示例#36
0
# tempX = pd.DataFrame(U[0])
# tempY = pd.DataFrame(U[1])

# In[47]:

# Importing sklearn required libraries
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.svm import LinearSVC

# In[48]:

# Multi-hot encode the label column (U_code[1]) for multi-label learning.
multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(U_code[1].values)

# In[49]:


def evaluation_metrics(y_actual, y_predicted):
    """Print accuracy plus micro-averaged recall/precision/F1 for
    multi-label predictions."""

    accuracy = accuracy_score(y_actual, y_predicted)
    print("Accuracy:", accuracy)
    recall = recall_score(y_actual, y_predicted, average='micro')
    print("Recall:", recall)
    precision = precision_score(y_actual, y_predicted, average='micro')
    print("Precision:", precision)
    f1 = f1_score(y_actual, y_predicted, average='micro')
    print("F1-Score:", f1)
    hamming = hamming_loss(y_actual, y_predicted)
    # NOTE(review): ``hamming`` is computed but never printed or returned;
    # the snippet appears truncated here.
示例#37
0
])
title_pd['from'] = title_pd['from'].astype(float)
title_pd['favorites'] = title_pd['favorites'] / title_pd['members']
cols_to_multihot = ['producers', 'studios', 'genres']
temp = [['episodes', 'duration', 'scored_by', 'members', 'favorites'],
        list(title_pd.loc[:, 'opening_themes':'Spin-off'].columns)]
cols_to_num = [item for elem in temp for item in elem]
cols_to_onehot = ['type', 'source', 'rating', 'season']
one_hot = OneHotEncoder(sparse=False)
mlb1 = MultiLabelBinarizer(sparse_output=False)
mlb2 = MultiLabelBinarizer(sparse_output=False)
mlb3 = MultiLabelBinarizer(sparse_output=False)
scaler = StandardScaler()
mlb_pd = pd.concat([
    pd.DataFrame(title_pd['index']),
    pd.DataFrame(mlb1.fit_transform(title_pd['producers']),
                 columns=mlb1.classes_ + '_p'),
    pd.DataFrame(mlb2.fit_transform(title_pd['studios']),
                 columns=mlb2.classes_),
    pd.DataFrame(mlb3.fit_transform(title_pd['genres']),
                 columns=mlb3.classes_ + '_g')
],
                   axis=1,
                   sort=False)
train, test = train_test_split(title_pd, test_size=0.2, random_state=228)
#test is transformed implicitly to avoid copying the code below by setting train=test
train = train.sort_values(by=['index'])
mlb_pd = mlb_pd[mlb_pd['index'].isin(train['index'])]
trans_train = pd.concat([
    pd.DataFrame(train['index'], columns=['index']),
    pd.DataFrame(scaler.fit_transform(train[cols_to_num]),
    label = imagePath.split(os.path.sep)[-2].split("_")
    labels.append(label)

data = np.array(data, dtype="float")
labels = np.array(labels)
Labels_verbal = labels

print("[INFO] Private data images loaded!")

print("Reshaping data!")
print("Data Reshaped to feed into models channels last")

from sklearn.preprocessing import MultiLabelBinarizer
print("Labels formatting")
lb = MultiLabelBinarizer()
labels = lb.fit_transform(labels)
print("Labels ok!")

#%%
time1 = time.time()  #initiate time counter
n_split = 5  #10fold cross validation
scores = []  #here every fold accuracy will be kept
predictions_all = np.empty(0)  # here, every fold predictions will be kept
test_labels = np.empty(0)  #here, every fold labels are kept

omega = 1

for train_index, test_index in KFold(n_split).split(data):
    trainX, testX = data[train_index], data[test_index]
    trainY, testY = labels[train_index], labels[test_index]
def main():
    """Score node embeddings on multi-label node classification and print
    micro/macro F1 per shuffle and averaged per training percentage."""
    parser = ArgumentParser("scoring",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--emb", required=True, help='Embeddings file')
    parser.add_argument("--network", required=True,
                        help='A .mat file containing the adjacency matrix and node labels of the input network.')
    parser.add_argument("--adj-matrix-name", default='network',
                        help='Variable name of the adjacency matrix inside the .mat file.')
    parser.add_argument("--label-matrix-name", default='group',
                        help='Variable name of the labels matrix inside the .mat file.')
    parser.add_argument("--num-shuffles", default=2, type=int, help='Number of shuffles.')
    parser.add_argument("--all", default=False, action='store_true',
                        help='The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. '
                             'By default, only training percents of 10, 50 and 90 are used.')

    args = parser.parse_args()
    # 0. Files
    embeddings_file = args.emb
    matfile = args.network

    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    # 2. Load labels
    mat = loadmat(matfile)
    A = mat[args.adj_matrix_name]
    graph = sparse2graph(A)
    labels_matrix = mat[args.label_matrix_name]
    labels_count = labels_matrix.shape[1]
    # Fixed class range so fit_transform yields consistent columns below.
    mlb = MultiLabelBinarizer(range(labels_count))

    # Map nodes to their features (note:  assumes nodes are labeled as integers 1:N)
    features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])

    # 2. Shuffle, to create train/test groups
    shuffles = []
    for x in range(args.num_shuffles):
        shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    if args.all:
        training_percents = numpy.asarray(range(1, 10)) * .1
    else:
        training_percents = [0.1, 0.5, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:

            X, y = shuf

            training_size = int(train_percent * X.shape[0])

            X_train = csr_matrix(X[:training_size, :])
            y_train_ = csr_matrix(y[:training_size])

            # Rebuild per-node label lists from the sparse label matrix.
            y_train = [[] for x in range(y_train_.shape[0])]

            cy = y_train_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_train[i].append(j)

            assert sum(len(l) for l in y_train) == y_train_.nnz

            X_test = csr_matrix(X[training_size:, :])
            y_test_ = csr_matrix(y[training_size:])

            y_test = [[] for _ in range(y_test_.shape[0])]

            cy = y_test_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_test[i].append(j)

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train_)

            # find out how many labels should be predicted
            top_k_list = [len(l) for l in y_test]
            preds = clf.predict(X_test, top_k_list)

            results = {}
            averages = ["micro", "macro"]
            for average in averages:
                # Classes were fixed in the constructor, so these repeated
                # fit_transform calls produce comparable indicator matrices.
                results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)

            all_results[train_percent].append(results)

    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    for train_percent in sorted(all_results.keys()):
        print('Train percent:', train_percent)
        for index, result in enumerate(all_results[train_percent]):
            print('Shuffle #%d:   ' % (index + 1), result)
        avg_score = defaultdict(float)
        for score_dict in all_results[train_percent]:
            for metric, score in iteritems(score_dict):
                avg_score[metric] += score
        for metric in avg_score:
            avg_score[metric] /= len(all_results[train_percent])
        print('Average score:', dict(avg_score))
        print('-------------------')
示例#40
0
  y = data.iloc[:,2:]
  print(repr(data))

  x=np.array([point[0] for point in data[:300] if point[0]!='comment_text'])
  y=np.array([[point[i] for i in range(2,len(point))] for point in data[:300] if point[0]!='comment_text'])
  print(len(x))
  print(len(y))
  X_TRAIN, X_TEST , Y_TRAIN, Y_TEST = train_test_split(x,y,test_size=.2)
  return X_TRAIN, X_TEST, Y_TRAIN, Y_TEST


train_data, test_data, train_labels, test_labels = loadData('training.csv')


# Multi-hot encode the label lists for multi-label classification; the test
# labels are transformed with the vocabulary learned from the training set.
mlb = MultiLabelBinarizer()
binary_train_labels = mlb.fit_transform(train_labels)
binary_test_labels = mlb.transform(test_labels)

vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

print(vectorised_test_data)
print(vectorised_train_data)
print(train_labels)
print(test_labels)
print("ok1")
classifier = BinaryRelevance(GaussianNB())
print("ok2")

# BUG FIX: the binarized labels were computed but the raw label lists were
# passed to fit(); BinaryRelevance expects the binary indicator matrix.
classifier.fit(vectorised_train_data.todense(), binary_train_labels)
    parser.add_argument('--max-df', default=1.0, type=float)
    parser.add_argument('--min-df', default=1, type=int)
    return parser


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    stop_list = stopwords.words('english')

    # Read one JSON question per line; text is Title + Body, labels are the
    # space-separated Tags field.
    all_questions = []
    all_labels = []
    with open(args.questions, 'r') as f:
        for line in f:
            doc = json.loads(line)
            question = "{} {}".format(doc['Title'], doc['Body'])
            all_questions.append(question)
            all_labels.append(doc['Tags'].split(' '))

    vectorizer = TfidfVectorizer(min_df=args.min_df,
                                 stop_words=stop_list,
                                 max_df=args.max_df)
    X = vectorizer.fit_transform(all_questions)
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(all_labels)
    classifier = OneVsRestClassifier(LinearSVC())

    # 10-fold cross-validated micro-F1 of the one-vs-rest linear SVM.
    scores = cross_val_score(classifier, X, y=y, cv=10, scoring='f1_micro')
    print("Average F1: {}".format(np.mean(scores)))
      one_hot.inverse_transform(one_hot.transform(feature)))

# Import library
import pandas as pd
# Create dummy variables from feature
print("Matrix encoding with pandas: \n", pd.get_dummies(feature[:, 0]))

# Create multiclass feature
# ("Delware" is a typo in the sample data; kept as-is since it is runtime data.)
multiclass_feature = [("Texas", "Florida"), ("California", "Alabama"),
                      ("Texas", "Florida"), ("Delware", "Florida"),
                      ("Texas", "Alabama")]
# Create multiclass one-hot encoder
one_hot_multiclass = MultiLabelBinarizer()
# One-hot encode multiclass feature: one indicator column per distinct state
print("Multi label binarizer: \n",
      one_hot_multiclass.fit_transform(multiclass_feature))
print("Feature classes: \n", one_hot_multiclass.classes_)

print("\n")
print("\n")
print("\n")

#5.2 Encoding Ordinal Categorical Features

# Load library
import pandas as pd
# Create features
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Create mapper
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}
示例#43
0
def mycode(train_files, test_files, specific, indicies):
    """Train a multi-output genre classifier on the feature subset selected
    by ``indicies``, persist the fitted artifacts, and write predictions
    for the test files."""
    indicies = np.array(indicies)
    #indicies = indicies[:len(indicies)//4]
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    keys = list(train_files.keys())
    random.shuffle(keys)
    # Cap the number of training songs read from disk.
    subset = 150000  #len(keys)
    count = 0
    for f in keys[:subset]:
        count += 1
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:
                                                                      2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        # Skip songs whose full feature vector is not the expected length.
        if len(feat) != 2647:
            continue
        feat = np.array(feat)
        feat = feat[indicies]
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
        if count % 10000 == 0:
            print("on ", count, "length of keys: ", len(train_keys))

    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    # Persist the fitted binarizer, scaler, and prepared training data.
    path = constants.path + specific + '_all2_mlb.pkl'
    dump(mlb, path)

    path = constants.path + specific + '_all2_scalar.pkl'
    dump(scalar, path)

    print(np.shape(train_data))
    path = constants.path + specific + '_all2_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)

    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=32, class_weight='balanced'),
                                       n_jobs=4)
    # Drop references to free memory before the memory-hungry fit.
    data = 0
    train_files = 0
    train_keys = 0
    keys = 0
    gc.collect()
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    path = constants.path + specific + '_all2_classifier.pkl'
    dump(classifier, path)
    """
    with open(constants.path + specific + '_all2_scalar.pkl', 'rb') as data_file:
        scalar = pickle.load(data_file)
    with open(constants.path + specific + '_all2_mlb.pkl', 'rb') as data_file:
        mlb = pickle.load(data_file)
    with open(constants.path + specific + '_all2_classifier.pkl', 'rb') as data_file:
        classifier = pickle.load(data_file)
    """
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        # NOTE(review): test features are read from the *train* directory --
        # confirm this path is intentional.
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:
                                                                      2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        """
        if len(feat) < 2647:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        """
        #feat = np.array(feat)
        # NOTE(review): short feature vectors are padded with *random* values
        # before the subset is taken -- looks like a stopgap; confirm.
        if len(feat) < 2647:
            length = len(feat)
            print('Before: ', length)
            for m in range(2647 - length):
                feat += [np.random.rand()]
            #m = mean[indicies.index(2647)]
            #feat += [m]
            print('After: ', len(feat))
        feat = np.array(feat)
        feat = feat[indicies]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
示例#44
0
# Demonstrate one-hot (indicator) encoding of multi-label data.
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Each sample carries a pair of US state labels.
state_labels = [('Texas', 'Florida'), ('California', 'Alabama'),
                ('Texas', 'Florida'), ('Delware', 'Florida'),
                ('Texas', 'Alabama')]

# The binarizer learns the distinct label set while encoding.
binarizer = MultiLabelBinarizer()

# Indicator matrix: one column per distinct state, one row per sample.
binarizer.fit_transform(state_labels)

# Column ordering discovered during fitting.
binarizer.classes_
示例#45
0
def run(n_splits=5):
    """
    Prepare the Brexit Blog Corpus data set for analysis.

    Reads 'brexit_blog_corpus.xlsx', cleans and k-hot encodes the stance
    labels, tokenises the utterances, writes a stratified 80:20 train/test
    split plus CV fold assignments to CSV, and pre-computes ELMO sentence
    embeddings for both splits.

    :param n_splits: int, the number of train/test splits to generate, default=5
    :return: None (all outputs are written to CSV files as side effects)
    """
    print('Reading and processing the xlsx file...', end='')
    brexit_blog_corpus = pd.read_excel('brexit_blog_corpus.xlsx')

    # fix up some typos present in the raw spreadsheet's label values
    brexit_blog_corpus.replace('concession/contrarines', np.nan, inplace=True)
    brexit_blog_corpus.replace('hypotheticallity',
                               'hypotheticality',
                               inplace=True)

    # unfortunately, quite a few utterances are duplicates :(
    clean_dataset = brexit_blog_corpus.drop_duplicates(subset='Utterance')

    # the five columns that may each hold one assigned stance label
    stance_columns = [
        'Stance category', 'second stance category', 'third', 'fourth', 'fifth'
    ]

    clean_dataset = clean_dataset[['Utterance ID No', 'Utterance'] +
                                  stance_columns].set_index('Utterance ID No')

    # extract the stance categories and do some cleaning
    stance_categories = set(clean_dataset[stance_columns].values.flatten())
    stance_categories.discard(np.nan)
    stance_categories = sorted(list(stance_categories))
    # sanitise names for use as column headers (spaces and slashes -> dashes)
    stance_categories = [
        w.replace(' ', '-').replace('/', '-') for w in stance_categories
    ]

    # k-hot encode the assigned stance labels; NaN cells are dropped per row
    mlb = MultiLabelBinarizer()
    k_hot_encoded_stances = mlb.fit_transform(
        [x[~pd.isnull(x)] for x in clean_dataset[stance_columns].values])
    k_hot_encoded_stances = pd.DataFrame(index=clean_dataset.index,
                                         data=k_hot_encoded_stances,
                                         columns=list(mlb.classes_))
    # relies on mlb.classes_ being sorted so it lines up positionally with
    # the sorted-then-sanitised stance_categories list
    k_hot_encoded_stances.columns = stance_categories

    # join the one-hot encoded labels and utterances back together again
    clean_dataset_one_hot = clean_dataset[['Utterance', 'Stance category']] \
        .join(k_hot_encoded_stances)
    print('done.')

    print('Tokenising the utterances...', end='')
    # tokenize the Utterance (NIST tokenizer, lowercased, space-joined)
    tokenizer = NISTTokenizer()
    clean_dataset_one_hot.Utterance = clean_dataset_one_hot.Utterance.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    print('done.')

    print('Constructing train/test split and saving to disk...', end='')
    # split the data into train and test sets in the ratio 80:20
    stance_columns = set(clean_dataset_one_hot.columns).difference(
        ['Utterance', 'Stance category'])
    stance_columns = sorted(list(stance_columns))

    # first split the data in two to get train and test sets,
    # stratified on the primary stance category
    reset_seeds()

    X_train, X_test, y_train, y_test = \
        train_test_split(clean_dataset_one_hot['Utterance'],
                         clean_dataset_one_hot[stance_columns],
                         test_size=0.2,
                         stratify=clean_dataset_one_hot['Stance category'])

    # tag each row with its split so both can live in one CSV
    y_train['set'] = 'train'
    y_test['set'] = 'test'

    dataset = pd.concat([
        pd.DataFrame(data={
            'Utterance': X_train
        }).join(y_train),
        pd.DataFrame(data={
            'Utterance': X_test
        }).join(y_test)
    ],
                        axis=0)

    dataset.to_csv('bbc_dataset.csv')
    print('done.')

    print('Constructing the cv folds and saving to disk...', end='')
    # one column per fold, each cell marked 'train' or 'test'
    X_train_folds = pd.DataFrame(
        index=X_train.index,
        columns=['fold_{}'.format(i) for i in range(1, n_splits + 1)])
    skf = StratifiedKFold(n_splits=n_splits)
    y = clean_dataset_one_hot.loc[y_train.index, 'Stance category']
    for i, (train_idx,
            test_idx) in enumerate(skf.split(np.zeros(X_train.shape[0]), y)):
        X_train_folds.iloc[train_idx, i] = 'train'
        X_train_folds.iloc[test_idx, i] = 'test'

    X_train_folds.to_csv('bbc_dataset_folds.csv')
    print('done.')

    print('Pre-computing the ELMO embeddings and saving to disk...', end='')

    # NOTE(review): requires network access to tfhub.dev and TF1-style
    # sessions — confirm the runtime environment before invoking.
    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        elmo_train_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_train.values, tf.string)),
                 signature='default',
                 as_dict=True)['default'])
        elmo_train_embeddings = pd.DataFrame(index=X_train.index,
                                             data=elmo_train_embeddings)
        elmo_train_embeddings.to_csv('bbc_elmo_train_embeddings.csv')

        elmo_test_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_test.values, tf.string)),
                 signature='default',
                 as_dict=True)['default'])
        elmo_test_embeddings = pd.DataFrame(index=X_test.index,
                                            data=elmo_test_embeddings)
        elmo_test_embeddings.to_csv('bbc_elmo_test_embeddings.csv')

    print('done.')
示例#46
0
# Load the scraped BoardGameGeek items, one-hot encode their game types,
# and join the indicator columns back onto the main frame.
df = pd.read_csv(data_path / "scraped" / "bgg_GameItem.csv", index_col="bgg_id")
df.shape

# %%
df.sample(5, random_state=SEED).T

# %%
df.num_votes.sum()

# %%
def _game_type_list(cell):
    """Normalise one raw game_type cell into a list of type strings."""
    if isinstance(cell, float) and pd.notna(cell):
        return [str(cell)]
    if isinstance(cell, str) and cell:
        return cell.split(",")
    return []

mlb = MultiLabelBinarizer()
values = mlb.fit_transform(df.game_type.apply(_game_type_list))
values.shape

# %%
# Indicator frame aligned on the original index, one column per game type.
gt_df = pd.DataFrame(data=values, columns=mlb.classes_, index=df.index)
gt_df.shape

# %%
# Keep only the curated game-type columns and attach them with nicer names.
games = df.join(gt_df[list(game_types)].rename(columns=game_types))
games.shape

# %%
示例#47
0
    i += 1
    if i <= total_valid_data * train_portion:
        train_data.append(content)
        train_target.append(labels)
    else:
        test_data.append(content)
        test_target.append(labels)
print "训练样本量", len(train_data), len(train_target)
print "测试样本量", len(test_data), len(test_target)
print("Loading newsgroups training set... ")
print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
X_train = np.array((train_data))
print "eee", type(X_train)
print("done in %fs" % (time() - t0))
y_train = mlb.fit_transform(train_target)
print("An ordering for the training class labels:")
print list(mlb.classes_)
print("Loading newsgroups test set... ")
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = np.array((test_data))
y_test = mlb.fit_transform(test_target)
print("An ordering for the test class labels:")
print list(mlb.classes_)
classifier = Pipeline([('vectorizer', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', OneVsRestClassifier(LinearSVC()))])
print("done in %fs" % (time() - t0))

classifier.fit(X_train, y_train)
 X_valid] = parser('./CleanData/crypto_light.csv')
[y_train, y_test, y_valid, X_train, X_test, X_valid] = [
    y_train[:165], y_test[:20], y_valid[:15], X_train[:165], X_test[:20],
    X_valid[15]
]

# Multilabel Binarizer
mlb = MultiLabelBinarizer()
# print "===========Training Data=============="
# print X_train
# print y_train
# print type(y_train[0])
# print y_train[1]
# print y_train[2]
# y_train = [['New York'],['New York'],['New York'],['New York'],['New York'],['New York'],['London'],['London'], ['London'],['London'],['London'],['London'],['New York', 'London'],['New York', 'London'] ]
y_train = mlb.fit_transform(y_train)
# print y_train
print "classes", list(mlb.classes_)
print len(list(mlb.classes_))
print len(X_train)
# print "-----Binarize y_train----------"
# print y_train

# Pipeline(vectorization, tfid weighting and classifier)
# ppl = Pipeline([
#     ('vectorizer', HashingVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', OneVsRestClassifier(LinearSVC()))])

ppl = Pipeline([('vectorizer', HashingVectorizer()),
                ('clf', OneVsRestClassifier(LinearSVC()))])
示例#49
0
    data = pd.read_csv(args.train_data_path)
    data['content'] = data.content.map(
        lambda x: ''.join(x.strip().split()))  # 去掉换行等

    # 把主题和情感拼拼接起来 一共10*3类   原本是主题为十分类 情感是三分类 现在整成30分类
    data['label'] = data['subject'] + data['sentiment_value'].astype(str)
    subj_lst = list(filter(lambda x: x is not np.nan, list(set(data.label))))
    subj_lst_dict = {value: key for key, value in enumerate(subj_lst)}
    data['label'] = data['label'].apply(lambda x: subj_lst_dict.get(x))

    # 多标签
    data_tmp = data.groupby('content').agg({
        'label': lambda x: set(x)
    }).reset_index()
    mlb = MultiLabelBinarizer()  # 转为类似于one-hot
    data_tmp['hh'] = mlb.fit_transform(data_tmp.label).tolist()
    y_train = np.array(data_tmp.hh.tolist())

    # 构建embedding
    bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(),
              min_count=1,
              maxlen=100)  # 长短补齐  固定长度为100
    vocab_size = len(bow.word2idx)
    # print(vocab_size)   # 19887
    # print(bow.word_count)    # 统计每个词出现的次数
    embedding_matrix = np.zeros((vocab_size + 1, 300))
    # 加载词向量
    embedding = load_embedding()
    for key, value in bow.word2idx.items():
        if key in embedding.keys():
            embedding_matrix[value] = embedding[key]
示例#50
0
]
# Build the side-effect label matrix from the MedDRA export and inspect
# the sub-system spreadsheet.
df = pd.read_table('../data/meddra_all_se.tsv', names=columns)
# Drop "lowest level term" rows; keep the preferred-term entries only.
df.drop(df[df.meddra_type == "LLT"].index, inplace=True)
print(df.info())

# One row per compound, holding the list of its side-effect names.
df = df.groupby('stitch_id_flat').side_effect_name.apply(list).reset_index()
df['pubchem_id'] = df.stitch_id_flat.map(stitch_to_pubchem)
print(df.head())

d2 = pd.read_excel("../data/2d_prop.xlsx")
d3 = pd.read_excel("../data/3d_prop.xlsx")
print(d2.shape, d3.shape)

d2 = d2.select_dtypes(include=['int64', 'float64'])
d3 = d3.select_dtypes(include=['float64'])
# FIX: `mlb` and `sedf` were undefined here (guaranteed NameError).
# Binarize the grouped side-effect lists from `df` instead.
# NOTE(review): confirm against the upstream notebook that `sedf` was not
# a differently filtered frame.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['side_effect_name'])

print(y.shape)

# FIX: path typo '..data/...' -> '../data/...' (matches the other paths).
se = pd.read_excel('../data/sub_sys.xlsx')
#se.info()
se.info()
print(sorted(se.count().values, reverse=True))
test_cols = list(se.columns)
test_cols_update = test_cols[:]

df = se[test_cols_update]
print(list(df.columns))  #.index('blood')

print(len(df.columns))
示例#51
0
def multilabel2(total):
    """10-fold cross-validated multilabel classification over ``total``.

    Column 0 of ``total`` holds the phrase text; columns 1 and 2 hold up to
    two class labels per row (NaN for absent labels).

    Returns:
        (recall, precision, f1_score, accuracy) — macro-averaged metrics
        computed on the LAST fold only, matching the original behaviour.
        Per-fold results from ``evaluacion`` are accumulated in ``scores``
        but are not aggregated.
    """
    # Removed: unused up-front X/y extractions (the loop re-slices ``total``
    # each fold, so those assignments — including the NaN replacement —
    # never affected the computation).
    pipeline = Pipeline([
        ('vectorize', CountVectorizer()), ('tf_idf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
    ])

    mlb = MultiLabelBinarizer()
    scores = []
    kf = KFold(n_splits=10, random_state=0, shuffle=True)
    # Renamed loop variables: the original inner loops shadowed the
    # ``train``/``test`` index arrays from kf.split.
    for train_idx, test_idx in kf.split(total):
        # Flatten the single-column phrase slices into plain lists of text.
        X_train = [row[0] for row in np.array(total.iloc[train_idx, [0]])]
        X_test = [row[0] for row in np.array(total.iloc[test_idx, [0]])]

        y_train = np.array(total.iloc[train_idx, [1, 2]])
        y_test = np.array(total.iloc[test_idx, [1, 2]])

        # Fit the binarizer on training labels; reuse its classes for test.
        y_train = mlb.fit_transform(y_train)
        y_test = mlb.transform(y_test)

        pipeline.fit(X_train, y_train)
        predicted = pipeline.predict(X_test)
        scores.append(evaluacion(y_test, predicted))

    # NOTE: the reported metrics reflect only the final fold (unchanged
    # from the original implementation).
    recall = metrics.recall_score(y_test, predicted, average='macro')
    print("Recall: %f" % recall)
    precision = metrics.precision_score(y_test, predicted, average='macro')
    print("Precision: %f" % precision)
    f1_score = metrics.f1_score(y_test, predicted, average='macro')
    print("F1-score: %f" % f1_score)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print("accuracy: %f" % accuracy)
    return recall, precision, f1_score, accuracy
示例#52
0
# Reference: https://medium.com/@michaeldelsole/what-is-one-hot-encoding-and-how-to-do-it-f0ae272f1179

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd


# Read the raw data; the second column holds the label field.
frame = pd.read_csv('/home/ubuntu/keras/enver/dmlvh2/data2.csv')
raw_labels = frame.iloc[:, 1].values

# NOTE(review): MultiLabelBinarizer treats each entry as an iterable of
# labels — if these cells are plain strings, every character becomes its
# own class. Confirm the column format matches that intent.
encoder = MultiLabelBinarizer()
encoded = encoder.fit_transform(raw_labels)
print(encoded)

# Persist the indicator matrix as integers.
np.savetxt('Y2.csv', encoded, fmt='%d')
示例#53
0
def main():
    """Command-line driver: train/evaluate a document classifier on
    pre-computed doc codes (autoencoder embeddings) and labels.

    Loads JSON code/label files, encodes labels (multi-label binarized or
    one-hot categorical), then either runs a single train/val/test split
    or k-fold ShuffleSplit cross-validation.

    NOTE: this block is Python 2 (print statements, list-returning
    dict.values()).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes',
                        type=str,
                        help='path to the train doc codes file')
    parser.add_argument('train_doc_labels',
                        type=str,
                        help='path to the train doc labels file')
    parser.add_argument('test_doc_codes',
                        type=str,
                        help='path to the test doc codes file')
    parser.add_argument('test_doc_labels',
                        type=str,
                        help='path to the test doc labels file')
    parser.add_argument('-nv',
                        '--n_val',
                        type=int,
                        default=1000,
                        help='size of validation set (default 1000)')
    parser.add_argument('-ne',
                        '--n_epoch',
                        type=int,
                        default=100,
                        help='num of epoches (default 100)')
    parser.add_argument('-bs',
                        '--batch_size',
                        type=int,
                        default=100,
                        help='batch size (default 100)')
    parser.add_argument('-cv',
                        '--cross_validation',
                        type=int,
                        help='k-fold cross validation')
    parser.add_argument('-mlc',
                        '--multilabel_clf',
                        action='store_true',
                        help='multilabel classification flag')

    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    # np.r_ over dict.values() relies on Python 2 returning a list;
    # labels are ordered by iterating the same code dicts.
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)
    # import pdb;pdb.set_trace()

    if args.multilabel_clf:
        # Fit on train+test so both splits share one label vocabulary.
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_test = encoder.transform(Y_test)
    else:
        # Single-label case: integer-encode then one-hot via np_utils.
        Y = Y_train + Y_test
        n_train = len(Y_train)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_test = Y[-n_test:]

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        # Single split: carve a validation set out of the training data.
        val_idx = np.random.choice(range(X_train.shape[0]),
                                   args.n_val,
                                   replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print 'train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])
        if args.multilabel_clf:
            results = multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'f1 score on test set: macro_f1: %s, micro_f1: %s' % tuple(
                results)
        else:
            results = multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'acc on test set: %s' % results
    else:
        # Cross-validation: reshuffle train+test, keeping the test size fixed.
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0],
                          random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            X_new_train = X[new_train_idx]
            Y_new_train = Y[new_train_idx]
            X_new_val = X[val_idx]
            Y_new_val = Y[val_idx]
            if args.multilabel_clf:
                # NOTE(review): multilabel CV evaluates on the ORIGINAL
                # X_test/Y_test, not this fold's test_idx — confirm intended.
                results.append(multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                        X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))
            else:
                results.append(multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val, \
                    X[test_idx], Y[test_idx], nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))

        if args.multilabel_clf:
            macro_f1, micro_f1 = zip(*results)
            macro_mean = np.mean(macro_f1)
            macro_std = np.std(macro_f1)
            micro_mean = np.mean(micro_f1)
            micro_std = np.std(micro_f1)
            print 'f1 score on %s-fold cross validation: macro_f1: %s (%s), micro_f1: %s (%s)' \
                    % (int(args.cross_validation), macro_mean, macro_std, micro_mean, micro_std)
        else:
            mean = np.mean(results)
            std = np.std(results)
            print 'acc on %s-fold cross validation: %s (%s)' % (int(
                args.cross_validation), mean, std)
    # Debugging hook left in by the author: drops into pdb after the run.
    import pdb
    pdb.set_trace()
示例#54
0
class Dataset(object):
    def __init__(self, inputs, labels, test_indices=None, **kwargs):
        """Encapsulates all pieces of data to run an experiment. This is basically a bag of items that makes it
        easy to serialize and deserialize everything as a unit.

        Args:
            inputs: The raw model inputs. This can be set to None if you dont want
                to serialize this value when you save the dataset.
            labels: The raw output labels.
            test_indices: The optional test indices to use. Ideally, this should be generated one time and reused
                across experiments to make results comparable. `generate_test_indices` can be used generate first
                time indices.
            **kwargs: Additional key value items to store.
        """
        self.X = np.array(inputs)
        self.y = np.array(labels)
        # Store any extra items directly as attributes on the instance.
        for key, value in kwargs.items():
            setattr(self, key, value)

        self._test_indices = None
        self._train_indices = None
        self.test_indices = test_indices

        # Multi-label iff each label entry is itself a collection of labels.
        self.is_multi_label = isinstance(labels[0], (set, list, tuple))
        self.label_encoder = MultiLabelBinarizer(
        ) if self.is_multi_label else LabelBinarizer()
        # NOTE(review): .flatten() collapses a 2-D multi-label indicator
        # matrix to 1-D, which makes num_classes below report 1 — confirm
        # this is intended for the multi-label case.
        self.y = self.label_encoder.fit_transform(self.y).flatten()

    def update_test_indices(self, test_size=0.1):
        """Updates `test_indices` property with indices of `test_size` proportion.

        Args:
            test_size: The test proportion in [0, 1] (Default value: 0.1)
        """
        if self.is_multi_label:
            self._train_indices, self._test_indices = sampling.multi_label_train_test_split(
                self.y, test_size)
        else:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
            self._train_indices, self._test_indices = next(
                sss.split(self.X, self.y))

    def save(self, file_path):
        """Serializes this dataset to a file.

        Args:
            file_path: The file path to use.
        """
        utils.dump(self, file_path)

    def train_val_split(self, split_ratio=0.1):
        """Generates train and validation sets from the training indices.

        Args:
            split_ratio: The split proportion in [0, 1] (Default value: 0.1)

        Returns:
            The stratified train and val subsets. Multi-label outputs are handled as well.
        """
        if self.is_multi_label:
            train_indices, val_indices = sampling.multi_label_train_test_split(
                self.y, split_ratio)
        else:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=split_ratio)
            train_indices, val_indices = next(sss.split(self.X, self.y))
        return self.X[train_indices], self.X[val_indices], self.y[
            train_indices], self.y[val_indices]

    @staticmethod
    def load(file_path):
        """Loads the dataset from a file.

        Args:
            file_path: The file path to use.

        Returns:
            The `Dataset` instance.
        """
        return utils.load(file_path)

    @property
    def test_indices(self):
        # May be None when no test split has been assigned yet.
        return self._test_indices

    @test_indices.setter
    def test_indices(self, test_indices):
        # With no test indices, everything counts as training data.
        if test_indices is None:
            self._train_indices = np.arange(0, len(self.y))
        else:
            self._test_indices = test_indices
            self._train_indices = np.setdiff1d(np.arange(0, len(self.y)),
                                               self.test_indices)

    @property
    def train_indices(self):
        return self._train_indices

    @property
    def labels(self):
        # The class names learned by the label encoder during __init__.
        return self.label_encoder.classes_

    @property
    def num_classes(self):
        # Depends on the dimensionality of self.y after encoding/flattening.
        if len(self.y.shape) == 1:
            return 1
        else:
            return len(self.labels)
示例#55
0
"""If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\S+)' regexp as a *token_pattern* in the constructor of the vectorizer.

Now, use this transformation for the data and check again.
"""

tfidf_reversed_vocab[1976] ######### YOUR CODE HERE #############

"""### MultiLabel classifier

As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*.
"""

from sklearn.preprocessing import MultiLabelBinarizer

# Passing a fixed class list means fit_transform on the validation set
# cannot introduce new classes, so it behaves the same as transform here.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

"""Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. It might take some time, because a number of classifiers to train is large."""

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition.nmf import NMF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

def train_classifier(X_train, y_train):
    """
示例#56
0
def prepare_data_frame(visu=True, load_raw=False, filename="predict_movie_categories_dataframe.csv", save=True):
    """Build (or reload) the movie-genre dataframe and its label matrix.

    When ``load_raw`` is True, cleans the raw movie metadata, merges in
    keywords, keeps only the 10 most frequent genres, text-cleans the
    descriptive columns and optionally caches the result to CSV. Otherwise
    the cached CSV is reloaded and only the labels are re-binarized.

    :param visu: if True, print/plot diagnostic summaries at the end
    :param load_raw: rebuild from raw sources instead of the cached CSV
    :param filename: cache file name under up.data_processed_dir
    :param save: when rebuilding, whether to write the cache CSV
    :return: (df, mb, labels) — the dataframe, the fitted
        MultiLabelBinarizer, and the 2D binary genre label matrix
    """
    df = None
    labels = None
    mb = None
    print("start prepare data")
    start = timeit.default_timer()
    genres_with_occurences = {}
    words_with_occurences = {}
    if load_raw:
        df = processing.clean(processing.raw())
        df = df[['id', 'belongs_to_collection', 'genres', 'original_title', 'overview',
                 'tagline', 'title']]
        # Collapse the collection dict to just its name (or empty string).
        df['belongs_to_collection'] = df['belongs_to_collection'] \
            .apply(lambda x: x['name'] if 'name' in x.keys() else '')
        # Genres arrive as a list of dicts; keep only the names.
        df['genres'] = df['genres'].apply(lambda x: [e['name'] for e in x])

        keywords = keywords_processing.get_films_with_keywords(keywords_processing.raw())

        df = pd.merge(df, keywords, how='left', on=['id'])

        # Count genre occurrences (collect_occurences mutates the dict).
        genres_with_occurences = {}
        df['genres'].apply(lambda x: collect_occurences(x, genres_with_occurences))

        # Keep only the 10 most frequent genres.
        genres_with_occurences = {x: y for x, y in
                                  sorted(genres_with_occurences.items(),
                                         key=lambda e: e[1], reverse=True)[:10]
                                  }
        print(genres_with_occurences)

        # Drop unknown genres per film, then drop films with no genre left.
        df['genres'] = df['genres'].apply(lambda x: delIfNotKnown(x, genres_with_occurences.keys()))
        df = df[df['genres'].apply(lambda x: len(x) > 0)]
        # Prepare labels (as a 2D binary array)
        mb = MultiLabelBinarizer()
        labels = mb.fit_transform(df['genres'])

        # Normalise all the free-text columns.
        df['keywords'] = df['keywords'].apply(clean_text)
        df['overview'] = df['overview'].apply(clean_text)
        df['tagline'] = df['tagline'].apply(clean_text)
        df['title_not_modified'] = df['title']
        df['title'] = df['title'].apply(clean_text)
        df['original_title'] = df['original_title'].apply(clean_text)
        df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: x.replace(' ', ''))
        words_with_occurences = {}
        df['overview'].apply(lambda x: collect_occurences(x, words_with_occurences))
        words_with_occurences = {x: y for x, y in
                                 sorted(words_with_occurences.items(),
                                        key=lambda e: e[1], reverse=True)
                                 }

        # Concatenate the text columns into the single training feature.
        df['clean_x'] = df.apply(join, axis=1)
        df = df[['title_not_modified', 'clean_x', 'genres']]
        if save:
            df.to_csv(up.data_processed_dir + filename)
    else:
        # Cached path: genres were serialized as strings, parse them back.
        df = pd.read_csv(up.data_processed_dir + filename)
        df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x))
        mb = MultiLabelBinarizer()
        labels = mb.fit_transform(df['genres'])

    end = timeit.default_timer()
    print("End prepare data, time : ", end - start)
    if visu:
        print(df.head())
        print(df.columns)
        print(df.values[1])
        # Occurrence dicts are only populated on the load_raw path.
        if genres_with_occurences != {}:
            print('10 most present genres with their occurences :', genres_with_occurences)

            genres_df = pd.DataFrame({'Genre': list(genres_with_occurences.keys()),
                                      'Occurences': list(genres_with_occurences.values())}) \
                .set_index('Genre').rename_axis(None)

            genres_df = genres_df.sort_values('Occurences', ascending=True)

            genres_df.plot.barh()

            plt.show()
        if words_with_occurences != {}:
            print('Words with their occurences : ', words_with_occurences)

            words_df = pd.DataFrame({'Word': list(words_with_occurences.keys())[:100],
                                     'Occurences': list(words_with_occurences.values())[:100]}) \
                .set_index('Word').rename_axis(None)
            words_df = words_df.sort_values('Occurences', ascending=True)
            words_df.plot.barh(figsize=(15, 20))

            plt.show()

        print("Dataframe shape : ", df.shape)
        print(df.info())

        print("Labels")
        print(labels)
        print("Labels shape : ", labels.shape)
        print("For example, ", labels[0], "Stands for", mb.inverse_transform(labels)[0])
    return df, mb, labels
示例#57
0
    def train(self,
              max_iterations=10000,
              learning_rate=5e-4,
              units=32,
              hold_prob=1,
              gpu_id='/cpu:0'):
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            [str(n) for n in range(self.static_data['ngpus'])])
        print('lstm strart')
        H_train, H_val, H_test = self.create_inputs(self.X_train, self.X_val,
                                                    self.X_test)
        if not self.probabilistic:
            y_val = self.y_val
            y_test = self.y_test
            y_train = self.y_train
        else:
            classes = np.arange(0.1, 20, 0.2)

            y_val = np.digitize(self.y_val, classes, right=True)
            y_test = np.digitize(self.y_test, classes, right=True)
            y_train = np.digitize(self.y_train, classes, right=True)
            binarizer = MultiLabelBinarizer(classes=classes)
            y_train = binarizer.fit_transform(y_train)
            y_val = binarizer.transform(y_val)
            y_test = binarizer.transform(y_test)

        batch_size = np.min([100, int(self.N / 5)])
        tf.compat.v1.reset_default_graph()
        graph_lstm = tf.Graph()
        with graph_lstm.as_default():
            with tf.device(gpu_id):
                x1 = tf.compat.v1.placeholder(
                    'float',
                    shape=[None, H_train.shape[1], H_train.shape[2]],
                    name='input_data')
                y_pred_ = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=[None, y_train.shape[1]],
                    name='target_lstm')

            with tf.device(gpu_id):
                train_lstm, cost_lstm, accuracy_lstm, sse_lstm, rse_lstm, weights = self.build_graph(
                    x1, y_pred_, learning_rate, units, hold_prob)

        obj_old = np.inf * np.ones(4)
        obj_max = np.inf * np.ones(4)
        obj_min = np.inf * np.ones(4)
        batches = [
            np.random.choice(self.N, batch_size, replace=False)
            for _ in range(max_iterations + 1)
        ]

        path_group = self.static_data['path_group']
        cpu_status = joblib.load(os.path.join(path_group, 'cpu_status.pickle'))

        if sys.platform != 'linux':
            config = tf.compat.v1.ConfigProto(
                allow_soft_placement=True,
                intra_op_parallelism_threads=self.static_data['intra_op'],
                inter_op_parallelism_threads=1)
            config.gpu_options.allow_growth = True
        else:
            config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
        res = dict()
        self.best_weights = dict()
        best_iteration = 0
        best_glob_iterations = 0
        ext_iterations = max_iterations
        train_flag = True
        patience = 8000
        wait = 0
        loops = 0

        with tf.compat.v1.Session(graph=graph_lstm, config=config) as sess:

            sess.run(tf.compat.v1.global_variables_initializer())
            while train_flag:
                for i in tqdm(range(max_iterations)):
                    if i % 500 == 0:

                        sess.run([train_lstm],
                                 feed_dict={
                                     x1: H_train[batches[i]],
                                     y_pred_: y_train[batches[i]]
                                 })

                        acc_new_v, mse_new_v, sse_new_v, rse_new_v, weights_lstm = sess.run(
                            [
                                accuracy_lstm, cost_lstm, sse_lstm, rse_lstm,
                                weights
                            ],
                            feed_dict={
                                x1: H_val,
                                y_pred_: y_val,
                            })
                        acc_new_t, mse_new_t, sse_new_t, rse_new_t = sess.run(
                            [accuracy_lstm, cost_lstm, sse_lstm, rse_lstm],
                            feed_dict={
                                x1: H_test,
                                y_pred_: y_test
                            })

                        acc_new = 0.4 * acc_new_v + 0.6 * acc_new_t
                        mse_new = 0.4 * mse_new_v + 0.6 * mse_new_t
                        sse_new = 0.4 * sse_new_v + 0.6 * sse_new_t
                        rse_new = 0.4 * rse_new_v + 0.6 * rse_new_t

                        obj_new = np.array(
                            [acc_new, mse_new, sse_new, rse_new])
                        flag, obj_old, obj_max, obj_min = self.distance(
                            obj_new, obj_old, obj_max, obj_min)
                        if flag:
                            variables_names = [
                                v.name
                                for v in tf.compat.v1.trainable_variables()
                            ]
                            for k, v in zip(variables_names, weights_lstm):
                                self.best_weights[k] = v
                            res[str(i)] = obj_old
                            print(acc_new)

                            best_iteration = i
                            wait = 0
                        else:
                            wait += 1
                        if wait > patience:
                            train_flag = False
                            break
                    else:
                        sess.run(train_lstm,
                                 feed_dict={
                                     x1: H_train[batches[i]],
                                     y_pred_: y_train[batches[i]]
                                 })
                        wait += 1
                best_glob_iterations = ext_iterations + best_iteration
                if (max_iterations -
                        best_iteration) <= 5000 and max_iterations > 2000:
                    if loops > 3:
                        best_glob_iterations = ext_iterations + best_iteration
                        train_flag = False
                    else:
                        ext_iterations += 8000
                        max_iterations = 8000
                        best_iteration = 0
                        loops += 1
                else:
                    best_glob_iterations = ext_iterations + best_iteration
                    train_flag = False

            sess.close()

        model_dict = dict()
        model_dict['units'] = units
        model_dict['hold_prob'] = hold_prob
        model_dict['best_weights'] = self.best_weights
        model_dict['static_data'] = self.static_data
        model_dict['n_vars'] = self.D1
        model_dict['depth'] = self.D2
        model_dict['best_iteration'] = best_glob_iterations
        model_dict['metrics'] = obj_old
        model_dict['error_func'] = res
        print("Total accuracy lstm-3d: %s" % obj_old[0])

        return obj_old[0], self.scale_lstm, model_dict
示例#58
0

# Build the test set from raw DB rows and score a previously pickled
# one-vs-rest text classifier against it. (Python 2 script; relies on
# `traindata`, `content_pk`, `test_data`, `test_target`, `mlb`, `joblib`,
# `remove_stopwords`, `np` and `classification_report` being defined above.)
for row in traindata:
    # row[0]: raw text, row[1]: ';'-separated label string, row[2]: primary key
    content = str(remove_stopwords(row[0].encode('utf-8')))
    content_pk[content] = row[2]
    try:
        labels = set(row[1].split(';'))
    except:
        # NOTE(review): bare except — on failure `labels` keeps the previous
        # row's value (or is unbound on the first row), so a bad row silently
        # reuses stale labels. Consider `continue` here instead.
        print 'error'
    test_data.append(content)
    test_target.append(labels)
path = 'E://model.pkl'
# "测试样本量" = "test sample size"
print "测试样本量", len(test_data), len(test_target)
loaded_model = joblib.load(path)
X_test = np.array((test_data))
# NOTE(review): fit_transform here re-learns the label vocabulary from the
# test set; if `mlb` was fitted on the training labels, `transform` would be
# the safer call to keep column order consistent — verify against training.
y_test = mlb.fit_transform(test_target)
print("An ordering for the test class labels:")
print list(mlb.classes_)
predicted = loaded_model.predict(X_test)

# all_labels = mlb.inverse_transform(predicted)
# for item, labels in zip(X_test, all_labels):
#     pk_main_news = content_pk[item]
#     # print('{0} => {1}'.format(item, ', '.join(labels)))
#     sqlcursor.execute("update cctv_news_content set tname_by_classifier = %s where pk=%s", ((';'.join(labels)),pk_main_news))
#     sqlConn.commit()
print("Classification report on test set for classifier:")
print(classification_report(y_test, predicted))
# exit(0)
# # file produce
# # sqlcursor.execute("""SELECT 'MID', 'key_words', 'related_news','tname_by_classifier'
    return [int(x) for x in str_label if len(x)>0]

def convert_feature_to_vector(str_feature):
    """Parse a bracket-wrapped, comma-separated string into a list of floats.

    e.g. "[1.0, 2.5]" -> [1.0, 2.5]
    """
    # Strip the leading "[" and trailing "]", then convert each field.
    inner = str_feature[1:-1]
    return list(map(float, inner.split(',')))

# Train a linear-SVM one-vs-rest multilabel classifier on features/labels
# parsed from dataframe string columns, and report micro-averaged F1.
# (Python 2 script; relies on `train_df`, `test_df`, `np`, `time`,
# `MultiLabelBinarizer`, `train_test_split`, `OneVsRestClassifier`, `svm`
# and `f1_score` being defined/imported above.)
y_train = np.array([convert_label_to_array(y) for y in train_df['label']])
X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']])
# NOTE(review): X_test is built but never used below — the evaluation split
# is carved out of X_train instead. Verify whether test_df was meant to be
# scored here.
X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])


t=time.time()

mlb = MultiLabelBinarizer()
train_set_labels= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

# Fixed seed so the 80/20 split is reproducible across runs.
random_state = np.random.RandomState(0)
train_set_features, test_set_features, train_set_labels, test_set_true_labels = train_test_split(X_train, train_set_labels, test_size=.2, random_state=random_state)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(train_set_features, train_set_labels)

predicted_labels = classifier.predict(test_set_features)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"
print "F1 score: ", f1_score(test_set_true_labels, predicted_labels, average='micro')

示例#60
0
    def eval_classification(self,
                            truth_file,
                            pred_files,
                            eval_list,
                            plot_list,
                            over_all=False,
                            ensemble_voting="soft",
                            ensemble_weights=None,
                            class_names=None,
                            convert_binary=False,
                            binary_threshold=None):
        """Calculate the evaluation measures requested by the user for
        classification problems.

        Args:
          truth_file: A csv file containing ids and truth annotations.
          pred_files: A list of csv files containing ids and prediction
            annotations.
          eval_list: A list of evaluation measure names (case-insensitive).
          plot_list: A list of evaluation plots.
          over_all: Whether an overall result (in addition to class-wise
            results) is required for multiclass classification. Default False.
          ensemble_voting: Type of voting when multiple prediction files are
            given ("soft"/"hard"). Default "soft".
          ensemble_weights: Weights for each predictor in case of ensemble,
            default None.
          class_names: An array containing class names, default None.
          convert_binary: If multiclass predictions should be evaluated as a
            binary problem (normal vs abnormal); the first probability is
            taken as the normal class.
          binary_threshold: Threshold used for the multiclass-to-binary
            conversion.

        Returns:
          A tuple (eval_result, eval_plots).

        Raises:
          ValueError: if an entry of eval_list is not a supported measure.
        """
        self.eval_result = {}
        self.classes = None
        self.eval_plots = []
        self.read_data(truth_file, pred_files)
        eval_list = [element.strip().lower() for element in eval_list]
        if self.ensemble:
            self.eval_ensemble(ensemble_voting, ensemble_weights)
        if self.multilabel:
            # Multilabel annotations are stored as stringified lists;
            # parse them back into Python objects.
            self.truth = np.array([literal_eval(p) for p in self.truth])
            self.pred = np.array([literal_eval(p) for p in self.pred])
        classes = self.classes
        if class_names:
            classes = class_names
        # String truth labels are mapped to class indices.
        if isinstance(self.truth[0], str):
            self.truth = np.array([classes.index(tval) for tval in self.truth])
        if not self.multilabel:
            if len(self.pred.shape
                   ) > 1 and convert_binary and binary_threshold:
                # Collapse multiclass probabilities to binary: class 0 if the
                # normal-class probability clears the threshold, else class 1.
                self.pred_max = np.array([
                    0 if prd_n[0] >= binary_threshold else 1
                    for prd_n in self.pred
                ])
                self.truth = np.array(
                    [1 if truth_n != 0 else truth_n for truth_n in self.truth])
                classes = [self.classes[0], '!' + self.classes[0]]
            elif len(self.pred.shape) > 1:
                # Probability matrix: take the arg-max class per sample.
                self.pred_max = np.argmax(self.pred, axis=1)
            else:
                self.pred_max = self.pred
            # Derive per-class TP/FP/FN/TN counts from the confusion matrix.
            conf_matrix = metrics.confusion_matrix(self.truth, self.pred_max)
            true_pos = [0] * len(classes)
            false_pos = [0] * len(classes)
            false_neg = [0] * len(classes)
            true_neg = [0] * len(classes)
            col_sum = np.sum(conf_matrix, axis=0)
            row_sum = np.sum(conf_matrix, axis=1)
            cum_sum = np.sum(conf_matrix)
            for k in range(0, len(classes)):
                true_pos[k] += conf_matrix[k, k]
                false_pos[k] += col_sum[k] - true_pos[k]
                false_neg[k] += row_sum[k] - true_pos[k]
                true_neg[k] += cum_sum - true_pos[k] - \
                    false_pos[k] - false_neg[k]
        else:
            mlb = MultiLabelBinarizer()
            self.truth = mlb.fit_transform(self.truth)
            self.pred = mlb.transform(self.pred)
            classes = mlb.classes_
            # Encode truth as {0, 2} so that truth+pred uniquely identifies
            # each outcome per cell: 3=TP, 2=FN, 1=FP, 0=TN.
            self.truth = self.truth * 2
            np_sum = np.add(self.truth, self.pred)
            true_pos = [0] * len(classes)
            false_pos = [0] * len(classes)
            false_neg = [0] * len(classes)
            true_neg = [0] * len(classes)
            for i in range(0, len(classes)):
                true_pos[i] = np.sum(np_sum[:, i] == 3)
                true_neg[i] = np.sum(np_sum[:, i] == 0)
                false_pos[i] = np.sum(np_sum[:, i] == 1)
                false_neg[i] = np.sum(np_sum[:, i] == 2)

        # class wise evaluation
        for cls in classes:
            self.eval_result[cls] = {}
        for element in eval_list:
            if element in ['recall', 'true positive rate', 'sensitivity']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['recall'] = calc_recall(
                        true_pos[i], false_neg[i])
            elif element in ['specificity', 'true negative rate']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['specificity'] = calc_specificity(
                        true_neg[i], false_pos[i])
            elif element == 'accuracy':
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['accuracy'] = calc_acc(
                        true_pos[i], true_neg[i], false_pos[i], false_neg[i])
            elif element in ['f1_score', 'f1score', 'fscore']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['f1score'] = calc_f1score(
                        true_pos[i], false_pos[i], false_neg[i])
            elif element in ['precision', 'positive predictive value', 'ppv']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['precision'] = calc_precision(
                        true_pos[i], false_pos[i])
            elif element in ['negative predictive value', 'npv']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['npv'] = calc_npv(
                        true_neg[i], false_neg[i])
            elif element in ['false negative rate', 'fnr']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['fnr'] = calc_fnr(
                        false_neg[i], true_pos[i])
            elif element in ['false positive rate', 'fpr']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['fpr'] = calc_fpr(
                        false_pos[i], true_neg[i])
            elif element in ['false discovery rate', 'fdr']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['fdr'] = calc_fdr(
                        true_pos[i], false_pos[i])
            elif element in ['false omission rate', 'for']:
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['for'] = calc_for(
                        false_neg[i], true_neg[i])
            elif element in ['matthews correlatin coefficient',
                             'matthews correlation coefficient', 'mcc']:
                # Accept the correct spelling as well as the historical typo.
                for i, cls in enumerate(classes):
                    self.eval_result[cls]['mcc'] = calc_mcc(
                        true_pos[i], true_neg[i], false_pos[i], false_neg[i])
            elif element in ['kappa']:
                # Kappa is a global score; compute it once and replicate it
                # per class so the result structure stays uniform.
                # NOTE(review): self.pred_max is only set on the non-multilabel
                # path, so requesting 'kappa' for multilabel data raises
                # AttributeError — confirm intended behavior.
                kappa_val = calc_kappa(self.truth, self.pred_max)
                for cls in classes:
                    self.eval_result[cls]['kappa'] = kappa_val
            else:
                raise ValueError("invalid Evaluation Term")

        if plot_list:
            self.eval_plot_classification(plot_list, classes, true_pos,
                                          false_pos, false_neg, true_neg)
        if over_all:
            self.calc_overall()
        return self.eval_result, self.eval_plots