Example #1
def tran_file(filename):
	# dic_attr = name(filename1)
	fin = open(filename) # task1.trainSentence.clean
	dic_train = collections.defaultdict(dict)
	titlelist = []
	for line in fin:
		line = line.strip().split("\t")
		dic_train[line[0]][len(dic_train[line[0]])] = {"label": int(line[4]), "title": line[3], "pair": (line[1], line[2])}
		titlelist.append(line[3])
	dic_features = collections.defaultdict(dict)
	unigram = getGrams(titlelist, 3, 1)
	bigram = getGrams(titlelist, 3, 1)  # NOTE: called with the same arguments as the unigram pass above; presumably the final argument was meant to select bigrams
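	# map each Chinese relation label (同居 cohabitation, 昔日情敌 former love rivals, 闺蜜 best friend, 朋友 friend, 分手 break-up, 老师 teacher, 同学 classmate, 前女友 ex-girlfriend, 翻版 look-alike, 妻子 wife, 撞衫 same outfit, 同为校花 fellow campus belles, 绯闻女友 rumoured girlfriend, 偶像 idol, 暧昧 romantic ambiguity, 传闻不和 rumoured feud, 老乡 fellow townsperson) to its handcrafted feature function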
	dic_func = {"同居": tongju, "昔日情敌": xiriqingdi, "闺蜜": guimi, "朋友": pengyou, "分手": fenshou, "老师": laoshi, "同学": tongxue, "前女友": qiannvyou, "翻版": fanban, "妻子": qizi, "撞衫": zhuangshan, "同为校花": tongweixiaohua, "绯闻女友": feiwennvyou, "偶像": ouxiang, "暧昧": aimei, "传闻不和": chuanwenbuhe, "老乡":laoxiang}
	for relation in dic_train:
		namelist = []; labels = []; titlelist = []
		for id in dic_train[relation]:
			dic_ = dic_train[relation][id]
			namelist.append((len(namelist), dic_["pair"][0], dic_["pair"][1]))
			labels.append(dic_["label"])
			titlelist.append(dic_["title"])
		logging.info("%s:\t%d"%(relation, len(titlelist)))
		# features = dic_func[relation](namelist, dic_attr)
		uni_features = build_BOW(titlelist, unigram, 1)
		bi_features = build_BOW(titlelist, bigram, 1)
		features = uni_features
		features = bi_features  # NOTE: this overwrites the unigram features assigned above; only the bigram BOW reaches the classifier
		# for index, feature in enumerate(features):
		# 	# features[index] += uni_features[index]
		# 	features[index] += bi_features[index]
		dic_features[relation]["features"] = features
		dic_features[relation]["label"] = labels
		dic_features[relation]["namelist"] = namelist
		classifier(features, labels)
Example #2
def main():
    if input(
            "Do you want to scrape (s) new images or use an existing (e) folder? (s/e): "
    ) == 's':
        folder = scrape()
    else:
        folder = None
    if input(
            "\nWould you like to use classifier.py to remove outliers? (y/n): "
    ) != 'n':
        classifier(folder)
    if input("\nWould you like to use average.py to generate an image? (y/n): "
             ) != 'n':
        average(folder)
    return
Example #3
 def trainingModel(self):
     self.registerWorking.finishThread.emit()
     state = 0
     while True:
         if state == 0:
             # Pre-process
             obj = preprocessing(self.input_datadir, self.output_datadir)
             nrof_images_total, nrof_successfully_aligned = obj.alignProcessing(
             )
             print('Total number of images: %d' % nrof_images_total)
             print('Number of successfully aligned images: %d' %
                   nrof_successfully_aligned)
             state += 1
             # Classifier
         elif state == 1:
             print("Training Start")
             objModel = classifier(
                 mode='TRAIN',
                 datadir=self.datadir,
                 modeldir=self.modeldir,
                 classifierFilename=self.classifier_filename)
             get_file = objModel.main()
             sys.exit("All Done")
             state += 1  # unreachable: sys.exit() above ends the program
         else:
             break
Example #4
def welcome():
    if request.method == 'POST':
        comment = request.form['comment']
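        # classifier() returns a truthy value for spam, falsy for ham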
        label = 'Spam' if classifier(comment) else 'Ham'
        return render_template('home.html',
                               comment=comment,
                               label=label,
                               results=True)
    return render_template('home.html', results=False)
Example #5
def runIncrementalClustering(sClusterDirectory, sNewFPDirectory):
    newBSList = []
    fileList = []
    bitsetfromDirectory(sNewFPDirectory, newBSList, fileList)

    for i in range(len(fileList)):
        fileList[i] = join(sNewFPDirectory, fileList[i])

    pairDistanceList = []
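    # pairwise Jaccard distances among the new fingerprints; used below to pull close neighbours into a cluster together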
    d_ok = all_pair_distance(newBSList, JACCARD, pairDistanceList)

    for root, dirs, files in os.walk(sClusterDirectory):
        for directory in dirs:
            sCurrentClusterDirectory = join(sClusterDirectory, directory)
            clusterBSList = []
            clusterFileList = []
            bitsetfromDirectory(sCurrentClusterDirectory, clusterBSList,
                                clusterFileList)

            groupDistanceList = []
            group_distance(newBSList, clusterBSList, JACCARD,
                           groupDistanceList)

            setInsert = Set([])
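            # mark every new sample within 0.2 of this cluster, then also mark its close neighbours so they move together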

            for i in range(len(groupDistanceList)):
                flag = False
                for j in range(len(groupDistanceList[i])):
                    if groupDistanceList[i][j] <= 0.2:
                        flag = True
                        break

                if flag:
                    setInsert.add(i)

                    for j in range(len(pairDistanceList[i])):
                        if pairDistanceList[i][j] <= 0.2:
                            setInsert.add(j)

            indexList = sorted(setInsert, reverse=True)
            for index in indexList:
                shutil.copy(fileList[index], sCurrentClusterDirectory)
                del newBSList[index]
                del fileList[index]

                for i in range(len(pairDistanceList)):
                    del pairDistanceList[i][index]

                del pairDistanceList[index]

    classf = classifier(newBSList, pairDistanceList, fileList)
    classf.setThreshold(0.2)

    classf.clustering(SINGLE_LINKAGE)

    classf.separateFilesToClusters(sClusterDirectory, 1)
Example #6
def runDirectoryClustering(sDirectory):
    bsList = []
    distanceList = []
    fileList = []

    bitsetfromDirectory(sDirectory, bsList, fileList)
    d_ok = all_pair_distance(bsList, JACCARD, distanceList)
    classf = classifier(bsList, distanceList, [])
    classf.setThreshold(0.2)

    classf.clustering(SINGLE_LINKAGE)
    classf.printClusters(1)
Example #7
def accuracy():
    solver = classifier()
    correct = 0.0
    total = 0.0
    with open("validation.data", "r") as f:
        data = f.read().splitlines()
    for row in data:
        instance = row.split()
        predict = solver.solve(instance[1])
        if (instance[0] == predict):
            correct = correct + 1
        total = total + 1
    print "Naive Bayes accuracy: %f" % (correct / total)
Example #8
def runClustering(sPrefix):
    bsList = []
    distanceList = []

    bitsetFromCCS(sPrefix, bsList)
    d_ok = all_pair_distance(bsList, JACCARD, distanceList)

    classf = classifier(bsList, distanceList, [])
    classf.setThreshold(0.2)

    classf.clustering(SINGLE_LINKAGE)

    classf.printClusters(1)
Example #9
def accuracy():
    solver = classifier()
    correct = 0.0
    total = 0.0
    with open("validation.data", "r") as f:
        data = f.read().splitlines()
    for row in data:
        instance = row.split()
        predict = solver.solve(instance[1])
        if (instance[0] == predict):
            correct = correct + 1
        total = total + 1
    print("Naive Bayes accuracy: %f" % (correct / total))
Example #10
def runIncrementalClusteringFromOneDirectory(sFPDirectory, threshold,
                                             sDstDirectory):
    bss = []
    fpFiles = []
    bitsetfromDirectory(sFPDirectory, bss, fpFiles)

    distanceList = []
    d_ok = all_pair_distance_numpy(bss, distanceList)
    classf = classifier(bss, distanceList, fpFiles)
    classf.setThreshold(threshold)

    classf.clustering(SINGLE_LINKAGE)
    classf.separateFilesToClusters(sDstDirectory, 1)
Example #11
def classify(over_sampl, tf_idf, use_idf, pca, alphas, neighbors, slack,
             estimators, portion):
    """
    input:
        over_sampl: string variable to indicate the name of oversampling method 
        tf_idf: boolean variable to indicate whether to use tf or not
        use_idf: boolean variable to indicate whether to use idf or not
        pca: int variable to indicate whether to use PCA or not (<=0 means no, yes otherwise)
        alphas: NB tuning parameter
        neighbors: KNN tuning parameter
        slack: SVM tuning parameter
        estimators: GradientBoosting, AdaBoost tuning parameter
        portion: which airline data to work with (None means all airlines)
    """
    if not tf_idf:
        if pca > 0:
            return None
        else:
            message = "Preprocessing used is Word2Vec & Over Sampling method is  " + over_sampl + "  data portion  " + portion
    else:
        if use_idf:
            message = "Preprocessing used is tf-idf & Over Sampling method is  " + over_sampl + "   PCA dimension = " + str(
                pca) + "  data portion  " + portion
        else:
            message = "Preprocessing used is tf & Over Sampling method is  " + over_sampl + "   PCA dimension = " + str(
                pca) + "  data portion  " + portion
    # load dataset
    ds = get_dataset()
    X_train, X_test, Y_train, Y_test = ds.load_data(tf_idf=tf_idf,
                                                    use_idf=use_idf,
                                                    use_pca=pca,
                                                    airway_name=portion)
    if over_sampl == "RandomOverSampler":
        X_train, Y_train = RandomOverSampler().fit_sample(X_train, Y_train)
    elif over_sampl == "SMOTE":
        X_train, Y_train = SMOTE().fit_sample(X_train, Y_train)
    elif over_sampl == "ADASYN":
        X_train, Y_train = ADASYN().fit_sample(X_train, Y_train)
    clas = classifier()
    print(message)
    SVM_result, GB_result, AB_result, KNN_result, NB_result = clas.classify(
        X_train, X_test, Y_train, Y_test)
    compare_performance(SVM_result, GB_result, AB_result, KNN_result,
                        NB_result, message)
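A minimal call sketch for classify() above; the oversampling method, PCA setting, tuning values and the airline name "SampleAir" are illustrative assumptions, not values taken from the project:

# hypothetical invocation: tf-idf features, SMOTE oversampling, no PCA, one made-up airline
classify(over_sampl="SMOTE", tf_idf=True, use_idf=True, pca=0,
         alphas=1.0, neighbors=5, slack=1.0, estimators=100,
         portion="SampleAir")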
Example #12
def classify(i, p):
    differ = i
    if (differ == 0):
        #        p=[]
        #        print("\n\n\n")
        #        print(p)
        name = []
        #        p.append("E:/guifinal/pe/hh.exe")
        print("\n\n\n")
        print(p)
        print("\n\n\n")
        #        p=[['E:/guifinal/pe/HelpCtr.exe']]
        pp = p[:]
        for i in p:
            str = i[0]

            str = str[::-1]
            print("in for loop")
            print(str)
            #            str="exe.rtCpleH/ep/lanifiug/:E"
            for j in range(0, len(str)):
                if (str[j] == "/"):
                    break
            str = str[0:j:1]
            str = str[::-1]
            print(str)
            name.append(str)
        name
        col = funt.n
        print("\n\n\n")
        print(name)
        print(p)
        print(col)
        print("\n\n\n")
        df = create_df(col, p[0], name)

        pred = classifier(df)
        file = [pp, pred]
        with open('file.pkl', 'wb') as fid:
            cPickle.dump(file, fid)

    if (differ == 1):
        #        p=[]
        #        p.append("E:/gui/pe/")
        print("\n\n\n")
        path = p[0]
        path = path + '/'
        print(path)
        print("\n\n\n")
        col = funt.n
        file_list = [f for f in os.listdir(path)]

        print(file_list)
        name = file_list[:]
        for i in range(0, len(file_list)):
            file_list[i] = path + file_list[i]
        print("\n\n\n Below for")

        print(file_list)
        print("\n")
        print(name)

        df = create_df(col, file_list, name)
        pred = classifier(df)
        folder = [path, name, pred]

        with open('folder.pkl', 'wb') as fid:
            cPickle.dump(folder, fid)

    return pred
Example #13
    test_sentence_df=get_frame_feature(test_sentence_df)
    test_sentence_df=get_pos_tag(test_sentence_df)

    test_df=get_arg_pairs(test_sentence_df)

    test_df=merge_sen_df(test_sentence_df,test_df)
    test_df=get_arg_cosine_simialrity(test_df)
    test_df=get_lf_cosine_similarity(test_df)
    test_df=get_rf_cosine_similarity(test_df)

    test_df=get_entailment_score(test_df)
    test_df=get_pos_similarity(test_df)

    # classification, precision and F1 score are printed and the final dataframe with results is returned
    print("Training and testing " + str(fold_no + 1))
    result_df=classifier(test_df,train_df,train_process_file_path)

    if int(fold_no+1)==1:
        final_results_df=result_df
    else:
        final_results_df=pd.concat([final_results_df,result_df]).reset_index()
#preparing plot data
plot_data = final_results_df[['True_label', 'Classification_result', 'Probability of result']]
plot_data.columns = ['gold_role', 'srl_role', 'srl_score']

d = {}
for rid, rdata in plot_data.iterrows():
    grole = rdata['gold_role']
    srole = rdata['srl_role']
    sscore = rdata['srl_score']
    d[rid] = (grole, (srole, sscore))
Example #14
def main():
    start_whole = time.time()
    tau = 0.75
    num_instances = 200
    pert_left = True
    pert_right = True
    batch_size = 3
    epsilon = 0.25
    delta = 0.05
    initial_value = 20
    list_anchors = [''] * num_instances
    mean_vector = [0] * num_instances
    instance_counter = 0
    b = classifier('Olaf')
    for instance_index in range(0, num_instances):
        previous_anchor = []
        coverage_astar = 0.2
        start = time.time()
        perturbed_instances_left, instance_sentiment, instance_left, b, instance_info = Anchor.get_perturbations(
            True, False, b, instance_index)
        perturbed_instances_right, instance_sentiment, instance_right, b, instance_info = Anchor.get_perturbations(
            False, True, b, instance_index)
        instance = instance_left + instance_right
        print('instance', instance)
        perturbed_instances = [''] * len(perturbed_instances_right)
        for i in range(len(perturbed_instances_left)):
            perturbed_instances[
                i] = perturbed_instances_left[i] + perturbed_instances_right[i]
        print('pert instances', perturbed_instances)
        possible_anchor_list = Anchor.possible_anchor(previous_anchor,
                                                      instance, coverage_astar,
                                                      perturbed_instances)

        while possible_anchor_list != []:
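            # evaluate candidate anchors; keep the one whose lower confidence bound exceeds tau and whose coverage beats the best so far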
            bbest = Anchor.bbest_anchors(
                batch_size, possible_anchor_list, epsilon, delta,
                perturbed_instances_left, perturbed_instances_right,
                instance_sentiment, initial_value, pert_left, pert_right, b,
                instance_info[0], instance_info[1], instance_info[2],
                instance_info[3], instance_info[4], instance_info[5],
                instance_info[6])
            print('bbest', bbest, type(bbest))
            if max(Anchor.get_lb_vector(bbest)) > tau:
                for y in range(len(bbest)):
                    if bbest[y][1] > tau:
                        anchor_cov = Anchor.get_coverage(
                            bbest[y][3], perturbed_instances)
                        if anchor_cov > coverage_astar:
                            coverage_astar = anchor_cov
                            bbest_anchor = bbest[y]
            previous_anchor = [bbest[j][3] for j in range(len(bbest))]
            possible_anchor_list = Anchor.possible_anchor(
                previous_anchor, instance, coverage_astar, perturbed_instances)
        end = time.time()
        print(end - start)
        anchor_mean = bbest_anchor[0]
        final_anchor = bbest_anchor[3]
        print('anchor', final_anchor, anchor_mean)
        mean_vector[instance_index] = anchor_mean
        list_anchors[instance_index] = final_anchor
        instance_counter += 1
        print('mean vector', mean_vector)
        print('instance counter', instance_counter)

    fid_mean = statistics.mean(mean_vector)
    fid_stdev = statistics.stdev(mean_vector)
    end_whole = time.time()
    print('mean vector', mean_vector)
    print('anchor list', list_anchors)
    print('fidelity', fid_mean, fid_stdev)

    print(end_whole - start_whole)
    return bbest
Example #15
e1 = Encoder()
d1 = Decoder()

e5 = Encoder()
d5 = Decoder()

i1 = Input(input_shape)
i5 = Input(input_shape)

auto1 = d1(e1(i1))
auto5 = d5(e5(i5))

cross51 = d1(e5(i5))
cross15 = d5(e1(i1))
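# the classifier scores the cross-domain reconstructions (domain 1 -> 5 and domain 5 -> 1)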

D15 = classifier(cross15)
D51 = classifier(cross51)

M11 = Model(i1, auto1)
M15 = Model(i1, D15)
M55 = Model(i5, auto5)
M51 = Model(i5, D51)

M11.compile(optimizer='RMSProp', loss='mse', metrics=['accuracy'])
M55.compile(optimizer='RMSProp', loss='mse', metrics=['accuracy'])
M15.compile(optimizer='RMSProp',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
M51.compile(optimizer='RMSProp',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
Example #16
File: utils.py  Project: ywz1993/oc-cnn
def choose_classifier(dataset, class_number, model_type, model, classifier, D, hyper_para, train_data, test_data, test_label, no_train_data, no_test_data, inm, relu, m, s):

	if(hyper_para.verbose==True):
		print('Extracting features.....')

	train_features = np.memmap('../../temp_files/train_features_temp.bin', dtype='float32', mode='w+', shape=(no_train_data,hyper_para.D))
	train_features = torch.from_numpy(train_features)

	for i in range(no_train_data):
		temp = model(torch.autograd.Variable(train_data[i:(i+1)].cuda().contiguous().float())).float()
		temp = temp.view(1,1,hyper_para.D)
		temp = inm(temp)
		temp = relu(temp.view(hyper_para.D))
		train_features[i:(i+1)] = temp.data.cpu()
	train_data = None

	if(hyper_para.verbose==True):
		print('Features extracted.')

	## test on the test set
	test_features = np.memmap('../../temp_files/test_features_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,hyper_para.D))
	test_scores   = np.memmap('../../temp_files/test_scores_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,1))
	test_features = torch.from_numpy(test_features)

	if(hyper_para.verbose==True):
		print('Computing test scores and AUC....')

	area_under_curve=0.0
	if(hyper_para.classifier_type=='OC_CNN'):
		test_scores   = torch.from_numpy(test_scores)
		k=0
		print(np.shape(test_features))
		start = time.time()
		for j in range(no_test_data):
			temp = model(AddNoise(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float()), hyper_para.sigma1)).float()
			temp = temp.view(1,1,hyper_para.D)
			temp = inm(temp)
			temp = temp.view(hyper_para.D)
			
			test_features[k:(k+1)] = temp.data.cpu()
			test_scores[k:(k+1)]   = classifier(relu(temp)).data.cpu()[1]
			# print(classifier(relu(temp)).data.cpu())
			
			k = k+1
		end = time.time()
		print(end-start)
		test_scores    = test_scores.numpy()
		test_features  = test_features.numpy()
		train_features = train_features.numpy()

		test_scores = (test_scores-np.min(test_scores))/(np.max(test_scores)-np.min(test_scores))

	elif(hyper_para.classifier_type=='OC_SVM_linear'):
		# train one-class svm
		oc_svm_clf = svm.OneClassSVM(kernel='linear', nu=float(hyper_para.N))
		oc_svm_clf.fit(train_features.numpy())
		k=0
		mean_kwn = np.zeros( (no_test_data,1) )
		for j in range(no_test_data):
			temp = model(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float())).float()
			temp = temp.view(1,1,hyper_para.D)
			temp = inm(temp)
			temp = temp.view(hyper_para.D)			
			test_features[k:(k+1)] = temp.data.cpu()
			temp 				   = np.reshape(relu(temp).data.cpu().numpy(), (1, hyper_para.D))
			test_scores[k:(k+1)]   = oc_svm_clf.decision_function(temp)[0]

			k = k+1

		test_features  = test_features.numpy()
		train_features = train_features.numpy()

		joblib.dump(oc_svm_clf,'../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number) +'/'+
																				model_type+'_OCCNNlin'    +'_'+
																				str(hyper_para.iterations)+'_'+
																				str(hyper_para.lr)		  +'_'+
																				str(hyper_para.sigma)	  +'_'+
																				str(hyper_para.N)         +'.pkl')

	fpr, tpr, thresholds = metrics.roc_curve(test_label, test_scores)
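	# NOTE: in this snippet area_under_curve keeps its initial value of 0.0; the ROC curve is computed but metrics.auc(fpr, tpr) is not applied to it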

	if(hyper_para.verbose==True):
		print('Test scores and AUC computed.')

	return area_under_curve, train_features, test_scores, test_features
Example #17
        X_test = np.concatenate(X_test, axis=0)

        # extract labels of every considered points
        y_train = []
        for cloud, indices in zip(train_clouds, train_indices):
            y_train.append(cloud.labels[indices])
        y_train = np.concatenate(y_train)

        y_test = []
        for cloud, indices in zip(test_clouds, test_indices):
            y_test.append(cloud.labels[indices])
        y_test = np.concatenate(y_test)

        # eventually train the classifier, predict and evaluate labels
        # (here, 'cloud' is the last element of test_clouds)
        clf = classifier(X_train, X_test, y_train, y_test, cloud.label_names)
        rf = clf.random_forest()
        y_pred, measures = clf.evaluate(rf, results_dir)
        t1 = time.time()
        print('Done in %.0f seconds' % (t1 - t0))

        print("Evaluation : {}% of points from the testing set were correctly classified.\n".format(np.round(measures["accuracy"],2)*100))
        mess = "Other available measures (considered classes : {}): \n\t- recall by class (%) : {}\n\t- precision by class (%) : {}\n\t- F by class (%) : {}\n\t- mean recall : {}%\n\t- mean precision : {}%\n\t- global F : {}%"

        print(mess.format("'"+"', '".join([cloud.label_names[l] for l in measures["considered_labels"]])+"'",
                            format_val(measures["recall_by_class"]),
                            format_val(measures["precision_by_class"]),
                            format_val(measures["F_by_class"]),
                            format_val(measures["mean_recall"]),
                            format_val(measures["mean_precision"]),
                            format_val(measures["global_F"])))
Example #18
    "PaleSkin", "PointyNose", "RecedingHairline", "RosyCheeks", "Sideburn",
    "Smiling", "StraightHair", "WavyHair", "WearingEarrings", "WearingHat",
    "WearingLipstick", "WearingNecklace", "WearingNecktie", "Young"
]

### Change me! ###
target_indices = feature_names.index(
    "Smiling"
)  # Feel free to change this value to any string from feature_names!

noise = get_noise(n_images, z_dim).to(device).requires_grad_()
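# gradient steps on the noise: push the classifier's score for the target feature upward by following its gradient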
for i in range(grad_steps):
    opt.zero_grad()
    fake = gen(noise)
    fake_image_history += [fake]
    fake_classes_score = classifier(fake)[:, target_indices].mean()
    fake_classes_score.backward()
    noise.data = calculate_updated_noise(noise, 1 / grad_steps)

plt.rcParams['figure.figsize'] = [n_images * 2, grad_steps * 2]
show_tensor_images(torch.cat(fake_image_history[::skip], dim=2),
                   num_images=n_images,
                   nrow=n_images)

fake_image_history = []
### Change me! ###
target_indices = feature_names.index(
    "Smiling"
)  # Feel free to change this value to any string from feature_names from earlier!
other_indices = [
    cur_idx != target_indices for cur_idx, _ in enumerate(feature_names)
Example #19
from classifier import *

setup()

string1 = "Volvió a casarse en 1118 con Agnés de Garlande, hija de Anseau de Garlande, señor de Rochefort-en-Yvelines, y de Beatrice de Rochefort. De esta unión nacieron"
string2 = "Our teacher warned him not to be late again"

print(classifier(string1))
print(classifier(string2))
Example #20
if real_exp:
    f.write('Starting training...\n')

total_iter = 0

for epoch in range(epoch_num):
    corrects = 0.0
    for i, data in enumerate(train_dataloader, 0):
        if total_iter % validate_frequency == 0:
            data = next(iter(validation_dataloader))
            inputs = data["image"]
            labels = data["class"]

            inputs, labels = Variable(inputs), Variable(labels)
            output = classifier(inputs)
            loss = criterion(output, labels)

            temp = output[:, 1].data.numpy()
            temp = np.apply_along_axis(lambda x: np.rint(np.exp(x)), 0, temp)
            temp = torch.from_numpy(temp).long()
            num = torch.sum(temp == labels.data)
            if type(num) is not int:
                num = num.item()

            accuracy = num / float(batch_size)

            update = None if draw_validation_graphs is None else 'append'
            draw_validation_graphs = vis.line(
                X=np.array([total_iter]),
                Y=np.array([loss.data[0]]),
Example #21
def runDayClustering(sFamilyName, sFPDirectory, sMD5Directory, sDstDirectory,
                     threshold):
    familyFiles = [
        f for f in listdir(sMD5Directory)
        if isfile(join(sMD5Directory, f)) and f.endswith('.top5.family.txt')
    ]
    familyFiles.sort()

    fFirstDay = open(join(sMD5Directory, familyFiles[0]))
    md5List = []

    while True:
        line = fFirstDay.readline()
        if not line:
            break
        tmpList = line.split()
        if cmp(sFamilyName, tmpList[1]) == 0:
            md5List.append(tmpList[0])

    fFirstDay.close()

    fpFiles = []
    bsList = []
    sCurrentFPDirectory = join(sFPDirectory, familyFiles[0][0:-16])
    bitsetfromDirectoryMD5(sCurrentFPDirectory, bsList, fpFiles, md5List)

    #print len(bsList), len(fpFiles)
    distanceList = []
    d_ok = all_pair_distance(bsList, JACCARD, distanceList)
    classf = classifier(bsList, distanceList, fpFiles)
    classf.setThreshold(threshold)

    classf.clustering(SINGLE_LINKAGE)
    classf.separateFilesToClusters(sDstDirectory, 1)

    index = 1

    while index < len(familyFiles):

        print index, familyFiles[index]
        fDay = open(join(sMD5Directory, familyFiles[index]))
        md5List = []
        while True:
            line = fDay.readline()
            if not line:
                break

            tmpList = line.split()
            if cmp(sFamilyName, tmpList[1]) == 0:
                md5List.append(tmpList[0])

        fDay.close()

        newBSList = []
        newFileList = []
        sCurrentFPDirectory = join(sFPDirectory, familyFiles[index][:-16])
        bitsetfromDirectoryMD5(sCurrentFPDirectory, newBSList, newFileList,
                               md5List)
        pairDistanceList = []
        d_ok = all_pair_distance(newBSList, JACCARD, pairDistanceList)

        for root, dirs, files in os.walk(sDstDirectory):
            for directory in dirs:
                sCurrentClusterDirectory = join(sDstDirectory, directory)
                clusterBSList = []
                clusterFileList = []
                bitsetfromDirectory(sCurrentClusterDirectory, clusterBSList,
                                    clusterFileList)

                groupDistanceList = []
                group_distance(newBSList, clusterBSList, JACCARD,
                               groupDistanceList)

                setInsert = Set([])

                for i in range(len(groupDistanceList)):
                    flag = False
                    for j in range(len(groupDistanceList[i])):
                        if groupDistanceList[i][j] <= threshold:
                            flag = True
                            break
                    if flag:
                        setInsert.add(i)
                        for j in range(len(pairDistanceList[i])):
                            if pairDistanceList[i][j] <= threshold:
                                setInsert.add(j)

                indexList = sorted(setInsert, reverse=True)
                for indexDel in indexList:
                    shutil.copy(newFileList[indexDel],
                                sCurrentClusterDirectory)
                    del newBSList[indexDel]
                    del newFileList[indexDel]

                    for i in range(len(pairDistanceList)):
                        del pairDistanceList[i][indexDel]

                    del pairDistanceList[indexDel]

        if len(newFileList) > 0:
            classf1 = classifier(newBSList, pairDistanceList, newFileList)
            classf1.setThreshold(threshold)

            classf1.clustering(SINGLE_LINKAGE)
            classf1.separateFilesToClusters(sDstDirectory, 1)
        index += 1
Example #22
def main_uni():
    begin = time.time()
    model = 'Maria'
    isWSP = False
    batch_size = 200  # batch size used when predicting the perturbed instances
    num_samples = 5000  # number of perturbations; must be divisible by batch_size
    seed = 2020
    width = 1.0
    K = 5  # number of coefficients to check
    B = 10  # number of instances to get
    input_file = 'data/programGeneratedData/300remainingtestdata2016.txt'
    model_path = 'trainedModelOlaf/2016/-18800'
    f = classifier(model)
    dict = f.get_Allinstances()
    r = check_random_state(seed)
    if (isWSP):
        write_path = 'data/Lime/WSPfh' + model + str(2016) + 'final'
    else:
        write_path = 'data/Lime/SPfh' + model + str(2016) + 'final'

    # Estimating LIME with multinomial logistic regression
    fidelity = []
    correct_hit = 0
    x_left = dict['x_left']
    x_left_len = dict['x_left_len']
    x_right = dict['x_right']
    x_right_len = dict['x_right_len']
    target_word = dict['target']
    target_words_len = dict['target_len']
    y_true = dict['y_true']
    true_label = dict['true_label']
    pred = dict['pred']
    size = dict['size']
    left_words = []
    right_words = []
    all_words = []

    targets = []
    x_len = []
    coefs = []
    pred_b, prob = f.get_allProb(x_left, x_left_len, x_right, x_right_len,
                                 y_true, target_word, target_words_len, size,
                                 size)
    with open(write_path + '.txt', 'w') as results:
        for index in range(size):
            x_inverse_left, x_lime_left, x_lime_left_len = lime_perturbation(
                r, x_left[index], x_left_len[index], num_samples)
            x_inverse_right, x_lime_right, x_lime_right_len = lime_perturbation(
                r, x_right[index], x_right_len[index], num_samples)

            target_lime_word = np.tile(target_word[index], (num_samples, 1))
            target_lime_word_len = np.tile(target_words_len[index],
                                           (num_samples))
            y_lime_true = np.tile(y_true[index], (num_samples, 1))

            # predicting the perturbations
            pred_c, probabilities = f.get_allProb(
                x_lime_left, x_lime_left_len, x_lime_right, x_lime_right_len,
                y_lime_true, target_lime_word, target_lime_word_len,
                batch_size, num_samples)

            neg_labels = labels(pred_c)
            # Getting the weights
            x_w = np.append(x_left[index][0:x_left_len[index]],
                            x_right[index][0:x_right_len[index]])
            x_w_len = x_left_len[index] + x_right_len[index]
            x_len.append(x_w_len)
            x_lime_len = x_lime_left_len + x_lime_right_len
            x_lime = np.concatenate((x_lime_left, x_lime_right), axis=1)
            weights_all = get_weights(f, x_w, x_lime, x_w_len, x_lime_len,
                                      width)

            model_all = LogisticRegression(multi_class='ovr',
                                           solver='newton-cg')

            n_neg_labels = len(neg_labels)

            x_all = np.concatenate((x_inverse_left, x_inverse_right), axis=1)

            if n_neg_labels > 0:
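                # pad with one dummy zero-weight sample per missing class so the logistic regression sees all three sentiment classes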
                for label in neg_labels:
                    pred_c = np.append(pred_c, label)
                    x_all = np.concatenate(
                        (x_all,
                         np.zeros(
                             (1, x_left_len[index] + x_right_len[index]))),
                        axis=0)
                    weights_all = np.append(weights_all, 0)

                model_all.fit(x_all, pred_c, sample_weight=weights_all)
                pred_c = pred_c[:-n_neg_labels]
                x_all = x_all[:-n_neg_labels, :]
            else:
                model_all.fit(x_all, pred_c, sample_weight=weights_all)

            yhat = model_all.predict(x_all)
            if (int(yhat[0]) == int(pred_b[index])):
                correct_hit += 1
            _, acc = compare_preds(yhat, pred_c)
            fidelity.append(acc)

            # words:
            left_words.append(f.get_String_Sentence(x_lime_left[0]))
            right_words.append(f.get_String_Sentence(x_lime_right[0]))
            all_words.append(
                f.get_String_Sentence(x_lime_left[0]) +
                f.get_String_Sentence(x_lime_right[0]))
            targets.append(f.get_String_Sentence(target_word[index]))

            coefs.append(model_all.coef_)
            intercept = model_all.intercept_
            classes = model_all.classes_

            results.write('Instance ' + str(index) + ':' + '\n')
            results.write('True Label: ' + str(true_label[index]) +
                          ', Predicted label: ' + str(int(pred[index])) + '\n')
            results.write('\n')
            results.write('Intercept: ' + str(intercept) + '\n')
            results.write('\n')
            results.write('Left words: ' + str(left_words[index]) + '\n')
            results.write('\n')
            temp = right_words.copy()
            temp[index].reverse()
            results.write('Right words: ' + str(temp[index]) + '\n')
            results.write('\n')
            results.write('All words: ' + str(all_words[index]) + '\n')
            results.write('Target words: ' + str(targets[index]) + '\n')
            results.write('\n')
            results.write(
                '________________________________________________________' +
                '\n')

        neg_coefs_k = []
        neu_coefs_k = []
        pos_coefs_k = []
        all_coefs_k = []
        e_ij = []
        sum_coefs_k = []

        all_words_k = []
        dict_I = {}
        for i in range(size):
            K = 4
            if (K > int(x_len[i])):
                K = int(x_len[i])

            ##getting the B instances according to (W)SP
            neg_coefs = coefs[i][0]
            neu_coefs = coefs[i][1]
            pos_coefs = coefs[i][2]

            sum_coefs = np.zeros(len(neg_coefs))
            for k in range(len(neg_coefs)):
                sum_coefs[k] += np.absolute(neg_coefs[k]) + np.absolute(
                    neu_coefs[k]) + np.absolute(pos_coefs[k])  # the original summed neg_coefs twice and omitted neu_coefs

            coefs_maxargs = np.argpartition(sum_coefs, -K)[-K:]
            neg_coefs_k.append(neg_coefs[coefs_maxargs])
            neu_coefs_k.append(neu_coefs[coefs_maxargs])
            pos_coefs_k.append(pos_coefs[coefs_maxargs])

            sum_coefs_k.append(sum_coefs[coefs_maxargs])
            e_ij.append(sum_coefs[coefs_maxargs])

            all_coefs_k.append(
                [neg_coefs_k[i], neu_coefs_k[i], pos_coefs_k[i]])

            temp = np.array(all_words[i])
            all_words_k.append(temp[coefs_maxargs])

            for j, word in enumerate(all_words_k[i]):
                if (inDict(dict_I, word)):
                    dict_I[word] += e_ij[i][j]
                else:
                    dict_I[word] = e_ij[i][j]

            results.write('Instance: ' + str(i))
            results.write('k words: ' + str(all_words_k[i]) + '\n')
            results.write('\n')
            results.write('Neg coefs k: ' + str(neg_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Neu coefs k: ' + str(neu_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Pos coefs k: ' + str(pos_coefs_k[i]) + '\n')
            results.write('\n')
            results.write(
                '________________________________________________________' +
                '\n')
        results.close()

    picked_instances_all = WSP(dict_I, all_words_k, sum_coefs_k, B, isWSP)

    with open(write_path + 'K_instances' + '.txt', 'w') as results:
        for i in picked_instances_all:
            results.write('picked instance ' + str(i) + ":")
            results.write(' True Label: ' + str(true_label[i]) +
                          ', Predicted label: ' + str(int(pred[i])) + '\n')
            results.write('\n')
            results.write('Sentence: ' + str(left_words[i]) + str(targets[i]) +
                          str(right_words[i]) + '\n')
            results.write('\n')
            results.write('coefs: ' + str(coefs[i]) + '\n')
            results.write('\n')
            results.write('k words: ' + str(all_words_k[i]) + '\n')
            results.write('\n')
            results.write('Neg coefs k: ' + str(neg_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Neu coefs k: ' + str(neu_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Pos coefs k: ' + str(pos_coefs_k[i]) + '\n')
            results.write('\n')

            results.write('target: ' + str(targets[i]) + '\n')
            results.write(
                '___________________________________________________________________'
                + '\n')
        results.write('\n')
        results.write('Hit Rate measure:' + '\n')
        results.write('Correct: ' + str(correct_hit) + ' hit rate: ' +
                      str(correct_hit / size) + '\n')
        results.write('\n')
        results.write('Fidelity All measure: ' + '\n')
        mean = np.mean(fidelity)
        std = np.std(fidelity)
        results.write('Mean: ' + str(mean) + '  std: ' + str(std))

    end = time.time()
    print('It took: ' + str(end - begin) + ' Seconds')
Example #23
 def getResult(self):
     dataextraction()
     preprocess()
     result = classifier()
     return result
Example #24
def main_pos():
    begin = time.time()
    model = 'Maria'
    #isWSP = False
    batch_size = 200  # batch size used when predicting the perturbed instances
    num_samples = 5000  # number of perturbations; must be divisible by batch_size
    seed = 2020
    width = 1.0
    K = 5  # number of coefficients to check
    B = 10  # number of instances to get
    nlp = en_core_web_lg.load()
    neighbors = Neighbors(nlp)
    f = classifier(model)
    dict = f.get_Allinstances()
    r = check_random_state(seed)
    write_path = 'data/Lime2/test' + model + str(2016) + 'final'

    # Estimating LIME with multinomial logistic regression
    n_all_features = len(f.word_id_mapping)
    fidelity = []
    correct_hit = 0
    x_left = dict['x_left']
    x_left_len = dict['x_left_len']
    x_right = dict['x_right']
    x_right_len = dict['x_right_len']
    target_word = dict['target']
    target_words_len = dict['target_len']
    y_true = dict['y_true']
    true_label = dict['true_label']
    pred = dict['pred']
    size = dict['size']
    left_words = []
    right_words = []
    all_words = []

    targets = []
    x_len = []
    coefs = []
    size = 10

    pred_b, prob = f.get_allProb(x_left, x_left_len, x_right, x_right_len,
                                 y_true, target_word, target_words_len, size,
                                 size)
    original_x = []
    with open(write_path + '.txt', 'w') as results:
        for index in range(size):
            pertleft, instance_sentiment, text, _, x = get_perturbations(
                True, False, neighbors, f, index, num_samples)
            pertright, instance_sentiment, text, _, x = get_perturbations(
                False, True, neighbors, f, index, num_samples)
            orig_left_x = x_left[index]
            orig_right_x = x_right[index]
            Z = np.zeros((num_samples, n_all_features))
            X = np.zeros((n_all_features))
            X[orig_left_x] += 1
            X[orig_right_x] += 1
            X = X.reshape(1, -1)
            predictions_f = []
            x_lime = np.zeros(
                (num_samples, x_left_len[index] + x_right_len[index]))
            x_lime_left = np.zeros((num_samples, FLAGS.max_sentence_len))
            x_lime_right = np.zeros((num_samples, FLAGS.max_sentence_len))
            print('Time after perturbation: ' + str(time.time() - begin) +
                  ' Seconds')
            for i in range(num_samples):

                x_left_ids = f.to_input(pertleft[i].split())
                x_right_ids = f.to_input(pertright[i].split())
                x_lime_left[i, :] = x_left_ids
                x_lime_right[i, :] = x_right_ids

                x_lime[i,
                       0:x_left_len[index] + x_right_len[index]] = np.append(
                           x_left_ids[0][0:x_left_len[index]],
                           x_right_ids[0][0:x_right_len[index]])
                Z[i, x_left_ids] += 1
                Z[i, x_right_ids] += 1
                #pred_f, _ = f.get_prob(x_left_ids, x[1], x_right_ids, x[3], x[4], x[5], x[6])
                #predictions_f.append(pred_f)

            target_lime_word = np.tile(target_word[index], (num_samples, 1))
            target_lime_word_len = np.tile(target_words_len[index],
                                           (num_samples))
            y_lime_true = np.tile(y_true[index], (num_samples, 1))
            x_lime_left_len = np.tile(x[1], (num_samples))
            x_lime_right_len = np.tile(x[3], (num_samples))

            # predicting the perturbations
            predictions_f, _ = f.get_allProb(x_lime_left, x_lime_left_len,
                                             x_lime_right, x_lime_right_len,
                                             y_lime_true, target_lime_word,
                                             target_lime_word_len, batch_size,
                                             num_samples)

            neg_labels = labels(predictions_f)
            # Getting the weights
            orig_x = np.append(orig_left_x[0:x_left_len[index]],
                               orig_right_x[0:x_right_len[index]])
            original_x.append(orig_x)
            orig_x_len = int(x_left_len[index] + x_right_len[index])
            x_len.append(orig_x_len)
            z_len = np.tile(orig_x_len, num_samples)
            x_lime = np.asarray(x_lime, int)

            weights_all = get_weights(f, orig_x, x_lime, orig_x_len, z_len,
                                      width)

            model_all = LogisticRegression(multi_class='ovr',
                                           solver='newton-cg')

            n_neg_labels = len(neg_labels)

            if n_neg_labels > 0:
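                # pad Z with a zero row and zero weight for each sentiment class missing from the perturbation predictions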
                for label in neg_labels:
                    predictions_f = np.append(predictions_f, label)
                    Z = np.concatenate((Z, np.zeros((1, n_all_features))),
                                       axis=0)
                    weights_all = np.append(weights_all, 0)

                model_all.fit(Z, predictions_f, sample_weight=weights_all)
                predictions_f = predictions_f[:-n_neg_labels]
                Z = Z[:-n_neg_labels, :]
            else:
                model_all.fit(Z, predictions_f, sample_weight=weights_all)

            yhat = model_all.predict(X)

            if (int(yhat[0]) == int(pred_b[index])):
                correct_hit += 1
            print(pertleft[0].split())
            print(pertright[0].split())
            print(x_lime)
            get_predStats(predictions_f)
            print('Current instance: ' + str(index))
            print('Correct hit: ' + str(correct_hit))
            print('Current runtime: ' + str(time.time() - begin) + ' seconds')
            yhat = model_all.predict(Z)

            _, acc = compare_preds(yhat, predictions_f)
            fidelity.append(acc)

            # words:
            left_words.append(f.get_String_Sentence(orig_left_x))
            right_words.append(f.get_String_Sentence(orig_right_x))
            all_words.append(
                f.get_String_Sentence(orig_left_x) +
                f.get_String_Sentence(orig_right_x))
            targets.append(f.get_String_Sentence(target_word[index]))

            coefs.append(model_all.coef_)
            intercept = model_all.intercept_
            classes = model_all.classes_

            results.write('Instance ' + str(index) + ':' + '\n')
            results.write('True Label: ' + str(true_label[index]) +
                          ', Predicted label: ' + str(int(pred[index])) + '\n')
            results.write('\n')
            results.write('Intercept: ' + str(intercept) + '\n')
            results.write('\n')
            results.write('Left words: ' + str(left_words[index]) + '\n')
            results.write('\n')
            temp = right_words.copy()
            temp[index].reverse()
            results.write('Right words: ' + str(temp[index]) + '\n')
            results.write('\n')
            results.write('All words: ' + str(all_words[index]) + '\n')
            results.write('Target words: ' + str(targets[index]) + '\n')
            results.write('\n')
            results.write(
                '________________________________________________________' +
                '\n')

        neg_coefs_k = []
        neu_coefs_k = []
        pos_coefs_k = []
        all_coefs_k = []
        e_ij = []
        sum_coefs_k = []
        all_words_k = []
        dict_I = {}

        for i in range(size):
            K = 4
            if (K > int(x_len[i])):
                K = int(x_len[i])

            ##getting the B instances according to (W)SP
            neg_coefs = coefs[i][0]
            neu_coefs = coefs[i][1]
            pos_coefs = coefs[i][2]

            sum_coefs = np.zeros(len(neg_coefs))
            for j in original_x[i]:
                sum_coefs[j] += np.absolute(neg_coefs[j]) + np.absolute(
                    neu_coefs[j]) + np.absolute(pos_coefs[j])  # the original summed neg_coefs twice and omitted neu_coefs

            coefs_maxargs = np.argpartition(sum_coefs, -K)[-K:]
            neg_coefs_k.append(neg_coefs[coefs_maxargs])
            neu_coefs_k.append(neu_coefs[coefs_maxargs])
            pos_coefs_k.append(pos_coefs[coefs_maxargs])

            sum_coefs_k.append(sum_coefs[coefs_maxargs])

            e_ij.append(sum_coefs[coefs_maxargs])

            all_coefs_k.append(
                [neg_coefs_k[i], neu_coefs_k[i], pos_coefs_k[i]])
            all_words_k.append(f.get_String_Sentence(coefs_maxargs))
            #temp = np.array(all_words[i])
            #all_words_k.append(temp[coefs_maxargs])

            for j, word in enumerate(all_words_k[i]):
                if (inDict(dict_I, word)):
                    dict_I[word] += e_ij[i][j]
                else:
                    dict_I[word] = e_ij[i][j]

            results.write('Instance: ' + str(i) + '\n')
            results.write('k words: ' + str(all_words_k[i]) + '\n')
            results.write('\n')
            results.write('Neg coefs k: ' + str(neg_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Neu coefs k: ' + str(neu_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Pos coefs k: ' + str(pos_coefs_k[i]) + '\n')
            results.write('\n')
            results.write(
                '________________________________________________________' +
                '\n')
        results.close()

    picked_instances_all = WSP(dict_I, all_words_k, sum_coefs_k, B, True)

    with open(write_path + 'B_instances' + 'WSP.txt', 'w') as results:
        for i in picked_instances_all:
            results.write('picked instance ' + str(i) + ":")
            results.write(' True Label: ' + str(true_label[i]) +
                          ', Predicted label: ' + str(int(pred[i])) + '\n')
            results.write('\n')
            results.write('Sentence: ' + str(left_words[i]) + str(targets[i]) +
                          str(right_words[i]) + '\n')
            results.write('\n')
            results.write('coefs: ' + str(coefs[i]) + '\n')
            results.write('\n')
            results.write('k words: ' + str(all_words_k[i]) + '\n')
            results.write('\n')
            results.write('Neg coefs k: ' + str(neg_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Neu coefs k: ' + str(neu_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Pos coefs k: ' + str(pos_coefs_k[i]) + '\n')
            results.write('\n')

            results.write('target: ' + str(targets[i]) + '\n')
            results.write(
                '___________________________________________________________________'
                + '\n')
        results.write('\n')
        results.write('Hit Rate measure:' + '\n')
        results.write('Correct: ' + str(correct_hit) + ' hit rate: ' +
                      str(correct_hit / size) + '\n')
        results.write('\n')
        results.write('Fidelity All measure: ' + '\n')
        mean = np.mean(fidelity)
        std = np.std(fidelity)
        results.write('Mean: ' + str(mean) + '  std: ' + str(std))

    picked_instances_all = WSP(dict_I, all_words_k, sum_coefs_k, B, False)

    with open(write_path + 'B_instances' + 'SP.txt', 'w') as results:
        for i in picked_instances_all:
            results.write('picked instance ' + str(i) + ":")
            results.write(' True Label: ' + str(true_label[i]) +
                          ', Predicted label: ' + str(int(pred[i])) + '\n')
            results.write('\n')
            results.write('Sentence: ' + str(left_words[i]) + str(targets[i]) +
                          str(right_words[i]) + '\n')
            results.write('\n')
            results.write('coefs: ' + str(coefs[i]) + '\n')
            results.write('\n')
            results.write('k words: ' + str(all_words_k[i]) + '\n')
            results.write('\n')
            results.write('Neg coefs k: ' + str(neg_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Neu coefs k: ' + str(neu_coefs_k[i]) + '\n')
            results.write('\n')
            results.write('Pos coefs k: ' + str(pos_coefs_k[i]) + '\n')
            results.write('\n')

            results.write('target: ' + str(targets[i]) + '\n')
            results.write(
                '___________________________________________________________________'
                + '\n')
        results.write('\n')
        results.write('Hit Rate measure:' + '\n')
        results.write('Correct: ' + str(correct_hit) + ' hit rate: ' +
                      str(correct_hit / size) + '\n')
        results.write('\n')
        results.write('Fidelity All measure: ' + '\n')
        mean = np.mean(fidelity)
        std = np.std(fidelity)
        results.write('Mean: ' + str(mean) + '  std: ' + str(std))

    end = time.time()
    print('It took: ' + str(end - begin) + ' Seconds')
Example #25
    for adv in adv_accuracy:
        # init dataset for TESTING
        data = np.load('../data/{}_xs_mnist.npy'.format(adv))
        adv_data = np.load('../data/{}_advs_mnist.npy'.format(adv))
        labels = np.load('../data/{}_ys_mnist.npy'.format(adv))
        dataset = Dataset(data, adv_data, labels)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=128,
                                                 shuffle=True,
                                                 num_workers=1)

        for image, adv_img, label in dataloader:
            image = image.cuda()
            adv_img = adv_img.cuda()
            label = label.cuda()

            # get model output
            def_out, _, _, _ = model(adv_img)
            adv_out_class = classifier(def_out)

            # get model predicted class
            adversarial_class = torch.argmax(adv_out_class, 1)

            # update confusion matrix
            adv_accuracy[adv][(adversarial_class * 10 +
                               label).astype(int)] += 1

    output = np.zeros(100)
    for adv in adv_accuracy:
        output += adv_accuracy[adv]
    generate(output.tolist())
Example #26
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.ndimage.measurements import label
# from moviepy.editor import VideoFileClip
# from IPython.display import HTML
from loadImages import *
from featureFunctions import featureFunctions
import extractor
from drawWindows import *
import classifier
from heatMap import *
from classifier import classifier

# Load car and not car images

cars = loadImages("./vehicles/", "cars")
notcars = loadImages("./non-vehicles/", "notcars")

# We can save the trained classifier in a .pkl file and then retrieve it to train the classifier

if os.path.isfile("models/trained_model.p"):
    print("Model already present, retrieving to use it")
    train_model = False
else:
    print("Training a classifier using new images")
    train_model = True

# Train a classifier

c = classifier(cars, notcars, train_model)
c.classify()
Example #27
import classifier

class_dict = classifier("pet_images")

Example #28
def train_classifier(filename):
    import seaborn as sns
    import matplotlib.pyplot as plt

    # You can run this code to train your own classifier, but there is a provided pretrained one.
    # If you'd like to use this, just run "train_classifier(filename)"
    # to train and save a classifier on the label indices to that filename.

    # Target all the classes, so that's how many the classifier will learn
    label_indices = range(40)

    n_epochs = 3
    display_step = 500
    lr = 0.001
    beta_1 = 0.5
    beta_2 = 0.999
    image_size = 64

    transform = transforms.Compose([
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
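    # note: batch_size and device are assumed to be defined elsewhere in the surrounding script/notebook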

    dataloader = DataLoader(CelebA(".",
                                   split='train',
                                   download=True,
                                   transform=transform),
                            batch_size=batch_size,
                            shuffle=True)

    classifier = Classifier(n_classes=len(label_indices)).to(device)
    class_opt = torch.optim.Adam(classifier.parameters(),
                                 lr=lr,
                                 betas=(beta_1, beta_2))
    criterion = nn.BCEWithLogitsLoss()

    cur_step = 0
    classifier_losses = []
    # classifier_val_losses = []
    for epoch in range(n_epochs):
        # Dataloader returns the batches
        for real, labels in tqdm(dataloader):
            real = real.to(device)
            labels = labels[:, label_indices].to(device).float()

            class_opt.zero_grad()
            class_pred = classifier(real)
            class_loss = criterion(class_pred, labels)
            class_loss.backward()  # Calculate the gradients
            class_opt.step()  # Update the weights
            classifier_losses += [
                class_loss.item()
            ]  # Keep track of the average classifier loss

            ## Visualization code ##
            if cur_step % display_step == 0 and cur_step > 0:
                class_mean = sum(
                    classifier_losses[-display_step:]) / display_step
                print(f"Step {cur_step}: Classifier loss: {class_mean}")
                step_bins = 20
                x_axis = sorted([
                    i * step_bins
                    for i in range(len(classifier_losses) // step_bins)
                ] * step_bins)
                sns.lineplot(x_axis,
                             classifier_losses[:len(x_axis)],
                             label="Classifier Loss")
                plt.legend()
                plt.show()
                torch.save({"classifier": classifier.state_dict()}, filename)
            cur_step += 1
Example #29
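# p_y: per-class prior probabilities (20 classes), passed to the Naive Bayes classifier below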
p_y = np.array([
    0.04259472890229834, 0.05155736977549028, 0.05075871860857219,
    0.05208980388676901, 0.051024935664211554, 0.052533498979501284,
    0.051646108794036735, 0.052533498979501284, 0.052888455053687104,
    0.0527109770165942, 0.05306593309078002, 0.0527109770165942,
    0.05244475996095483, 0.0527109770165942, 0.052622237998047744,
    0.05315467210932647, 0.04836276510781791, 0.05004880646020055,
    0.04117490460555506, 0.033365870973467035
])

#read in the training data
print("reading the traing data...")
print("")
tm = training_matrix()

nb_c = classifier(tm, p_y)

#classifying... really slow because I'm a terrible programmer
print(
    "classifying... this may take a while on UNM machines. It takes like 15s on my personal laptop, but it took like 2 mins on campus"
)
print("")
#make p_x_given_y matrix
nb_c.makeMAPMatrix(alpha)
#classify
nb_c.classify()
print("accuracy: ")
print(nb_c.calculateAccuracy(True))

print("classifiers 100 most important words: ")
ent = entropy(nb_c, p_y, tm)
Example #30
def clustering(distance, FileList, sDstDirectory, threshold):
    classf = classifier(distance, FileList)
    classf.setThreshold(threshold)

    classf.clustering(SINGLE_LINKAGE)
    classf.separateFilesToClusters(sDstDirectory, 1)
Example #31
        create_set('MNIST', './data', lab_classes, 100, unlab_classes, 50)

        data = dataset(num_lab_classes, './data', 128)

        secondary_path = main_path + '/loop_' + str(i)

        if not os.path.exists(secondary_path):
            os.makedirs(secondary_path)

        #generator = FCGenerator(data.img_size_x, data.img_size_y,
        #                        data.img_size_z)

        #critic = FCCritic("Critic", data.img_size_x, data.img_size_y,
        #                  data.img_size_z, num_lab_classes + 1)

        network = FCCritic("classifier", data.img_size_x, data.img_size_y,
                           data.img_size_z, num_lab_classes + 1)

        classify = classifier(network=network, dataset=data, steps=15000)

        print("TRANING STARTED")
        temp = classify.call(secondary_path, main_path)
        del network
        del classify
        del data
        tf.compat.v1.reset_default_graph()

        print("TRAINING FINISHED")

    count_labels += 1
Example #32
File: utils.py  Project: ywz1993/oc-cnn
def OC_CNN(dataset, model_type, class_number, hyper_para):

	running_loss, inm, relu, mean, cov, imagenet_mean, imagenet_std, classifier = get_fuv(hyper_para, model_type)

	if(hyper_para.verbose==True):
		print('Loading dataset '+dataset+'...')

	train_data, test_data, test_label = load_dataset(dataset, class_number, imagenet_mean, imagenet_std, hyper_para)

	if(hyper_para.verbose==True):
		print(dataset+' dataset loaded.')

	no_train_data = np.shape(train_data.numpy())[0]
	no_test_data  = np.shape(test_data.numpy())[0]

	### choose one network which produces D dimensional features
	if(hyper_para.verbose==True):
		print('Loading network '+hyper_para.model_type+'...')
	
	model = choose_network(model_type, hyper_para.pre_trained_flag)

	if(hyper_para.verbose==True):
		print('Network '+hyper_para.model_type+' loaded.')

	running_cc = 0.0
	running_ls = 0.0

	if(hyper_para.gpu_flag):
		inm.cuda()
		relu.cuda()
		model.cuda()
		classifier.cuda()
	
	model.train()
	classifier.train()
	
	### optimizer for model training (for this work we restrict to only fine-tuning FC layers)
	if(model_type=='vggface'):
		model_optimizer      = optim.Adam(model[-5:].parameters(), lr=hyper_para.lr)
	else:
		model_optimizer      = optim.Adam(model.classifier.parameters(), lr=hyper_para.lr)
	classifier_optimizer = optim.Adam(classifier.parameters(), lr=hyper_para.lr)
	
	# loss functions
	cross_entropy_criterion = nn.CrossEntropyLoss()

	for i in range(int(hyper_para.iterations)):
	# for i in range(int(hyper_para.iterations*no_train_data/hyper_para.batch_size)):
		# print i
		rand_id = np.asarray(random.sample( range(no_train_data), int(hyper_para.batch_size)))
		rand_id = torch.from_numpy(rand_id)

		# get the inputs
		inputs = torch.autograd.Variable(train_data[rand_id].cuda()).float()
		
		# get the labels
		labels = np.concatenate( (np.zeros( (int(hyper_para.batch_size),) ), np.ones( (int(hyper_para.batch_size),)) ), axis=0)
		labels = torch.from_numpy(labels)
		labels = torch.autograd.Variable(labels.cuda()).long()
		
		gaussian_data = np.random.normal(0, hyper_para.sigma, (int(hyper_para.batch_size), hyper_para.D))
		gaussian_data = torch.from_numpy(gaussian_data)
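		# the Gaussian samples act as the negative class: the labels above mark CNN features as 0 and Gaussian noise as 1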

		# forward + backward + optimize
		out1 = model(AddNoise(inputs, hyper_para.sigma1))

		out1 = out1.view(int(hyper_para.batch_size), 1, hyper_para.D)
		out1 = inm(out1)
		out1 = out1.view(int(hyper_para.batch_size), hyper_para.D)
		out2 = torch.autograd.Variable(gaussian_data.cuda()).float()
		out  = torch.cat( (out1, out2),0)
		out  = relu(out)
		out  = classifier(out)
		
		# zero the parameter gradients
		model_optimizer.zero_grad()
		classifier_optimizer.zero_grad()
		 
		cc = cross_entropy_criterion(out, labels) 
		loss = cc
		
		loss.backward()

		model_optimizer.step()
		classifier_optimizer.step()
		
		# print statistics
		running_cc += cc.data
		running_loss += loss.data

		if(hyper_para.verbose==True):
			if (i % (hyper_para.stats_freq) == (hyper_para.stats_freq-1)):    # print every stats_frequency batches
				line = hyper_para.BLUE   + '[' + str(format(i+1, '8d')) + '/'+ str(format(int(hyper_para.iterations), '8d')) + ']' + hyper_para.ENDC + \
					hyper_para.GREEN  + ' loss: '     + hyper_para.ENDC + str(format(running_loss/hyper_para.stats_freq, '1.8f'))  + \
					hyper_para.GREEN  + ' cc: '     + hyper_para.ENDC + str(format(running_cc/hyper_para.stats_freq, '1.8f'))
				print(line)
				running_loss = 0.0
				running_cc = 0.0
			
	classifier.eval()
	model.eval()
	relu.eval()

	area_under_curve, train_features, test_scores, test_features = choose_classifier(dataset, class_number, model_type, model, classifier, hyper_para.D, hyper_para, train_data, test_data, test_label, no_train_data, no_test_data, inm, relu, imagenet_mean, imagenet_std)  # the original passed an undefined name D; hyper_para.D matches its use inside choose_classifier

	classifier.cpu()
	model.cpu()
	relu.cpu()
	
	torch.save(model,'../../save_folder/saved_models/'+dataset+'/model/'+str(class_number)+'/'+model_type +'_'+
																				str(hyper_para.iterations)+'_'+
																				str(hyper_para.lr)		  +'_'+
																				str(hyper_para.sigma)	  +'.pth')
	
	torch.save(classifier,'../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number)+'/'+model_type +'_'+  # presumably the classifier, not the model again, was meant to be saved under .../classifier/
																					 str(hyper_para.iterations)+'_'+
																					 str(hyper_para.lr)		   +'_'+
																					 str(hyper_para.sigma)     +'.pth')

	scipy.io.savemat('../../save_folder/results/'+dataset+'/'+ str(class_number)  +'/'+ model_type	+'_OCCNN123_'+
							 str(hyper_para.iterations)  +'_'+ str(hyper_para.lr) +'_'+ str(hyper_para.sigma)	 +'.mat',
								{'auc':area_under_curve, 'train_features':train_features, 'test_scores':test_scores,
														 'test_features':test_features,   'test_label':test_label    })

	if(hyper_para.verbose==True):
		print('model, classifier, features and results saved.')

	return area_under_curve
Example #33
    + "\n and was trained for " + str(best_epoch) + " epoches with batch size = " + str(best_b_size) + " and drop out percentage = " + str(best_drop_perc)  )

if __name__ == '__main__':
    # classifier tuning parameters
    neighbors = [1,3,5,7,10,15]
    slack = [1,.1,.2,.25,2,5,10,20]
    estimators = [50,100,200]
    # define net tuning parameters
    # drop_perc: vector of length 5 defining the dropout percentage at every layer
    level1_drop = [.2,.25,.28,.3]
    level2_drop = [.2,.25,.28,.3]
    level3_drop = [.5,.3,.35,.4]
    level4_drop = [.3,.35,.4,.45]
    level5_drop = [.35,.4,.45,.5]  # the original listed 5, presumably a typo for .5 (dropout must be below 1)
    # get all combinations 
    drop_percs = [level1_drop,level2_drop,level3_drop,level4_drop,level5_drop]
    drop_percs = list(itertools.product(*drop_percs))
    batch_sizes = [16,32,64,128]
    epochs = [32,65,100,150]
    num_classes = 10
    valid_size = .2
    dropout = True
    data_loader = get_dataset()
    cnn_ds, ds = data_loader.data_preproc(use_pca=0)
    
    clas = classifier()
    # call CNN
    CNN_best_perform(drop_percs,batch_sizes,epochs)
    # call the rest of classifiers
    clas.classify(ds,neighbors = neighbors,slack = slack,estimators = estimators)