Example #1
    def __init__(self):
        super(Brisk, self).__init__()
        self.config = Config().config
        self.logger = Logger().logger
        self.proxy_manager = ProxyPool()
        self.db = DB().db

        self.__proxy_status = self.config.get('PROXY', 'empty')
        if self.__proxy_status == 'TRUE':
            self.proxy_manager.delete_valid_proxy()

        self.__hook_name = 'hook'
        self.__walk_name = 'walk'
        self.__flow_name = 'flow'

        self.__brisk_type = self.config.get('RUN', 'type')

        self.__func_filter = lambda m: not m.startswith("__") and \
                                       not m.startswith(self.__hook_name) and \
                                       not m.startswith(self.__walk_name) and \
                                       not m.startswith(self.__flow_name)

        self.__flow_num = int(self.config.get('RUN', 'num'))
        self.__hook = None
        self.__flow_queue = queue.Queue()
        self.__walk_queue = queue.Queue()
        self.__go_init()
Example #2
    def __init__(self, queue=None):
        Request.__init__(self)
        threading.Thread.__init__(self)
        self.logger = Logger().logger
        self.config = Config().config
        self.proxy_pool = ProxyPool()
        self.__queue = queue
        self.db = DB().db
        self.proxy_ip, self.proxy_port = None, None
        self.proxy()
Example #3
class Core(Request, threading.Thread):

    def __init__(self, queue=None):
        Request.__init__(self)
        threading.Thread.__init__(self)
        self.logger = Logger().logger
        self.config = Config().config
        self.proxy_pool = ProxyPool()
        self.__queue = queue
        self.db = DB().db
        self.proxy_ip, self.proxy_port = None, None
        self.proxy()

    def proxy(self):
        if self.config.get('PROXY', 'use') == 'TRUE':
            if not self.proxy_ip:
                self.proxy_ip, self.proxy_port = self.proxy_pool.get_proxy(
                        type=self.config.get('PROXY', 'type'), 
                        seed_num=int(self.config.get('PROXY', 'seed_num')),
                        distinct=self.config.get('PROXY', 'distinct') == 'TRUE'
                        )
            else:
                self.proxy_pool.delete_proxy(ip=self.proxy_ip, type=self.config.get('PROXY', 'type'))
                self.proxy_ip, self.proxy_port = self.proxy_pool.get_proxy(
                        type=self.config.get('PROXY', 'type'),
                        seed_num=int(self.config.get('PROXY', 'seed_num')),
                        distinct=self.config.get('PROXY', 'distinct') == 'TRUE'
                        )
        else:
            self.proxy_ip, self.proxy_port = None, None

    @use_proxy
    @load_params
    def get(self, url, params=None, **kwargs):
        return super().get(url, params, **kwargs)

    @use_proxy
    @load_params
    def post(self, url, data, **kwargs):
        return super().post(url, data, **kwargs)

    def task(self):
        pass

    def run(self):
        self.logger.info('go')
        try:
            self.task()
            self.logger.info('ok')
        except Exception as e:
            self.logger.info('something wrong: {}'.format(e))
        finally:
            if self.__queue:
                assert isinstance(self.__queue, queue.Queue)
                self.__queue.task_done()
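The use_proxy and load_params decorators applied to get/post above are not defined in this example. A minimal sketch of what they might look like, assuming use_proxy injects the instance's current proxy into each request and load_params fills in instance-level defaults (both behaviors are assumptions, not the original implementation):

import functools


def use_proxy(func):
    # Assumed behavior: route the request through the instance's
    # current proxy, if one is set.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if self.proxy_ip and self.proxy_port:
            proxy_url = 'http://{}:{}'.format(self.proxy_ip, self.proxy_port)
            kwargs.setdefault('proxies', {'http': proxy_url, 'https': proxy_url})
        return func(self, *args, **kwargs)
    return wrapper


def load_params(func):
    # Assumed behavior: apply default request parameters (e.g. a timeout)
    # unless the caller overrides them.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        kwargs.setdefault('timeout', 10)
        return func(self, *args, **kwargs)
    return wrapper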
Example #4
import pandas as pd

from scipy import sparse

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from src.runner import Runner
from src.util import Logger
from src.model_NB import ModelMultinomialNB

logger = Logger()


def makefig(result):
    sns.set_style("whitegrid")
    ax = sns.boxenplot(data=result, width=0.4)
    ax.set_ylabel('Accuracy', size=14)
    ax.tick_params(labelsize=14)
    # NAME is assumed to be defined at module scope (the experiment name).
    plt.savefig(f'../model/tuning/{NAME}-NB.png', dpi=300)


if __name__ == '__main__':
    base_params = {'alpha': 1.0, 'fit_prior': True, 'class_prior': None}
    params_NB = dict(base_params)
    param_grid_ = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
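The GridSearchCV, f1_score, and make_scorer imports above suggest the tuning step that follows this fragment. A minimal sketch of how they would typically be wired together; X and y are placeholders for the prepared features and labels, and using sklearn's MultinomialNB directly (rather than the project's ModelMultinomialNB wrapper) is an assumption:

from sklearn.naive_bayes import MultinomialNB

# Score candidates by macro-averaged F1 and exhaustively search alpha.
scorer = make_scorer(f1_score, average='macro')
gs = GridSearchCV(MultinomialNB(**base_params), param_grid_,
                  scoring=scorer, cv=5)
# gs.fit(X, y)          # X, y: prepared feature matrix and labels
# print(gs.best_params_, gs.best_score_)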
Example #5
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
from src.model_SVC import ModelSVC
from scipy import sparse

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

from src.util import Logger
from src.runner import Runner

logger = Logger()

if __name__ == '__main__':
    params = {'kernel': 'linear', 'gamma': 0.001}
    params_SVC = dict(params)

    param_grid_ = [{
        'n_components': [10, 30, 50, 100],
        'n_iter': [8, 16],
        'C': [1, 10, 100, 1000]
    }, {
        'apply_svd': [False],
        'C': [1, 10, 100, 1000]
    }]

    feature = [["mfcc", "delta", "power"]]
Example #6
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import LeaveOneGroupOut
from typing import Callable, List, Optional, Tuple, Union
from sklearn.model_selection import learning_curve
from scipy import sparse
from scipy.sparse import load_npz
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy

from src.model import Model
from src.util import Logger, Util

sns.set()
logger = Logger()


class RunnerLeaveOneOut:
    def __init__(self, run_name: str, model_cls: Callable[[str, dict], Model],
                 features: List[str], params: dict):
        """Constructor
        :param run_name: name of the run
        :param model_cls: model class
        :param features: list of features
        :param params: hyperparameters
        """
        self.run_name = run_name
        self.model_cls = model_cls
        self.features = features
        self.params = params
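Only the constructor is shown; given the LeaveOneGroupOut and log_loss imports, a run loop along these lines seems plausible (a sketch, assuming x, y, and groups are numpy arrays and that Model exposes train/predict as in the other examples):

def run_logo_cv(runner: 'RunnerLeaveOneOut', x, y, groups) -> List[float]:
    # Hold out one group per fold and score with log loss.
    logo = LeaveOneGroupOut()
    scores = []
    for tr_idx, va_idx in logo.split(x, y, groups=groups):
        model = runner.model_cls(runner.run_name, runner.params)
        model.train(x[tr_idx], y[tr_idx], x[va_idx], y[va_idx])
        va_pred = model.predict(x[va_idx])
        scores.append(log_loss(y[va_idx], va_pred))
    return scores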
Example #7
    def __init__(self):
        super(ProxyPool, self).__init__()
        self.config = Config().config
        self.default_db = DB().default_db
        self.proxy_db = DB().proxy_db
        self.logger = Logger().logger
Example #8
class ProxyPool(Request):
    _instance_lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        if not hasattr(ProxyPool, "_instance"):
            with ProxyPool._instance_lock:
                if not hasattr(ProxyPool, "_instance"):
                    ProxyPool._instance = object.__new__(cls)
        return ProxyPool._instance

    def __init__(self):
        super(ProxyPool, self).__init__()
        self.config = Config().config
        self.default_db = DB().default_db
        self.proxy_db = DB().proxy_db
        self.logger = Logger().logger

    def _fetch_proxy_http_1(self):
        url = 'http://www.ip3366.net/free/?stype=1'
        soup = self.get_soup(url)
        for tr in soup.find_all('tr')[1:]:
            ip, port = map(lambda x: x.text, tr.find_all('td')[0:2])
            self.default_db.hset('HTTP', ip, port)

    def _fetch_proxy_http_2(self):
        url = 'https://www.xicidaili.com/wt/'
        soup = self.get_soup(url)
        for tr in soup.find_all('tr')[1:]:
            ip, port = map(lambda x: x.text, tr.find_all('td')[1:3])
            self.default_db.hset('HTTP', ip, port)

    def _fetch_proxy_https_1(self):
        url = 'http://www.ip3366.net/free/?stype=2'
        html = self.get(url)
        soup = self.res2soup(html)
        for tr in soup.find_all('tr')[1:]:
            ip, port = map(lambda x: x.text, tr.find_all('td')[0:2])
            self.default_db.hset('HTTPS', ip, port)

    def _fetch_proxy_https_2(self):
        url = 'https://www.xicidaili.com/wn/'
        soup = self.get_soup(url)
        for tr in soup.find_all('tr')[1:]:
            ip, port = map(lambda x: x.text, tr.find_all('td')[1:3])
            self.default_db.hset('HTTPS', ip, port)

    @pool_lock
    def delete_backup_proxy(self):
        _ = [
            self.default_db.hdel('HTTP', key)
            for key in self.default_db.hkeys(name='HTTP')
        ]
        _ = [
            self.default_db.hdel('HTTPS', key)
            for key in self.default_db.hkeys(name='HTTPS')
        ]

    @pool_lock
    def delete_valid_proxy(self):
        _ = [self.proxy_db.delete(key) for key in self.proxy_db.keys()]

    @pool_lock
    def _delete_all_proxy(self):
        _ = [
            self.default_db.hdel('HTTP', key)
            for key in self.default_db.hkeys(name='HTTP')
        ]
        _ = [
            self.default_db.hdel('HTTPS', key)
            for key in self.default_db.hkeys(name='HTTPS')
        ]
        _ = [self.proxy_db.delete(key) for key in self.proxy_db.keys()]

    def _fetch_proxy(self, type):
        if type == 'HTTP':
            self._fetch_proxy_http_1()
            self._fetch_proxy_http_2()
        if type == 'HTTPS':
            self._fetch_proxy_https_1()
            self._fetch_proxy_https_2()

    def _check_proxy(self, ip, port, timeout=3):
        if not ip or not port:
            return False
        http_url = 'http://{}:{}'.format(ip, port)
        https_url = 'https://{}:{}'.format(ip, port)
        try:
            self.get(url="http://icanhazip.com/",
                     timeout=timeout,
                     proxies={
                         'http': http_url,
                         'https': https_url
                     })
            return True
        except Exception:
            return False

    def _get_backup_proxy(self, type='HTTPS'):
        import random
        ip = random.choice(self.default_db.hkeys(type))
        port = self.default_db.hget(type, ip)
        return ip, port

    def _get_valid_proxy(self, type='HTTPS', seed_num=1):
        for _ in range(len(self.default_db.hkeys(type))):
            ip, port = self._get_backup_proxy(type)
            if self._check_proxy(ip, port):
                self.logger.info('backup ip {} valid, fetch'.format(ip))
                self.proxy_db.set(ip, port, ex=3600)
                if len(self.proxy_db.keys()) >= seed_num:
                    break
            else:
                self.logger.info('backup ip {} invalid, delete'.format(ip))
                self.default_db.hdel(type, ip)
        else:
            self.logger.info('not enough valid backup ips (wanted {})'.format(seed_num))

    @pool_lock
    def get_proxy(self, type='HTTPS', seed_num=1, distinct=False):
        if len(self.proxy_db.keys()) == 0:
            self.logger.info('no valid proxy in proxy pool')
            if not self.default_db.exists(type) or not len(
                    self.default_db.hkeys(type)):
                self.logger.info(
                    'no backup {} proxy, fetch from web'.format(type))
                self._fetch_proxy(type)
                self.logger.info('fetch ok, {} items in total'.format(
                    len(self.default_db.hkeys(type))))
            self.logger.info('fetch valid proxy from backup proxy pool')
            self._get_valid_proxy(type, seed_num)
            assert len(self.proxy_db.keys()) != 0
            self.logger.info('fetch valid proxy ok')

        ip = self.proxy_db.keys()[0]
        port = self.proxy_db.get(ip)
        if distinct:
            self.proxy_db.delete(ip)
            self.default_db.hdel(type, ip)
        return ip, port

    @pool_lock
    def delete_proxy(self, ip, type='HTTPS'):
        self.proxy_db.delete(ip)
        self.default_db.hdel(type, ip)
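The pool_lock decorator used throughout ProxyPool is not defined in this example. A plausible sketch, assuming it simply serializes pool mutations behind a shared lock:

import functools
import threading

_pool_lock = threading.Lock()


def pool_lock(func):
    # Assumed behavior: one thread at a time may mutate the pool.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with _pool_lock:
            return func(*args, **kwargs)
    return wrapper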
Example #9
from hyperopt import fmin, tpe, STATUS_OK, Trials

# Split the training data into training and validation sets
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns

from src.util import Logger
from src.model_MLP import ModelMLP
from src.runner import Runner

import gc
gc.collect()
logger = Logger()


# Objective function
def objective(params):
    global base_params
    # Update the base parameters with the search parameters
    base_params.update(params)

    # Create the model object & train
    model = ModelMLP("MLP", **base_params)
    model.train(tr_x, tr_y, va_x, va_y)

    # Predict
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    # Return in the dict form hyperopt's fmin expects
    return {'loss': score, 'status': STATUS_OK}
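The fragment stops after computing the validation score; with hyperopt, the objective is then handed to fmin. A sketch of that wiring, where param_space and the hidden_units key are illustrative placeholders, not the original search space:

from hyperopt import hp

# Illustrative search space; the real one is not shown in this example.
param_space = {'hidden_units': hp.choice('hidden_units', [32, 64, 128])}
trials = Trials()
# best = fmin(fn=objective, space=param_space, algo=tpe.suggest,
#             max_evals=50, trials=trials)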
Example #10
def main(args):

    np.random.seed(1)
    random.seed(1)
    ray.init(num_cpus=20)

    output_path = args.output_path
    data_dir = args.data_dir

    ## Select dataset
    dataset = args.dataset

    # ###  Clustering  options

    cluster_option = args.cluster_option
    
    ## plotting options
    plot_option_clusters_vs_lambda = args.plot_option_clusters_vs_lambda
    plot_option_fairness_vs_clusterE = args.plot_option_fairness_vs_clusterE
    plot_option_balance_vs_clusterE = args.plot_option_balance_vs_clusterE
    plot_option_convergence = args.plot_option_convergence
    
    # ###  Data load
    savepath_compare = osp.join(data_dir, dataset + '.npz')

    if not os.path.exists(savepath_compare):
        X_org, demograph, K = read_dataset(dataset)
        np.savez(savepath_compare,X_org = X_org, demograph = demograph, K = K)
    else:
        datas = np.load(savepath_compare)
        X_org = datas['X_org']
        demograph = datas['demograph']
        K = datas['K'].item()

    log_path = osp.join(data_dir, dataset + '_' + cluster_option + '_log.txt')
    sys.stdout = Logger(log_path)

    print('Cluster number for dataset {} is {}'.format(dataset, K))
    V_list = [np.array(demograph == j) for j in np.unique(demograph)]
    V_sum = [x.sum() for x in V_list]
    print('Balance of the dataset {}'.format(min(V_sum) / max(V_sum)))
#    J = len(V_sum)
    N,D = X_org.shape
    
    
    # demographic probability for each V_j
    
    u_V = [x/N for x in V_sum]  #proportional
    
    # Normalize Features
    
    X = normalizefea(X_org)
    
    #############################################################################
    
    ######################## Run Fair clustering #################################
    
    #############################################################################
    #    
    fairness = True # Setting False only runs unfair clustering
    
    elapsetimes = []
    avg_balance_set = []
    min_balance_set = []
    fairness_error_set = []
    E_cluster_set = []
    E_cluster_discrete_set = []
    bestacc = 1e10
    best_avg_balance = -1
    best_min_balance = -1
    
    if args.lmbda is None:      
        lmbdas = np.arange(45,50,2).tolist()
    else:
        lmbdas = [args.lmbda]
        
    length_lmbdas = len(lmbdas)
    
    l = None


    if 'A' not in locals() and cluster_option == 'ncut':
        alg_option = 'flann' if N>50000 else 'None'
        affinity_path = osp.join(data_dir,dataset+'_affinity_ncut_final.npz')
        knn = 20
        if not os.path.exists(affinity_path):
            A = util.create_affinity(X,knn,savepath = affinity_path, alg=alg_option)
        else:
            A = util.create_affinity(X,knn,W_path = affinity_path)

    
    init_C_path = osp.join(data_dir,dataset+'_init_{}_{}_final.npy'.format(cluster_option,K))
    
    for count,lmbda in enumerate(lmbdas):
        print('Inside Lambda ',lmbda)
    
        if not os.path.exists(init_C_path):
            print('Generating initial seeds')
            C_init,_ = km_init(X,K,'kmeans_plus')
            np.save(init_C_path,C_init)
            
        else:
            
            C_init = np.load(init_C_path) # Load initial seeds
    
            
        if cluster_option == 'ncut':
            
            C,l,elapsed,S,E = fair_clustering(X, K, u_V, V_list, lmbda, fairness, cluster_option, C_init, A = A)
            
        else:
            
            C,l,elapsed,S,E = fair_clustering(X, K, u_V, V_list, lmbda, fairness, cluster_option, C_init)
      
        min_balance, avg_balance = get_fair_accuracy(u_V,V_list,l,N,K)
        fairness_error = get_fair_accuracy_proportional(u_V,V_list,l,N,K)
    
        print('lambda = {}, \n fairness_error {: .2f} and \n avg_balance = {: .2f} \n min_balance = {: .2f}'.format(lmbda, fairness_error, avg_balance, min_balance))
    
            
        # Plot the figure with clusters
        
        if dataset in ['Synthetic', 'Synthetic-unequal'] and plot_option_clusters_vs_lambda:
            
            filename = osp.join(output_path, 'cluster_output', 'cluster-plot_fair_{}-{}_lambda_{}.png'.format(cluster_option,dataset,lmbda))
            plot_clusters_vs_lambda(X_org,l,filename,dataset,lmbda, min_balance_set, avg_balance_set,fairness_error)
        if avg_balance > best_avg_balance:
            best_avg_balance = avg_balance
            best_lambda_avg_balance = lmbda

        if min_balance > best_min_balance:
            best_min_balance = min_balance
            best_lambda_min_balance = lmbda

        if fairness_error < bestacc:
            bestacc = fairness_error
            best_lambda_acc = lmbda
            
            
        if plot_option_convergence:
            
            filename = osp.join(output_path,'Fair_{}_convergence_{}.png'.format(cluster_option,dataset))
            E_fair = E['fair_cluster_E']
            plot_convergence(cluster_option, filename, E_fair)
    
    
    
        print('Best fairness_error %0.4f' %bestacc,'|Error lambda = ', best_lambda_acc)
        print('Best  Avg balance %0.4f' %best_avg_balance,'| Avg Balance lambda = ', best_lambda_avg_balance)
        print('Best  Min balance %0.4f' %best_min_balance,'| Min Balance lambda = ', best_lambda_min_balance)
        elapsetimes.append(elapsed)
        avg_balance_set.append(avg_balance)
        min_balance_set.append(min_balance)
        fairness_error_set.append(fairness_error)
        E_cluster_set.append(E['cluster_E'][-1])
        E_cluster_discrete_set.append(E['cluster_E_discrete'][-1])
    
        
    avgelapsed = sum(elapsetimes)/len(elapsetimes)
    print('avg elapsed ', avgelapsed)
    ray.shutdown()
    
    
    if plot_option_fairness_vs_clusterE and length_lmbdas > 1:
    

        savefile = osp.join(data_dir,'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(cluster_option,dataset))
        filename = osp.join(output_path,'Fair_{}_fairness_vs_clusterEdiscrete_{}.png'.format(cluster_option,dataset))
        plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, min_balance_set, avg_balance_set, E_cluster_discrete_set)

    if plot_option_balance_vs_clusterE and length_lmbdas > 1:

        savefile = osp.join(data_dir,'Fair_{}_balance_vs_clusterEdiscrete_{}.npz'.format(cluster_option,dataset))
        filename = osp.join(output_path,'Fair_{}_balance_vs_clusterEdiscrete_{}.png'.format(cluster_option,dataset))

        plot_balance_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, min_balance_set, avg_balance_set, E_cluster_discrete_set)
Example #11
def main(args, k):
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    ## Options
    dataset = args.dataset
    cluster_option = args.cluster_option
    data_dir = osp.join(args.data_dir, dataset)
    output_path = data_dir
    if not osp.exists(data_dir):
        os.makedirs(data_dir)

    ## plotting options
    plot_option_clusters_vs_lambda = args.plot_option_clusters_vs_lambda
    plot_option_fairness_vs_clusterE = args.plot_option_fairness_vs_clusterE
    plot_option_balance_vs_clusterE = args.plot_option_balance_vs_clusterE
    plot_option_convergence = args.plot_option_convergence
    plot_option_K_vs_clusterE = args.plot_option_K_vs_clusterE


    # ###  Data load
    dir_path = osp.join(data_dir, cluster_option + "_" + str(k))
    savepath_compare = osp.join(dir_path, dataset + '.npz')
    if not os.path.exists(savepath_compare):
        X_org, demograph, K = read_dataset(dataset, data_dir, k)
        # Ensure the output directory exists before either save path runs.
        if not osp.exists(dir_path):
            os.makedirs(dir_path)
        if X_org.shape[0] > 200000:
            np.savez_compressed(savepath_compare, X_org=X_org, demograph=demograph, K=K)
        else:
            np.savez(savepath_compare, X_org=X_org, demograph=demograph, K=K)

    else:
        datas = np.load(savepath_compare)
        X_org = datas['X_org']
        demograph = datas['demograph']
        K = datas['K'].item()


    log_path = osp.join(data_dir,cluster_option + '_iot_log_' + str(k) + '.txt')
    sys.stdout = Logger(log_path)
    # Scale and Normalize Features
    X_org = scale(X_org, axis = 0)
    X = normalizefea(X_org)

    N, D = X.shape
    print('Cluster number for dataset {} is {}'.format(dataset,K))
    V_list = [np.array(demograph == j) for j in np.unique(demograph)]
    V_sum = [x.sum() for x in V_list]
    print('Balance of the dataset {}'.format(min(V_sum) / max(V_sum)))

    print('Number of points in the dataset {}'.format(N))
#    J = len(V_sum)


    # demographic probability for each V_j

    u_V = [x/N for x in V_sum]  #proportional
    print('Demographic-probabilites: {}'.format(u_V))
    print('Demographic-numbers per group: {}'.format(V_sum))

    #############################################################################

    ######################## Run Fair clustering #################################

    #############################################################################
    #
    fairness = True # Setting False only runs unfair clustering

    elapsetimes = []
    avg_balance_set = []
    min_balance_set = []
    fairness_error_set = []
    E_cluster_set = []
    E_cluster_discrete_set = []
    bestacc = 1e10
    best_avg_balance = -1
    best_min_balance = -1

    if args.lmbda_tune:
        print('Lambda tune is true')
        lmbdas = np.arange(0,10000,100).tolist()
    else:
        lmbdas = [args.lmbda]

    length_lmbdas = len(lmbdas)

    l = None

    dir_path = osp.join(data_dir, cluster_option + "_" + str(k))
    if 'A' not in locals() and cluster_option == 'ncut':
        alg_option = 'flann' if N>50000 else 'None'
        affinity_path = osp.join(dir_path, dataset +'_affinity_ncut.npz')
        knn = 20
        if not osp.exists(affinity_path):
            A = util.create_affinity(X,knn,savepath = affinity_path, alg=alg_option)
        else:
            A = util.create_affinity(X,knn,W_path = affinity_path)

    init_C_path = osp.join(dir_path,'{}_init_{}_{}.npz'.format(dataset,cluster_option,K))
    if not osp.exists(init_C_path):
        print('Generating initial seeds')
        C_init,l_init = km_init(X,K,'kmeans_plus')
        np.savez(init_C_path, C_init = C_init, l_init = l_init)

    else:
        temp = np.load(init_C_path)
        C_init = temp['C_init']  # Load initial seeds
        l_init = temp['l_init']

    for count,lmbda in enumerate(lmbdas):

        print('Inside Lambda ',lmbda)

        if cluster_option == 'ncut':

            C,l,elapsed,S,E = fair_clustering(X, K, u_V, V_list, lmbda, fairness, cluster_option, C_init = C_init, l_init =l_init,  A = A)

        else:

            C,l,elapsed,S,E = fair_clustering(X, K, u_V, V_list, lmbda, fairness, cluster_option, C_init=C_init, l_init=l_init)

        min_balance, avg_balance = get_fair_accuracy(u_V,V_list,l,N,K)
        fairness_error = get_fair_accuracy_proportional(u_V,V_list,l,N,K)

        print('lambda = {}, \n fairness_error {: .2f} and \n avg_balance = {: .2f} \n min_balance = {: .2f}'.format(lmbda, fairness_error, avg_balance, min_balance))


        # Plot the figure with clusters

        if dataset in ['Synthetic', 'Synthetic-unequal', 'Sensor'] and plot_option_clusters_vs_lambda:
            cluster_plot_location = osp.join(output_path, 'cluster_output')
            if not osp.exists(cluster_plot_location):
                os.makedirs(cluster_plot_location)

            filename = osp.join(cluster_plot_location, 'cluster-plot_fair_{}-{}_lambda_{}.png'.format(cluster_option,dataset,lmbda))
            plot_clusters_vs_lambda(X_org, demograph, l, filename, dataset, lmbda, fairness_error, cluster_option)
        if avg_balance > best_avg_balance:
            best_avg_balance = avg_balance
            best_lambda_avg_balance = lmbda

        if min_balance > best_min_balance:
            best_min_balance = min_balance
            best_lambda_min_balance = lmbda

        if fairness_error < bestacc:
            bestacc = fairness_error
            best_lambda_acc = lmbda


        if plot_option_convergence and count == 0:

            filename = osp.join(output_path,'Fair_{}_convergence_{}.png'.format(cluster_option,dataset))
            E_fair = E['fair_cluster_E']
            plot_convergence(cluster_option, filename, E_fair)


        print('Best fairness_error %0.4f' %bestacc,'|Error lambda = ', best_lambda_acc)
        print('Best  Avg balance %0.4f' %best_avg_balance,'| Avg Balance lambda = ', best_lambda_avg_balance)
        print('Best  Min balance %0.4f' %best_min_balance,'| Min Balance lambda = ', best_lambda_min_balance)
        elapsetimes.append(elapsed)
        avg_balance_set.append(avg_balance)
        min_balance_set.append(min_balance)
        fairness_error_set.append(fairness_error)
        E_cluster_set.append(E['cluster_E'][-1])
        E_cluster_discrete_set.append(E['cluster_E_discrete'][-1])

    avgelapsed = sum(elapsetimes)/len(elapsetimes)
    print('avg elapsed ', avgelapsed)

    if plot_option_fairness_vs_clusterE and length_lmbdas > 1:


        savefile = osp.join(dir_path,'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(cluster_option,dataset))
        filename = osp.join(output_path,'Fair_{}_fairness_vs_clusterEdiscrete_{}.png'.format(cluster_option,dataset))
        plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, min_balance_set, avg_balance_set, E_cluster_discrete_set)

    if plot_option_balance_vs_clusterE and length_lmbdas > 1:

        savefile = osp.join(dir_path,'Fair_{}_balance_vs_clusterEdiscrete_{}.npz'.format(cluster_option,dataset))
        filename = osp.join(output_path,'Fair_{}_balance_vs_clusterEdiscrete_{}.png'.format(cluster_option,dataset))

        plot_balance_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, min_balance_set, avg_balance_set, E_cluster_discrete_set)

    if plot_option_K_vs_clusterE and length_lmbdas > 1:

        savefile = osp.join(dir_path, 'Fair_{}_K_vs_clusterE_{}.npz'.format(cluster_option, dataset))
        filename = osp.join(output_path, 'Fair_{}_K_vs_clusterE_{}.png'.format(cluster_option, dataset))

        # TODO: set the correct set of params
        plot_K_vs_clusterE(cluster_option, savefile, filename, range(K), E_cluster_set, E_cluster_discrete_set, save=True)
Example #12
def main(args):

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
    # pdb.set_trace()
    dataset = args.dataset
    data_dir = './data/'

    # SLK options BO/MS/Means
    SLK_option = args.SLK_option

    #  Save?
    mode_images = args.mode_images #  save mode images in a directory?
    saveresult = args.saveresult  #  save results?

    log_path = os.path.join(data_dir,SLK_option+'_'+dataset+'_log_.txt')
    sys.stdout = Logger(log_path)

    #   Give data matrix in samples by feature format ( N X D)
    X, gnd_labels, K, sigma, X_org, knn = read_dataset(dataset,data_dir)

    # Normalize Features
    X = normalizefea(X)

    N, D = X.shape

    #####Validation Set for tuning lambda and initial K-means++ seed.
    #### However you can set value of lambda and initial seed empirically and skip validation set #######

    val_path = data_dir + dataset + '_val_set.npz'

    if not os.path.exists(val_path):
        X_val,gnd_val,val_ind,imbalance = util.validation_set(X,gnd_labels,K,0.1)
        np.savez(val_path, X_val = X_val, gnd_val = gnd_val, val_ind = val_ind)
    else:
        data_val = np.load(val_path)
        X_val = data_val['X_val']
        gnd_val = data_val['gnd_val']
        val_ind = data_val['val_ind']

    ##    # Build the knn kernel
    start_time = timeit.default_timer()

    aff_path = data_dir + 'W_'+str(knn)+'_'+ dataset+'.npz'
    alg = None
    if N>100000:
        alg = "flann"

    if not os.path.exists(aff_path):
        W = util.create_affinity(X, knn, scale = None, alg = alg, savepath = aff_path, W_path = None)
    else:
        W = util.create_affinity(X, knn, W_path = aff_path)

    elapsed = timeit.default_timer() - start_time
    print(elapsed)

    ###### Run SLK#################################

    bound_ = args.bound # Setting False only runs K-modes
    bound_it = 1000

    if sigma is None:
        sigma = util.estimate_sigma(X,W,knn,N)
    #        sigma = util.estimate_median_sigma(X,knn) # Or this

    # Initial seed path from kmeans++ seed
    init_C_path = data_dir+dataset+'_C_init.npy'
    if not os.path.exists(init_C_path):
        C_init,_ = km_init(X,K,'kmeans_plus')
        np.save(init_C_path,C_init)
    else:
        C_init = np.load(init_C_path) # Load initial seeds

    if args.lmbda_tune:
        lmbdas = np.arange(0.1,10,0.3).tolist()
    else:
        lmbdas = [args.lmbda]

    if args.lmbda_tune:
        elapsetimes = []
        bestnmi = -1
        bestacc = -1
        t = len(lmbdas)
        trivial = [0]*t # Take count on any missing cluster

        for count,lmbda in enumerate(lmbdas):
            print('Inside Lambda ',lmbda)
            print('Inside Sigma ',sigma)

            if N<=5000:
                _,l,elapsed,mode_index,z,_,ts = SLK_iterative(X, sigma, K, W, bound_, SLK_option, C_init,
                                                                   bound_lambda = lmbda, bound_iterations=bound_it)
            else:
                _,l,elapsed,mode_index,z,_,ts = SLK(X, sigma, K, W, bound_, SLK_option, C_init,
                                                         bound_lambda = lmbda, bound_iterations=bound_it)

            if ts:
                trivial[count] = 1
                continue

            # Evaluate the performance on validation set
            current_nmi = nmi(gnd_val,l[val_ind])
            acc,_ = get_accuracy(gnd_val,l[val_ind])

            print('lambda = ',lmbda, ' : NMI= %0.4f' %current_nmi)
            print('accuracy %0.4f' %acc)

            if current_nmi>bestnmi:
                bestnmi = current_nmi
                best_lambda_nmi = lmbda

            if acc>bestacc:
                bestacc = acc
                best_lambda_acc = lmbda

            print('Best result: NMI= %0.4f' %bestnmi,'|NMI lambda = ', best_lambda_nmi)
            print('Best Accuracy %0.4f' %bestacc,'|Acc lambda = ', best_lambda_acc)
            elapsetimes.append(elapsed)

        avgelapsed = sum(elapsetimes)/len(elapsetimes)
        print('avg elapsed ', avgelapsed)
    else:
        best_lambda_acc = args.lmbda

    ### Run with best Lambda and assess accuracy over whole dataset
    best_lambda = best_lambda_acc # or best_lambda_nmi
    if N>=5000:
        C,l,elapsed,mode_index,z,_,_ = SLK(X,sigma,K,W,bound_,SLK_option,C_init,
                                                bound_lambda = best_lambda, bound_iterations=bound_it)
    else:
        C,l,elapsed,mode_index,z,_,_ = SLK_iterative(X,sigma,K,W,bound_,SLK_option,C_init,
                                                      bound_lambda = best_lambda, bound_iterations=bound_it)
    # Evaluate the performance on dataset

    print('Elapsed time for SLK = %0.5f seconds' %elapsed)
    nmi_ = nmi(gnd_labels,l)
    acc_,_ = get_accuracy(gnd_labels,l)

    print('Result: NMI= %0.4f' %nmi_)
    print('        Accuracy %0.4f' %acc_)
    best_lambda = best_lambda_acc

    if saveresult:
        saveresult_path = data_dir + 'Result_'+dataset+'.mat'
        sio.savemat(saveresult_path,{'lmbda':best_lambda,'l':l,'C':C,'z':z})

    if mode_images and X_org is not None:
        if SLK_option == 'BO':
            mode_images_path = data_dir+dataset+'_modes'
            original_image_size = (28,28)
            util.mode_nn(mode_index,X,K,C,l,6,X_org,mode_images_path, original_image_size)
        else:
            print('\n For Mode images change option to -- BO and have image intensities X_org')
Example #13
class Brisk(Request):

    def __init__(self):
        super(Brisk, self).__init__()
        self.config = Config().config
        self.logger = Logger().logger
        self.proxy_manager = ProxyPool()
        self.db = DB().db

        self.__proxy_status = self.config.get('PROXY', 'empty')
        if self.__proxy_status == 'TRUE':
            self.proxy_manager.delete_valid_proxy()

        self.__hook_name = 'hook'
        self.__walk_name = 'walk'
        self.__flow_name = 'flow'

        self.__brisk_type = self.config.get('RUN', 'type')

        self.__func_filter = lambda m: not m.startswith("__") and \
                                       not m.startswith(self.__hook_name) and \
                                       not m.startswith(self.__walk_name) and \
                                       not m.startswith(self.__flow_name)

        self.__flow_num = int(self.config.get('RUN', 'num'))
        self.__hook = None
        self.__flow_queue = queue.Queue()
        self.__walk_queue = queue.Queue()
        self.__go_init()

    def __go_init(self):

        for method_name in list(
                filter(lambda m: m.startswith(self.__hook_name) and callable(getattr(self, m)), dir(self))):
            method = self.__class__.__dict__[method_name]
            obj = Core()
            obj.task = types.MethodType(method, obj)
            for func_name in filter(self.__func_filter, self.__class__.__dict__):
                func = self.__class__.__dict__[func_name]
                setattr(obj, func_name, types.MethodType(func, obj))
            self.__hook = obj
            break

        if self.__brisk_type == 'WALK':
            for method_name in list(
                    filter(lambda m: m.startswith(self.__walk_name) and callable(getattr(self, m)), dir(self))):
                self.__walk_queue.put(method_name)

        if self.__brisk_type == 'FLOW':
            for method_name in list(
                    filter(lambda m: m.startswith(self.__flow_name) and callable(getattr(self, m)), dir(self))):
                self.__flow_queue.put(method_name)

    def go(self):
        self.logger.info('brisk go')

        self.logger.info('brisk create {} task(s)'.format(self.__flow_queue.qsize()))
        # Capture the hook's baseline attributes before its task runs,
        # even when no hook exists, so the attribute diff below is safe.
        self.__hook_attr_base = dir(self.__hook)
        if self.__hook:
            self.logger.info('brisk create hook')
            self.__hook.start()
            self.__hook.join()
            self.logger.info('brisk complete hook')

        self.__hook: Core
        self.__hook_attr = []
        for method_name in dir(self.__hook):
            if method_name not in self.__hook_attr_base:
                self.__hook_attr.append(method_name)
        while not self.__walk_queue.empty():
            method_name = self.__walk_queue.get()
            method = self.__class__.__dict__[method_name]
            t = Core(self.__walk_queue)
            for attr_name in self.__hook_attr:
                setattr(t, attr_name, self.__hook.__dict__[attr_name])
            t.task = types.MethodType(method, t)
            for func_name in filter(self.__func_filter, self.__class__.__dict__):
                func = self.__class__.__dict__[func_name]
                setattr(t, func_name, types.MethodType(func, t))
            if self.__hook:
                t.make(self.__hook.headers, self.__hook.cookies)
            t.start()
            t.join()
        self.__walk_queue.join()

        while not self.__flow_queue.empty():
            if (threading.active_count() - 1) < self.__flow_num:
                method_name = self.__flow_queue.get()
                method = self.__class__.__dict__[method_name]
                t = Core(self.__flow_queue)
                for attr_name in self.__hook_attr:
                    setattr(t, attr_name, self.__hook.__dict__[attr_name])
                t.task = types.MethodType(method, t)
                for func_name in filter(self.__func_filter, self.__class__.__dict__):
                    func = self.__class__.__dict__[func_name]
                    setattr(t, func_name, types.MethodType(func, t))
                if self.__hook:
                    t.make(self.__hook.headers, self.__hook.cookies)
                t.start()
        self.__flow_queue.join()

        self.logger.info('brisk ok')
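The core mechanism in Brisk is binding plain functions onto freshly created Core instances with types.MethodType; a self-contained illustration of that binding:

import types


class Worker:
    pass


def task(self):
    return 'ran on {}'.format(type(self).__name__)


w = Worker()
w.task = types.MethodType(task, w)  # bind task as an instance method of w
assert w.task() == 'ran on Worker'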
Example #14
                    type=int,
                    required=False,
                    help="Max number of iterations of length [--minutes]")
args = parser.parse_args()

# Verbose setup
v = args.verbose

# Log setup
logs = args.logDirectory + '/'
if not os.path.exists(logs):
    os.makedirs(logs)

# Note: colons in filenames are not portable to Windows.
logfile = logs + datetime.now().strftime("%m-%d-%Y_%H:%M:%S") + ".txt"

logger = Logger(logfile, v)
if v:
    logger.logAndPrint("Verbosity turned on")

# Configurations
conf = Conf(logger)
camera = conf.camera

# Output setup
output = args.outputDirectory + "/"
if not os.path.exists(output):
    os.makedirs(output)

# Iterations
it = args.maxIterations
i = 0
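The parser definition is cut off above; a hedged reconstruction of the arguments the fragment reads. Argument names are inferred from the args.<attr> accesses, and the defaults are assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true",
                    help="Enable verbose logging")
parser.add_argument("--logDirectory", default="logs",
                    help="Directory for log files")
parser.add_argument("--outputDirectory", default="output",
                    help="Directory for program output")
parser.add_argument("--minutes", type=int, required=False,
                    help="Length of each iteration in minutes")
parser.add_argument("--maxIterations", type=int, required=False,
                    help="Max number of iterations of length [--minutes]")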