Example #1
    def __init__(self):

        print("load law school")
        data_dir = 'dataset'
        data_file = path.join(data_dir, 'lawschs1_1.dta')

        if not path.exists(data_file):
            request.urlretrieve(
                'http://www.seaphe.org/databases/FOIA/lawschs1_1.dta',
                data_file)
        dataset = pd.read_stata(data_file)
        dataset.drop([
            'enroll', 'asian', 'black', 'hispanic', 'white', 'missingrace',
            'urm'
        ],
                     axis=1,
                     inplace=True)
        dataset.dropna(axis=0, inplace=True, subset=['admit'])
        dataset.replace(to_replace='', value=np.nan, inplace=True)
        dataset.dropna(axis=0, inplace=True)
        dataset = dataset[dataset['race'] != 'Asian']

        for col in dataset.columns:
            if dataset[col].isnull().sum() > 0:
                dataset.drop(col, axis=1, inplace=True)

        self.con_vars = ['lsat', 'gpa']
        self.cat_vars = [
            col for col in dataset.columns if col not in self.con_vars
        ]
        self.columns_name = self.con_vars + self.cat_vars
        self.data = dataset[self.columns_name]

        self.data_info = get_data_info(self.data, self.cat_vars)
        self.con_loc = [dataset.columns.get_loc(var) for var in self.con_vars]
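
Every tabular example on this page (this one, and #6, #8, #9, #13 below) passes a DataFrame plus its categorical column names to a project-local get_data_info whose definition is never shown. A minimal sketch of what such a helper might return is below; the (width, kind) output format is an assumption for illustration, not taken from any of these projects.

import pandas as pd

def get_data_info(df: pd.DataFrame, cat_vars):
    # Hypothetical sketch: emit one (width, kind) pair per column --
    # width 1 for continuous columns, the number of distinct categories
    # for columns listed in cat_vars (e.g. for a later one-hot encoding).
    info = []
    for col in df.columns:
        if col in cat_vars:
            info.append((df[col].nunique(), 'categorical'))
        else:
            info.append((1, 'continuous'))
    return info
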
Example #2
File: main.py  Project: robspringles/IAN
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_file_name, FLAGS.test_file_name, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_file_name, word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_file_name, word2id, FLAGS.max_aspect_len,
                          FLAGS.max_context_len, FLAGS.test_data,
                          FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
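
The TensorFlow 1.x mains on this page read everything from tf.app.flags, but the snippets never show the flag definitions or the entry point. A minimal skeleton that would let a main like Example #2 run might look like this; the flag names come from the snippets, while the default values and the type of pre_processed are guesses (Example #14 at the bottom shows the same DEFINE_* pattern in project code).

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_file_name', './data/train.txt', 'training file')
tf.app.flags.DEFINE_string('test_file_name', './data/test.txt', 'testing file')
tf.app.flags.DEFINE_string('data_info', './data/data_info.txt', 'data info file')
tf.app.flags.DEFINE_string('train_data', './data/train_data.txt', 'cached training data')
tf.app.flags.DEFINE_string('test_data', './data/test_data.txt', 'cached testing data')
tf.app.flags.DEFINE_string('embedding_file_name', './data/glove.txt', 'embedding file')
tf.app.flags.DEFINE_integer('embedding_dim', 300, 'embedding dimension')
tf.app.flags.DEFINE_integer('pre_processed', 0, '1 if the data is already pre-processed')

if __name__ == '__main__':
    tf.app.run()  # parses the flags, then calls main(_)
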
Example #3
def main(use_cpu=1, batch_size=16, num_of_classes=8, first_n_byte=2000000):
    model = malConv.MalConv(num_of_classes)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        print("gpu!!!!!!!")
        device = torch.device("cuda:0")
        model = nn.Sequential(model)
        model = nn.DataParallel(model)
        model.to(device)
    else:
        device = torch.device("cpu")

    # Extracting data to data loaders.
    data_folder, classes_list = ut.get_data_info('classes.txt')
    train_set_list, train_labels, test_set_list, test_labels = ut.gen_train_and_dev_data_sets(
        data_folder, classes_list)

    train_loader = DataLoader(ExeDataset(train_set_list, train_labels,
                                         first_n_byte),
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=use_cpu)
    test_loader = DataLoader(ExeDataset(test_set_list, test_labels,
                                        first_n_byte),
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=use_cpu)

    train.train_on(model, train_loader, test_loader, len(test_labels),
                   len(train_labels), device, batch_size)
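
Here ut.get_data_info('classes.txt') is a different helper from the tabular ones above: from the call site it appears to return a data folder and a class list. ExeDataset is also project code that isn't shown; below is a minimal sketch of a compatible byte-level dataset, under the assumption that each sample is the first first_n_byte raw bytes of a file, zero-padded, which is not necessarily the project's actual class.

import torch
from torch.utils.data import Dataset

class ExeDataset(Dataset):
    # Hypothetical sketch: one sample = the first `first_n_byte` raw bytes
    # of an executable, zero-padded, plus its class label.
    def __init__(self, file_list, labels, first_n_byte):
        self.file_list = file_list
        self.labels = labels
        self.first_n_byte = first_n_byte

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with open(self.file_list[idx], 'rb') as f:
            raw = list(f.read(self.first_n_byte))
        raw += [0] * (self.first_n_byte - len(raw))  # pad short files
        return torch.tensor(raw, dtype=torch.long), self.labels[idx]
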
Example #4
File: main.py  Project: xiaomaniay/IAN
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.dataset, FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, FLAGS.dataset + 'train',
                           FLAGS.pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.dataset + 'test', FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        model = IAN(FLAGS, sess)
        model.build_model(train_data, test_data)
        model.run()

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
Example #5
def main(_):
    start_time = time.time()

    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        dataset, pre_processed)

    print('Loading training, validation and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, dataset + 'train',
                           pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len, FLAGS.max_context_len,
                          dataset + 'val', pre_processed)
    test_new_data = read_data(word2id, FLAGS.max_aspect_len,
                              FLAGS.max_context_len, dataset + 'test',
                              pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)

    model = IAN(FLAGS)
    run(model, train_data, test_data, test_new_data)

    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
Example #6
    def __init__(self, train, con_vars):

        print("load Synthetic Adult...")
        self.con_vars = con_vars
        self.cat_vars = [col for col in train.columns if col not in self.con_vars]

        self.columns_name = self.con_vars + self.cat_vars
        self.train = train[self.columns_name]
        # get data info
        self.data_info = get_data_info(self.train, self.cat_vars)
        print("Data info:", self.data_info)

        self.con_loc = [self.train.columns.get_loc(var) for var in self.con_vars]
Example #7
File: main.py  Project: haozijie/RAM
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_sentence_len, FLAGS.max_aspect_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_sentence_len, FLAGS.max_aspect_len,
                          FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess:
        model = RAM(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #8
    def __init__(self, i=1):
        print("load synthetic compas")
        df_compas = pd.read_csv(
            "./GenerateData/Compas/Compas_syn600_bs100_seed1_times_10.csv")
        df_compas = df_compas[3694 * i:3694 * (i + 1)]
        df_compas.loc[df_compas["diff_jail"] < 0, "diff_jail"] = 0
        self.con_vars = ['age', 'diff_custody', 'diff_jail', 'priors_count']
        self.cat_vars = [
            col for col in df_compas.columns if col not in self.con_vars
        ]
        self.columns_name = self.con_vars + self.cat_vars
        self.data = df_compas[self.columns_name]
        self.con_loc = [
            self.data.columns.get_loc(var) for var in self.con_vars
        ]
        # get data info
        self.data_info = get_data_info(self.data, self.cat_vars)
        print("Data info:", self.data_info)
Example #9
    def __init__(self):

        print("load Synthetic law school")
        df_lawsch = pd.read_csv(
            "./GenerateData/lawschool/lawschool_syn_300_bs500_seed0_times_10.csv"
        )
        df_lawsch = df_lawsch[:43011]
        df_lawsch['lsat'] = df_lawsch['lsat'].astype('int')
        df_lawsch['gpa'] = df_lawsch['gpa'].round(decimals=2)
        self.con_vars = ['lsat', 'gpa']
        self.cat_vars = [
            col for col in df_lawsch.columns if col not in self.con_vars
        ]
        self.columns_name = self.con_vars + self.cat_vars
        self.data = df_lawsch[self.columns_name]

        self.data_info = get_data_info(self.data, self.cat_vars)
        self.con_loc = [
            self.data.columns.get_loc(var) for var in self.con_vars
        ]
Example #10
    def transform(self):
        self.columns_name = self.data.columns
        self.output_info = get_data_info(self.data, self.categorical_columns)

        # one-hot encode the categorical columns
        self.data = pd.get_dummies(self.data,
                                   columns=self.categorical_columns,
                                   prefix_sep='=')
        # scale the continuous columns to [0, 1]
        self.scaler = MinMaxScaler()
        self.data[self.c_vars] = self.scaler.fit_transform(
            self.data[self.c_vars])
        print('Attributes', self.columns_name)
        print('Data info:', self.output_info)
        # convert to a numpy array
        data_np = self.data.values

        # optionally rescale to [-1, 1]:
        # data_np = (data_np[:, :] - 0.5) * 2

        return data_np
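
The prefix_sep='=' above yields column names like race=Caucasian, which keeps the original attribute recoverable from the one-hot header. A small round-trip demonstration (requires pandas >= 1.5 for from_dummies; the toy frame is made up):

import pandas as pd

df = pd.DataFrame({'age': [25, 40], 'race': ['Caucasian', 'African-American']})
onehot = pd.get_dummies(df, columns=['race'], prefix_sep='=')
# columns: ['age', 'race=African-American', 'race=Caucasian']
restored = pd.from_dummies(onehot.drop(columns='age'), sep='=')
# restored['race'] equals df['race']
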
Example #11
def main(_):
    print('Loading data info ...')
    #FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len = get_data_info(FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info, FLAGS.pre_processed)
    print('Step 1: collect basic info from the train and test data ...')
    word2id, max_sentence_len, max_aspect_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)
    #sys.exit()

    #tf.app.flags.DEFINE_string('word2id', word2id, 'word2id')
    tf.app.flags.DEFINE_integer('max_sentence_len', max_sentence_len,
                                'max sentence len')
    tf.app.flags.DEFINE_integer('max_aspect_len', max_aspect_len,
                                'max aspect len')

    print('Step 2: Loading training data and testing data ...')
    print('Step 2.1: reading training data ...')
    train_data = read_data(FLAGS.train_fname, word2id, max_sentence_len,
                           max_aspect_len, FLAGS.train_data,
                           FLAGS.pre_processed, FLAGS.sentiment_data)
    #sys.exit()
    print('Step 2.2: reading testing data ...')
    test_data = read_data(FLAGS.test_fname, word2id, max_sentence_len,
                          max_aspect_len, FLAGS.test_data, FLAGS.pre_processed,
                          FLAGS.sentiment_data)

    print('Loading pre-trained word vectors ...')
    word2vec = load_word_embeddings(FLAGS.embedding_fname, FLAGS.embedding_dim,
                                    word2id)

    with tf.Session() as sess:
        model = RAM(FLAGS, word2id, word2vec, sess)

        print('Build model ...')
        model.build_model()

        print('Run model ...')
        model.run(train_data, test_data)
Example #12
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
Example #13
    def __init__(self):

        print("load compas")
        data_dir = 'dataset'
        data_file = path.join(data_dir, 'compas-scores-two-years.csv')

        df = pd.read_csv(data_file)
        print(df.shape)

        df = df[df['days_b_screening_arrest'] >= -30]
        df = df[df['days_b_screening_arrest'] <= 30]
        df = df[df['is_recid'] != -1]
        df = df[df['c_charge_degree'] != '0']
        df = df[df['score_text'] != 'N/A']

        df['in_custody'] = pd.to_datetime(df['in_custody'])
        df['out_custody'] = pd.to_datetime(df['out_custody'])
        df['diff_custody'] = (df['out_custody'] - df['in_custody']).dt.days
        df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
        df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
        df['diff_jail'] = (df['c_jail_out'] - df['c_jail_in']).dt.days

        df.drop([
            'id', 'name', 'first', 'last', 'v_screening_date',
            'compas_screening_date', 'dob', 'c_case_number', 'screening_date',
            'in_custody', 'out_custody', 'c_jail_in', 'c_jail_out'
        ],
                axis=1,
                inplace=True)
        df = df[df['race'].isin(['African-American', 'Caucasian'])]

        features = df.drop([
            'is_recid', 'is_violent_recid', 'violent_recid', 'two_year_recid'
        ],
                           axis=1)
        labels = 1 - df['two_year_recid']

        features = features[[
            'age', 'sex', 'race', 'diff_custody', 'diff_jail', 'priors_count',
            'juv_fel_count', 'c_charge_degree', 'v_score_text'
        ]]
        self.data = pd.concat([features, labels], axis=1)
        self.data[['juv_fel_count', 'two_year_recid'
                   ]] = self.data[['juv_fel_count',
                                   'two_year_recid']].astype('object')

        #self.data = self.data.drop(['diff_jail'],axis=1)
        # discretize diff_custody
        #diff_custody(self.data)

        self.con_vars = [
            i for i in self.data.columns
            if self.data[i].dtype == 'int64' or self.data[i].dtype == 'float64'
        ]

        self.cat_vars = [
            i for i in self.data.columns if i not in self.con_vars
        ]

        self.columns_name = self.con_vars + self.cat_vars
        self.data = self.data[self.columns_name]
        self.con_loc = [
            self.data.columns.get_loc(var) for var in self.con_vars
        ]

        # get data info
        self.data_info = get_data_info(self.data, self.cat_vars)
        print("Data info:", self.data_info)
Example #14
                               'embedding file name')
    tf.app.flags.DEFINE_string('embedding', 'glove', 'oov')
    tf.app.flags.DEFINE_string('train_fname', './data/laptop/train.txt',
                               'training file name')
    tf.app.flags.DEFINE_string('test_fname', './data/laptop/test.txt',
                               'testing file name')
    tf.app.flags.DEFINE_string('data_info', './data/data_info.txt',
                               'the file saving data information')
    tf.app.flags.DEFINE_string('train_data', './data/train_data.txt',
                               'the file saving training data')
    tf.app.flags.DEFINE_string('test_data', './data/test_data.txt',
                               'the file saving testing data')

    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)

    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)

    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)

    with tf.Session() as sess: