def __init__(self, data, remove_stop_word=False, lemmatize=True):
        self.sentences = []
        self.graph = {}
        self.PRI = {}
        processor = ProcessData(data, "Stanford", False, lemmatize, remove_stop_word, False)

        self.sentences = processor.clean_sentences()
Example #2
def trainPredModel(data):
    '''
        Split the dataset into training and evaluation sets (20% evaluation split).
        Fit the model to the training data, and generate a doc with the evaluation
        results as well as a summary of the errors.
    '''
    process = ProcessData(data)
    padded_sequence, tokenizer, labels = process.createTrainData()

    ft = createFTEmbedding()
    ft.processDict(tokenizer)

    x_train, x_val, y_train, y_val = train_test_split(padded_sequence, labels, test_size=0.2)

    model = compileModel(padded_sequence, labels, tokenizer,
                         ft.embedding_matrix)

    model.fit(x_train,
              y_train,
              batch_size=20,
              epochs=10,
              validation_data=(x_val, y_val))

    y_pred = model.predict(x_val)
    translateEvalData(y_pred, y_val, x_val, tokenizer)
    return model, x_train, x_val, y_train, y_val, y_pred
Example #3
class Receiver:
    def __init__(self):
        # Symbolic name meaning all available interfaces
        self.host = ''
        # IP address for the ambulance
        self.ip = 'localhost'
        self.port = 10000

        # Create a UDP socket
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

        # Bind socket to local host and port
        self.sock.bind((self.host, self.port))

        # Queue with the two last data objects received
        self.position_history = deque()

        self.current_data = None
        self.format_dict = dict.fromkeys(
            ['timestamp', 'longitude', 'latitude'])
        self.process_data = ProcessData()

    def add_to_queue(self, dict_insert):
        '''Add received element to the receiver queue.
        Keyword arguments:
        dict_insert -- dict containing GPS data and timestamp
        '''
        self.position_history.append(dict_insert)

    def notify_process_data(self):
        '''Notifies process_data when receiving data from sender'''
        if len(self.position_history) > 1:
            self.process_data.notify(self.get_data())

    def get_data(self):
        '''Returns the two last datasets from position_history'''
        if len(self.position_history) > 1:
            new_amb = self.position_history.popleft()
            old_amb = self.position_history.popleft()
            self.position_history.appendleft(old_amb)
            return [new_amb, old_amb]

    def receive(self):
        '''Receive data and convert bytes to string.'''
        try:
            # Listen until terminated by user
            while True:
                rawdata, addr = self.sock.recvfrom(1024)
                data = rawdata.decode(encoding='UTF-8')
                # Convert data to dictionary
                acceptable_string = data.replace("'", "\"")
                data = json.loads(acceptable_string)
                self.add_to_queue(data)
                self.notify_process_data()
                if not data:
                    break
        except (KeyboardInterrupt):
            print("Closing socket")
            self.sock.close()
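A minimal sender sketch for exercising this Receiver locally; it is not part of the original example. The field names mirror the format_dict keys above, the coordinate values are placeholders, and the payload is a single-quoted Python-dict string, which receive() rewrites into valid JSON.

import socket
import time

# Hypothetical test sender (an assumption, not from the original project).
# Receiver.receive() replaces single quotes with double quotes before json.loads(),
# so sending str(dict) of a flat dict with the expected keys is enough for a local test.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
for i in range(5):
    position = {'timestamp': time.time(),
                'longitude': 10.0 + i * 1e-4,   # placeholder coordinates
                'latitude': 63.0}
    sock.sendto(str(position).encode('UTF-8'), ('localhost', 10000))
    time.sleep(1)
sock.close()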
Example #4
def main():
    data_processor = ProcessData()
    forecast_util = Forecast()
    series, train, test = data_processor.read_data('shampoo-sales.csv')

    forecasts = forecast_util.make_forecasts(train, test, 1, 3)
    forecast_util.evaluate_forecasts(test, forecasts, 1, 3)
    forecast_util.plot_forecasts(series, forecasts, 12)
Example #5
def main():
    print('processing data...')
    pd = ProcessData()
    data, half_data = pd()
    print('done processing!')

    trainer = Train()
    kfold_trainer = TrainKfold()
    gaussian_trainer = TrainGaussian()

    # [0.8795, 0.9035, 0.9285, 0.9435, 0.9505, 0.96, 0.963, 0.958, 0.954, 0.9525]
    trainer(data, 'lr', 0.0001, 4, "logistic regression")
    # [0.957, 0.959, 0.96, 0.9595, 0.9575, 0.958, 0.9595, 0.953, 0.9515, 0.947]
    trainer(half_data, 'svm', 0.04, 1.6, "support vector machine")

    # lr: 1.0248700000000002 - training: 0.9738333333333333 testing: 0.9585
    kfold_trainer(data, 'lr', 0.7, 1.1, k=5)
    # svm: 0.06 - training: 0.9683333333333334 testing: 0.958
    kfold_trainer(half_data, 'svm', 0.04, 1.5, k=5)

    # [0.0026414886541018556, 0.0030797496623704845, 0.0035907244833864723, 0.004186477385849299, 0.004881074274375396,
    # 0.0056909147897226675, 0.006635119509224956, 0.007735981389354633, 0.009019492109107729, 0.010515955741336555,
    # 0.012260704240994024, 0.014294931643181576, 0.016666666666666666, 0.019431906686330536, 0.022655939847975447,
    # 0.02641488654101857]
    gaussian_trainer(half_data, 0.04, 1.5)
Example #6
def main():
    """ Kieran Ringel
    For each data set three lines are run in main.
    The first creates an instance of Org with the arguments being the data file name, an array of rows with header
    information to be removed, and the column location of the class so that all the classes can be put in the same column.
    The second line takes the instance of Org and calls the open method, returning the pandas dataframe of the file.
    The third line creates an instance of ProcessData, the arguments are the dataframe created in Org.open(), classification or
    regression, the type (none, edited, condensed), and an array of the columns with discrete values."""

    #print('glass')
    #glass = Org('Data/glass.data', [-1], -1)
    #df = glass.open()
    #ProcessData(df, 'classification', 'condensed', [-1])
    #print('image')
    #img = Org('Data/segmentation.data', [0, 1, 2, 3, 4], 0)
    #df = img.open()
    #ProcessData(df, 'classification', 'condensed', [-1])
    #print('vote')
    #vote = Org('Data/house-votes-84.data', [-1], 0)
    #df = vote.open()
    #ProcessData(df, 'classification', 'edited', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    #print('abalone')
    #abalone = Org('Data/abalone.data', [-1], -1)
    #df = abalone.open()
    #ProcessData(df, 'regression', 'edited', [0])
    print('machine')
    machine = Org('Data/machine.data', [-1], -1)
    df = machine.open()
    ProcessData(df, 'regression', 'condensed', [0, 1])
Example #7
    def create_relations_image(self):
        # Create the random data
        data_dict_obj = DataDict(self.data_dicts, self.max_rep)

        # Write the data to a csv file
        data_dict_obj.write_dup_data(self.csv_file_name)

        # Process the data and get counts of emails
        process_data_obj = ProcessData(self.csv_file_name)
        processed_counts = process_data_obj.process_data()

        # Make a graph of the processed data
        graph_file_obj = MakeGraphFile(self.dot_file_name, self.png_file_name)
        graph_file_obj.write_dot_file(processed_counts)

        # Write the graph to a file
        graph_file_obj.write_png_file()
Example #8
    def __init__(self, data):
        self.data = []

        for sentence in data:
            tokens = sent_tokenize(sentence)
            for s in tokens:
                if s not in self.data:
                    self.data.append(s)

        self.sentence_weights = {}

        data_processor = ProcessData(data)

        self.sentences = data_processor.remove_tags(
            data_processor.clean_sentences())

        self.probabilities = self._get_probabilities(self.sentences)
Example #9
    def __init__(self):
        # Symbolic name meaning all available interfaces
        self.host = ''
        # IP address for the ambulance
        self.ip = 'localhost'
        self.port = 10000

        # Create a UDP socket
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

        # Bind socket to local host and port
        self.sock.bind((self.host, self.port))

        # Queue with the two last data objects received
        self.position_history = deque()

        self.current_data = None
        self.format_dict = dict.fromkeys(
            ['timestamp', 'longitude', 'latitude'])
        self.process_data = ProcessData()
Example #10
def handle_socket(data, client_address, local_storage):
    # print('The client at {} says {!r}'.format(client_address, data))
    # Load received data into JSON
    data_receive = json.loads(data.decode())

    # Split JSON data into specific parts
    process_data = ProcessData(data_receive)

    # Verify if the client_code is valid
    client_code = process_data.client_code

    # Verify client code
    if client_code == "T":

        print("MESSAGE RECEIVED: T To: {} From: {} Text: {}".format(
            process_data.dest_id, process_data.client_id,
            process_data.send_text))
        if process_data.dest_id not in local_storage:
            # Create a queue to store data with the same key
            local_storage[process_data.dest_id] = []
        local_storage[process_data.dest_id].append(process_data)
        result = json.dumps({"message": "messages stored"})

    elif client_code == "C":
        if process_data.client_id in local_storage.keys():
            # Pop the oldest stored message for this client
            obj = local_storage[process_data.client_id].pop(0)
            if obj.send_text is not None:
                result = json.dumps({
                    "message": obj.send_text,
                    "sender": obj.client_id
                })  # Convert text and id into json
                print("MESSAGE TO SEND: T To: {} From: {} Text: {}".format(
                    obj.dest_id, obj.client_id, obj.send_text))
            else:
                result = json.dumps({
                    "message": "No Text",
                    "sender": obj.client_id
                })
                print("MESSAGE TO SEND: T To: {} Text: No Text".format(
                    obj.client_id))
        else:
            result = json.dumps({"message": "No Text"})
            print("MESSAGE TO SEND: T To: {} Text: No Text".format(
                process_data.client_id))

    else:
        result = json.dumps({"message": "Invalid <code> format. "})
    with socket_lock:
        UDPServerSocket.sendto(result.encode(), client_address)
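For reference, a sketch of the two client payloads this handler accepts, assuming the ProcessData attributes (client_code, client_id, dest_id, send_text) are parsed from JSON fields of the same names; the exact field names depend on ProcessData and are an assumption here.

import json

# Hypothetical payloads (field names assumed to match the ProcessData attributes above).
# "T" stores a text message for dest_id; "C" collects the oldest message stored for client_id.
store_msg = json.dumps({"client_code": "T", "client_id": "alice",
                        "dest_id": "bob", "send_text": "hello"})
collect_msg = json.dumps({"client_code": "C", "client_id": "bob"})
# Each would be sent over UDP, e.g. client_sock.sendto(store_msg.encode(), server_address).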
Example #11
    def post(self):
        logging.info("---------------------------")
        logging.info("Class ProcessDataSummary: post")
        # Validate request
        redirect_url = authorise_user(self.request.uri)
        if redirect_url:
            logging.info("Redirecting")
            self.redirect(redirect_url)
            return

        # Process data.
        data = self.request.body
        logging.info(">>>>> {}".format(data))
        file_name = FileUtils().create_file(data=data)
        ProcessData().process_data(data=data)
        logging.info("---------------------------")
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json.dumps({"file_name": file_name}))
Example #12
class TestProcessData(TestCase):
    def setUp(self):
        self.process_data = ProcessData('./test/input/',
                                        './test/output/temp_folder/')

    def test___get_csv_names(self):
        names = [
            './test/input/subfolder1/test1.json',
            './test/input/subfolder1/test2.json',
            './test/input/subfolder2/test3.json',
        ]
        test_names = [name for name in self.process_data.get_csv_names()]
        print(test_names)
        self.assertEqual(len(test_names), 3)
        self.assertTrue(all(name in names for name in test_names))

    def tearDown(self):
        pass
Example #13
    def __init__(self):
        # Symbolic name meaning all available interfaces
        self.host = ''
        # IP address for the ambulance
        self.ip = 'localhost'
        self.port = 10000

        # Create a UDP socket
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

        # Bind socket to local host and port
        self.sock.bind((self.host, self.port))

        # Queue with the two last data objects received
        self.position_history = deque()

        self.current_data = None
        self.format_dict = dict.fromkeys(
            ['timestamp', 'longitude', 'latitude'])
        self.process_data = ProcessData()
Example #14
from process_data import ProcessData
from model import Model

if __name__ == '__main__':
    # one module for data preprocessing
    # another module for model construction
    # another for evaluation (k-fold)
    train_data = ProcessData()
    model = Model()
    model.construct_model(train_data.data_frame)

    #test
    test_data = ProcessData(file_name="test.tsv")
    model.evaluate_model(test_data.data_frame)
Example #15
    def setUp(self):
        self.process_data = ProcessData('./test/input/',
                                        './test/output/temp_folder/')
Example #16
class Receiver:

    def __init__(self):
        # Symbolic name meaning all available interfaces
        self.host = ''
        # IP address for the ambulance
        self.ip = 'localhost'
        self.port = 10000

        # Create a UDP socket
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

        # Bind socket to local host and port
        self.sock.bind((self.host, self.port))

        # Queue with the two last data objects received
        self.position_history = deque()

        self.current_data = None
        self.format_dict = dict.fromkeys(
            ['timestamp', 'longitude', 'latitude'])
        self.process_data = ProcessData()


    def add_to_queue(self, dict_insert):
        '''Add received element to the receiver queue.
        Keyword arguments:
        dict_insert -- dict containing GPS data and timestamp
        '''
        self.position_history.append(dict_insert)

    def notify_process_data(self):
        '''Notifies process_data when receiving data from sender'''
        if len(self.position_history) > 1:
            self.process_data.notify(self.get_data())

    def get_data(self):
        '''Returns the two last datasets from position_history'''
        if len(self.position_history) > 1:
            new_amb = self.position_history.popleft()
            old_amb = self.position_history.popleft()
            self.position_history.appendleft(old_amb)
            return [new_amb, old_amb]

    def receive(self):
        '''Receive data and convert bytes to string.'''
        try:
            # Listen until terminated by user
            while True:
                rawdata, addr = self.sock.recvfrom(1024)
                data = rawdata.decode(encoding='UTF-8')
                # Convert data to dictionary
                acceptable_string = data.replace("'", "\"")
                data = json.loads(acceptable_string)
                self.add_to_queue(data)
                self.notify_process_data()
                if not data:
                    break
        except KeyboardInterrupt:
            print("Closing socket")
            self.sock.close()
Example #17
PKU_Datafile = '../data/PKU_MOOC/question_sessions.csv'
PKU_train = '../data/PKU_MOOC/training.csv'
PKU_test = '../data/PKU_MOOC/testing.csv'
PKU_evaluate_result = '../data/PKU_MOOC/PKU_results.csv'
PKU_Category_MappingFile = '../data/PKU_MOOC/question_category.csv'

fname_TrainData = ASSISTment_train
fname_TestData = ASSISTment_test
fname_MapData = ASSISTment_Category_MappingFile
fname_Result = ASSISTment_evaluate_result

DATA_READY = True
SILENT_WARNINGS = True  # to silence the warnings (https://github.com/tensorflow/tensorflow/issues/8037)

if not DATA_READY:
    PreProcess = ProcessData(data_folder=DATA_FOLDER)
    PreProcess.ASSISTment_load_save(ASSISTment_Datafile)
if SILENT_WARNINGS:
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


def main(args):
    batch_size = args.batch_size
    n_epoch = args.n_epoch
    n_step = args.n_step
    keep_prob = args.keep_prob
    n_hidden_units = args.n_hidden_units
    embedding_size = args.embedding_size
    initial_learning_rate = args.initial_learning_rate
    final_learning_rate = args.final_learning_rate
    assert args.embedding in ['random',
Example #18
    def build_model(self, epoch_config):
        """Builds tensorflow neural network layers. 
        
        Initializes layers weights, biases from random normal distribution. Connects layers by matrix multiplication 
        and apply activation function (for non-linearities) except last layer
        
        Args:
            FLAGS: Macro dictionary contains user params and some defaults            
            epoch_config: epoch configuration

        epoch_config:
            nodes_per_layer: List contains number of nodes in each layers and its length determines number of layers
                Example:
                  nodes_per_layer = [1, 5, 4, 1]
                  First (input) layer has 1 node takes input vector of (N,)
                  Second hidden layer has 5 nodes (with tanh activation)
                  Third hidden layer has 4 nodes (with tanh activation)
                  Fourth output layer has 1 node outputs vector of (N,)
            optimizer: optimizer to use for training. Default is Adam
            learning_rate: learning rate used by optimizer. Default is 0.05
            activation: non-linear activation function. Default is relu
            
        Returns:
            None. Computed costs are saved to self.costs
            
        Raises:
            None
        """

        # Instantiate DNN Regressor Data Class
        PD = ProcessData(self.logging)

        if epoch_config['load_data'] == True:
            self.X_train, self.X_test, self.Y_train, self.Y_test = PD.loadData(
            )
        else:
            # Initialize variables to prepare synthetic data
            N = epoch_config['data_instances']
            M = epoch_config['data_features']
            self.X_train, self.X_test, self.Y_train, self.Y_test = PD.generateData(
                N, M)

        self.logging.info("Node per layer:%s", epoch_config['nodes_per_layer'])
        self.logging.info("Train Optimizer:%s",
                          epoch_config['train_optimizer'])
        self.logging.info("Learning rate:%f", epoch_config['learning_rate'])
        self.logging.info("Activation:%s", epoch_config['activation'])

        # Local variables
        nodes_per_layer = epoch_config['nodes_per_layer']

        # TensorFlow Variables and Placeholders

        # Global iteration steps
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Placeholder for input features and target output
        self.input_features = tf.placeholder(tf.float64)
        self.target_output = tf.placeholder(tf.float64)

        # Each layer is a matrix multiplication followed by a set of nonlinear operators
        # The size of each matrix is [size of output layer] x [size of input layer]
        layer_matrices = [
            None,
        ] * len(nodes_per_layer)
        layer_biases = [
            None,
        ] * len(nodes_per_layer)

        # Compute weight matrices and biases for the layered neural network
        for layer in range(len(nodes_per_layer) - 1):
            input_size = nodes_per_layer[layer]
            output_size = nodes_per_layer[layer + 1]
            layer_matrices[layer] = tf.Variable(
                tf.random_normal([output_size, input_size], dtype=tf.float64))
            layer_biases[layer] = tf.Variable(
                tf.random_normal([output_size, 1], dtype=tf.float64))
            self.logging.info(
                "[%d] layer_matrices for layer %d of size %d x %d",
                epoch_config['opt_epoch_iter'], layer, output_size, input_size)

        # Now we need to compute the output. We'll do that by connecting the matrix multiplications
        # through non-linearities except at the last layer, where we will just use matrix multiplication.
        intermediate_outputs = [
            None,
        ] * (len(nodes_per_layer) - 1)
        for layer in range(len(nodes_per_layer) - 1):
            if layer == 0:
                matmul = tf.add(
                    tf.matmul(layer_matrices[layer], self.input_features),
                    layer_biases[layer])
            else:
                matmul = tf.add(
                    tf.matmul(layer_matrices[layer],
                              intermediate_outputs[layer - 1]),
                    layer_biases[layer])

            if layer < len(nodes_per_layer) - 2:
                self.logging.info("Using Activation: %s",
                                  epoch_config['activation'])
                if epoch_config['activation'] == "tanh":
                    intermediate_outputs[layer] = tf.nn.tanh(matmul)
                else:  # Default "relu"
                    intermediate_outputs[layer] = tf.nn.relu(matmul)
            else:
                intermediate_outputs[layer] = matmul

        # And now the output -- we'll simply use matrix multiplication
        self.output = intermediate_outputs[-1]

        # compute error between target vs estimated output
        error = self.output - self.target_output
        self.cost = tf.matmul(error, tf.transpose(error))

        # optimize for loss or cost
        self.logging.info("Using Train Optimizer: %s",
                          epoch_config['train_optimizer'])
        if epoch_config['train_optimizer'] == "sgd":
            self.opt = tf.train.GradientDescentOptimizer(
                epoch_config['learning_rate'])
        elif epoch_config['train_optimizer'] == "Adagrad":
            self.opt = tf.train.AdagradOptimizer(epoch_config['learning_rate'])
        else:  # Default is Adam
            self.opt = tf.train.AdamOptimizer(epoch_config['learning_rate'])

        # Between-graph replication. If enabled, training happens *synchronously*
        if epoch_config['sync_replicas'] == True:
            worker_spec = epoch_config['worker_hosts'].split(",")
            # Get the number of workers.
            num_workers = len(worker_spec)

            self.opt = tf.train.SyncReplicasOptimizer(
                self.opt,
                replicas_to_aggregate=num_workers,
                total_num_replicas=num_workers,
                name="nn_sync_replicas")
            self.logging.info("Sync Replica Optimizer Enabled...")

        self.train_step = self.opt.minimize(self.cost, global_step=global_step)

        return self.opt
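A sketch of an epoch_config covering every key that build_model() reads above; the values are illustrative placeholders, not settings from the original project.

# Illustrative epoch_config (keys taken from the reads in build_model(); values are placeholders).
epoch_config = {
    'load_data': False,            # False -> generate synthetic data via PD.generateData(N, M)
    'data_instances': 1000,        # N, used only when load_data is False
    'data_features': 1,            # M, used only when load_data is False
    'nodes_per_layer': [1, 5, 4, 1],
    'train_optimizer': 'Adam',     # 'sgd', 'Adagrad', or the default Adam
    'learning_rate': 0.05,
    'activation': 'relu',          # 'tanh' or the default 'relu'
    'opt_epoch_iter': 1,           # only used in a log message here
    'sync_replicas': False,        # True enables tf.train.SyncReplicasOptimizer
    'worker_hosts': '',            # comma-separated hosts, read only when sync_replicas is True
}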
Example #19
    def train_predict(self, data, time_budget, n_class, schema):
        s1 = time.time()
        seed = SEED
        fix_seed(seed)
        LOGGER.info(f'time_budget:{time_budget}')
        LOGGER.info(f'n_class:{n_class}')
        LOGGER.info(f'node:{data["fea_table"].shape[0]}')
        LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

        #pre-process data
        process_data = ProcessData(data)
        table = process_data.pre_process(time_budget, n_class, schema)

        # Feature Dimension Reduction
        feat = Feat()

        process_data.drop_unique_columns(table)
        drop_sum_columns = process_data.drop_excessive_columns(table)

        feat.fit_transform(table, drop_sum_columns)
        LOGGER.info(
            f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}'
        )

        # This doesn't seem to be used anywhere
        table.large_features = False
        if table.ori_columns.shape[0] > 500:
            table.large_features = True

        model_type_list = ['sage', 'gat', 'tagc', 'gcn']

        repeat = 3
        model_name_list = [
            f'{model_type_list[i]}{i+len(model_type_list)*j}'
            for j in range(repeat) for i in range(len(model_type_list))
        ]
        model_type_list = model_type_list * repeat

        LOGGER.info('use node embedding')
        categories = [
            'node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins'
        ]

        for model in set(model_type_list):
            LOGGER.info(
                f"""{model} feature num:{eval(f'table.{model}_columns.shape[0]')}"""
            )
            exec(
                f'table.{model}_data = process_data.process_gnn_data(table,table.{model}_columns,categories)'
            )

        allmodel = AllModel()

        table.lr_epoch = 16

        table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

        train_valid_idx_list, valid_idx_list = split_train_and_valid(
            table, train_rate=0.8, seed=SEED, mode=split_mode)
        train_idx, test_idx = split_train_and_test(table)

        test_idx = test_idx.sort_values()
        run_model = []
        run_type = []
        run_time = {}
        for i in range(len(model_type_list)):
            seed = SEED * (i + 1)
            fix_seed(seed)
            model_type = model_type_list[i]
            model_name = model_name_list[i]
            if model_type not in run_time:
                init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                    table,
                    model_type,
                    model_name,
                    train_idx,
                    test_idx,
                    seed=seed)
                run_lr_time = len(table.lr_list) * (
                    init_time + table.lr_epoch * one_epoch_time)
                run_time500 = init_time * (2) + one_epoch_time * (
                    500 + early_stopping_rounds) * 2 + run_lr_time
                run_time300 = init_time * (2) + one_epoch_time * (
                    300 + early_stopping_rounds) * 2 + run_lr_time
                run_time150 = init_time * (2) + one_epoch_time * (
                    150 + early_stopping_rounds) * 2 + run_lr_time
                run_time[model_type] = (run_time500 - run_lr_time,
                                        run_time300 - run_lr_time,
                                        run_time150 - run_lr_time,
                                        early_stopping_rounds, init_time,
                                        one_epoch_time, run_lr_time)
            else:
                run_time500, run_time300, run_time150, early_stopping_rounds, init_time, one_epoch_time, run_lr_time = run_time[
                    model_type]
            s2 = time.time()
            LOGGER.info(
                f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s"
            )
            if s2 - s1 + run_time500 + 5 < time_budget:
                LOGGER.info('train 500 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=500,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time300 + 5 < time_budget:
                LOGGER.info('train 300 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=300,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time150 + 5 < time_budget:
                LOGGER.info('train 150 epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=150,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif len(allmodel.valid_models[0]) == 0:
                this_epoch = int((
                    (time_budget -
                     (s2 - s1 + 5) - run_lr_time) / 2 - init_time) /
                                 (one_epoch_time) - early_stopping_rounds)
                LOGGER.info(f'short time train {this_epoch} epoch')
                allmodel.V37_fit_transform(table,
                                           model_type,
                                           model_name,
                                           train_valid_idx_list,
                                           valid_idx_list,
                                           train_idx,
                                           test_idx,
                                           mode=split_mode,
                                           num_boost_round=this_epoch,
                                           seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif time_budget - (s2 - s1) < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

        if offline:
            if table.especial:
                df = table.df[['node_index', 'is_test']]
                df = df.merge(data['test_label'], how='left', on='node_index')
                test_label = df.loc[(df['is_test'] == 1) &
                                    (table.directed_mask.tolist()),
                                    'label'].astype('int').values
            else:
                test_label = data['test_label']['label'].values
        else:
            test_label = None

        preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel,
                                       model_name_list, table, test_label,
                                       valid_idx_list)
        preds = (preds1 + preds2) / 2

        preds = preds.argmax(axis=1).flatten()

        if table.especial:
            LOGGER.info(f'preds\n{preds}')
            df = table.df[['label', 'is_test']]
            df['preds'] = int(
                df.loc[[not i for i in table.directed_mask.tolist()],
                       'label'].value_counts().index[0])
            df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                   'preds'] = preds
            preds = df.loc[df['is_test'] == 1, 'preds'].values

        LOGGER.info(
            f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}"
        )
        df_preds = pd.Series(preds, name='preds')
        LOGGER.info(
            f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

        if offline:
            preds1 = preds1.argmax(axis=1).flatten()
            preds2 = preds2.argmax(axis=1).flatten()
            if table.especial:
                LOGGER.info(f'preds1\n{preds1}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds1
                preds1 = df.loc[df['is_test'] == 1, 'preds'].values

                LOGGER.info(f'preds2\n{preds2}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(
                    df.loc[[not i for i in table.directed_mask.tolist()],
                           'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()),
                       'preds'] = preds2
                preds2 = df.loc[df['is_test'] == 1, 'preds'].values

            df_test = table.df[['degree', 'label', 'is_test']]
            df_test = df_test.loc[df_test['is_test'] == 1]
            df_test['preds'] = preds
            df_test['label'] = data['test_label']['label'].values
            df_test['acc'] = df_test['preds'] == df_test['label']

            pd.set_option('display.max_rows', 1000)
            print(df_test.groupby('degree')['acc'].mean())

            return preds, valid_acc1, valid_acc2, preds1, preds2
        else:
            return preds
Example #20
            with tf.name_scope('Evaluation'):
                # MAPE: the y + 1 in the denominator avoids inf when y == 0
                self.MAPE = tf.reduce_mean(
                    tf.divide(tf.abs(tf.subtract(self.pred, self.y)),
                              tf.add(self.y, 1.0)))
                # RMSE
                self.RMSE = tf.sqrt(
                    tf.reduce_mean(tf.square(tf.subtract(self.pred, self.y))))

            self.merged = tf.summary.merge_all()


if __name__ == '__main__':
    pdata = pdata()
    training_X, training_Y, testing_X, testing_Y = pdata.get_taxi_data_24()
    training_X_batch = np.shape(np.reshape(
        training_X, (-1, 24, 3600)))[0]  # (B, T, N)  255*24*3600

    g = Graph()
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    with tf.Session(graph=g.graph, config=config) as sess:
        # tensorboard writer
        writer = tf.summary.FileWriter('logs/', sess.graph)

        tStart = time.time()
        sess.run(tf.global_variables_initializer())
        step = 0

        while step < training_iters:
Example #21
import pdfkit
from process_data import ProcessData, create_dir

# Read the configuration file
cf = configparser.ConfigParser()
cf.read('conf.ini')
account_id = cf.get("juejin", "id")
client_id = cf.get("juejin", "client_id")
uid = cf.get("juejin", "uid")
src = cf.get("juejin", "src")
token = cf.get("juejin", "token")
get_url = cf.get("juejin", "getUrl")
section_url = cf.get("juejin", "getSectionUrl")
book_name = cf.get("juejin", "bookName")

create_dir()
process = ProcessData(account_id=account_id, uid=uid, client_id=client_id, token=token,
                      book_name=book_name, get_url=get_url, section_url=section_url, src=src,
                      dir_path="source")

section_ids = process.get_section_id()

process.get_content_from_section(section_ids)
# For building a mobi file
# create_dir("dist")

# Originally planned to use this library to build the mobi: https://github.com/jachinlin/kindle_maker, but it currently raises errors
# make_ebook("source", "dist")

# Build the ebook with pdfkit
pdfkit.from_file(book_name.strip('"')+'.html', book_name.strip('"')+'.pdf')