import json


def load_json(p, lower):
    """Split a tokenized-JSON story into source sentences and '@highlight'
    summary targets."""
    source = []
    tgt = []
    flag = False
    with open(p) as f:
        sentences = json.load(f)['sentences']
    for sent in sentences:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            tgt.append([])
            continue
        if flag:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
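As a quick sanity check of the splitting logic, here is a self-contained usage sketch; the tiny JSON document and the pass-through stand-in for clean are hypothetical:

import json

def clean(s):
    return s  # hypothetical pass-through stand-in for the real clean()

doc = {'sentences': [
    {'tokens': [{'word': 'Hello'}, {'word': 'world'}]},
    {'tokens': [{'word': '@highlight'}]},
    {'tokens': [{'word': 'A'}, {'word': 'greeting'}]},
]}
with open('story.json', 'w') as f:
    json.dump(doc, f)

source, tgt = load_json('story.json', lower=True)
print(source)  # [['hello', 'world']]
print(tgt)     # [['a', 'greeting']]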
Example #2
def process(self, row):
    # Trailing comma: each row maps to a one-element tuple.
    return clean(row.get('argument') or '',
                 reserved_words=self.reserved_words),
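For context, a self-contained illustration of the one-tuple return; the host class and the stand-in clean are hypothetical:

def clean(text, reserved_words=()):
    return text.strip()  # hypothetical stand-in for the real clean()

class RowProcessor:
    # Hypothetical host class for the process method above.
    def __init__(self, reserved_words=()):
        self.reserved_words = reserved_words

    def process(self, row):
        return clean(row.get('argument') or '',
                     reserved_words=self.reserved_words),

print(RowProcessor().process({'argument': '  SELECT 1  '}))  # ('SELECT 1',)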
Example #4
import os
import sys


def reduce_log(tablename, cur):

    print("Reducing general_log.{0} and storing into reduced_log".format(tablename), file=sys.stderr)
    print("Selecting results...", file=sys.stderr)

    cur.execute("USE reduced_log")
    print_and_execute("SELECT user, userid FROM users", cur)
    users = dict(cur.fetchall())
    usernum = max(users.values()) + 1 if users else 0  # first open usernum
    newusers = []

    print_and_execute("SELECT server, serverid FROM servers", cur)
    servers = dict(cur.fetchall())
    servernum = max(servers.values()) + 1 if servers else 0  # first open servernum
    newservers = []

    cur.execute('USE general_log')
    print_and_execute("""SELECT * FROM {0} WHERE command_type IN ('Execute', 'Query')""".format(tablename), cur)

    print >>sys.stderr, "Selected results, cleaning queries and writing temp file..."

    temp_filename = '{0}_reduced.tmp'.format(tablename)
    outfile = open(temp_filename, 'w')

    for event_time, user_host, thread_id, server_id, command_type, query in cur:

        cleaned_query = clean(query, reserved_words)
        # Clean the query some more: remove numlists, replace constants
        cleaned_query = numlist_re.sub(numlist_sub_fcn, cleaned_query)
        try:
            user, server, cleaned_query = reducer.accept(user_host, cleaned_query)
        except TypeError:
            continue
        cleaned_query, vals = repl_constants(cleaned_query)
        vals = ' ~ '.join(vals)

        if user not in users:
            users[user] = usernum
            newusers.append(user)
            usernum += 1
        if server not in servers:
            servers[server] = servernum
            newservers.append(server)
            servernum += 1
            
        if cleaned_query.startswith('INSERT INTO'):
            query_type = 'INSERT'
            if values_re.search(cleaned_query):
                #TODO: count number of rows inserted. Nontrivial because of parens, commas, quotes, etc.
                cleaned_query = insert_re.match(cleaned_query).group(0) + ' <values>'
                vals = ''
        elif cleaned_query.startswith('SELECT'):
            query_type = 'SELECT'
        elif cleaned_query.startswith('CREATE TABLE'):
            # Replacing schemas with length + hash doesn't help much.
            # There aren't many create table statements (~1%)
            # cleaned_query = re.sub(r'\(.*\)',
            #                        lambda x: '<schema len={0}, hash={1}>'.format(x.group().count(',')+1, x.group().__hash__()),
            #                        cleaned_query)
            query_type = 'CREATE_TABLE'
        elif cleaned_query.startswith('SET'):
            query_type = 'SET'
        elif cleaned_query.startswith('LOAD DATA'):
            query_type = 'LOAD'
        elif cleaned_query.startswith('ALTER'):
            query_type = 'ALTER'
        else:
            query_type = 'OTHER'

        # server_id is ignored because it is always 0
        cleaned_query = repr(cleaned_query)[1:-1]  # escape newlines etc. so each record stays on one line
        final = event_time, users[user], servers[server], thread_id, query_type, cleaned_query, vals
        print('\t'.join(str(s) for s in final), file=outfile)
        
    outfile.close()

    print >>sys.stderr, "Wrote temp file, loading data into {0} table...".format(tablename)

    cur.execute("USE reduced_log")
    cur.execute("""CREATE TABLE {0} (event_time DATETIME,
                                     userid INT,
                                     serverid INT,
                                     thread_id INT(11),
                                     query_type ENUM{1},
                                     query MEDIUMTEXT,
                                     vals MEDIUMTEXT,
                                     INDEX (userid),
                                     INDEX (serverid),
                                     INDEX (event_time)
                                    )""".format(tablename, querytypes))

    cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format(temp_filename, tablename))
    os.remove(temp_filename)

    print >>sys.stderr, "Loaded data and removed temp file. Adding into users table..."

    # Parameterized to avoid quoting problems in user names
    for user in newusers:
        cur.execute("INSERT INTO users VALUES (%s, %s)", (user, users[user]))
    
    print >>sys.stderr, "Added into users table, adding into servers table..."

    for server in newservers:
        cur.execute("INSERT INTO servers VALUES (%s, %s)", (server, servers[server]))

    db.commit()  # db: the module-level connection this cursor came from

    print >>sys.stderr, "Added into servers table. Defining time functions..."

    # This redefines the time fcns for every table reduced, but that's a small cost
    define_time_functions(cur)

    print >>sys.stderr, "Defined time functions. Reduction complete"
Example #5
import myutils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

filepath = "E:\\PyProjects\\kaggle\\titanic\\titanic\\"
train_data_ori = pd.read_csv(filepath + "train.csv")
test_data_ori = pd.read_csv(filepath + "test.csv")
myutils.clean(train_data_ori)
myutils.clean(test_data_ori)

# train_data = tfr.read_tfrecord('./train.tfrecord', 2, epochs=1, batch_size=891)
# # test_data = tfr.read_tfrecord('./test.tfrecord', 2, epochs=1, batch_size=418)
drop_elements = ['PassengerId', 'Name', 'Ticket', 'SibSp', "Parch"]
train_data = train_data_ori.drop(drop_elements, axis=1)
# train_data.iloc[:, 1:9] = myutils.input_normalization(train_data.iloc[:, 1:9])

test_data = test_data_ori.drop(drop_elements, axis=1)
# test_data = myutils.input_normalization(test_data)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_data.iloc[:, 1:9], train_data.iloc[:, 0])  # y as a Series avoids a column-vector warning
y_pred = decision_tree.predict(test_data)

acc_decision_tree = round(
    decision_tree.score(train_data.iloc[:, 1:9], train_data.iloc[:, 0]) * 100, 2)
print(acc_decision_tree)
y_pred = y_pred.flatten()              # already 1-D for a tree classifier
y_pred = np.round(y_pred).astype(int)  # already integer class labels; kept for safety
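The flattened integer predictions line up with Kaggle's submission format; a typical follow-up step (not part of the original snippet) would be:

submission = pd.DataFrame({
    "PassengerId": test_data_ori["PassengerId"],
    "Survived": y_pred,
})
submission.to_csv(filepath + "submission.csv", index=False)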
Example #6
import pandas as pd
import tensorflow as tf

import myutils


def read_tfrecord(filename, num_features, epochs=1, batch_size=32):
    # Reconstructed opening: the original snippet begins mid-function; the
    # signature is inferred from the read_tfrecord call in Example #5, and
    # the per-record parsing step (which would use num_features) is not
    # preserved in the source.
    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.shuffle(
        buffer_size=10000)  # better >= the number of data
    dataset = dataset.prefetch(buffer_size=batch_size)  # need to confirm
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.repeat(epochs)

    # iterator = dataset.make_one_shot_iterator()
    # data_next = iterator.get_next()

    return dataset


if __name__ == '__main__':
    filepath = "E:\\PyProjects\\kaggle\\titanic\\titanic\\"
    train_data = pd.read_csv(filepath + "train.csv")
    test_data = pd.read_csv(filepath + "test.csv")
    myutils.clean(train_data)
    myutils.clean(test_data)
    train_data = pd.DataFrame({
        "Sex": train_data.Sex,
        "Pclass": train_data.Pclass,
        "Survived": train_data.Survived
    })
    test_data = pd.DataFrame({
        "Sex": test_data.Sex,
        "Pclass": test_data.Pclass
    })

    print(train_data.shape)  # (891, 3)
    print(test_data.shape)  # (418, 2)

    tfrecord(train_data, train_data.shape[0], './train.tfrecord')
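The tfrecord writer invoked above is not included in the snippet. A minimal sketch matching that call, assuming myutils.clean has already made Sex numeric and that one tf.train.Example is written per row:

import tensorflow as tf

def tfrecord(dataframe, n_rows, out_path):
    # Hypothetical writer matching the call above: float features
    # (Sex, Pclass) and an int64 Survived label per record.
    with tf.io.TFRecordWriter(out_path) as writer:
        for _, row in dataframe.head(n_rows).iterrows():
            example = tf.train.Example(features=tf.train.Features(feature={
                'features': tf.train.Feature(float_list=tf.train.FloatList(
                    value=[float(row['Sex']), float(row['Pclass'])])),
                'label': tf.train.Feature(int64_list=tf.train.Int64List(
                    value=[int(row['Survived'])])),
            }))
            writer.write(example.SerializeToString())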