def load_json(p, lower):
    """Parse a tokenized-story JSON file into (source, target) token lists.

    Sentences seen before the first '@highlight' marker form the source
    document; each '@highlight' starts a new target (summary) sentence
    whose tokens are accumulated until the next marker.

    Args:
        p: path to a JSON file with a 'sentences' list, each sentence
           holding a 'tokens' list of {'word': ...} dicts (CoreNLP-style).
        lower: if truthy, lowercase every token.

    Returns:
        (source, tgt): two lists of token lists, each joined, passed
        through the module-level clean() helper, and re-split.
    """
    source = []
    tgt = []
    flag = False  # becomes True once the first '@highlight' is seen
    # Fix: open the file in a context manager so the handle is closed;
    # the original `json.load(open(p))` leaked it.
    with open(p) as f:
        data = json.load(f)
    for sent in data['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            tgt.append([])
            continue
        if flag:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
def process(self, row):
    """Return a 1-tuple with the cleaned 'argument' field of *row*.

    The trailing comma is deliberate: callers expect a tuple of output
    columns. Missing/None 'argument' is treated as the empty string.
    """
    # Fix: PEP 8 keyword-argument spacing (no spaces around '='), making
    # this consistent with the sibling process() implementation.
    return clean(row.get('argument') or '', reserved_words=self.reserved_words),
def process(self, row):
    """Return a 1-tuple containing the cleaned 'argument' field of *row*.

    A missing or falsy 'argument' value is replaced by the empty string
    before cleaning.
    """
    argument = row.get('argument') or ''
    cleaned = clean(argument, reserved_words=self.reserved_words)
    return (cleaned,)
def reduce_log(tablename, cur):
    """Reduce one general_log table into a same-named table in reduced_log.

    Reads 'Execute'/'Query' rows from general_log.<tablename>, cleans and
    normalizes each query (numlist collapse, constant replacement, user/server
    reduction via the module-level `reducer`), writes the result to a temp
    TSV file, then bulk-loads it into reduced_log.<tablename>. New users and
    servers discovered along the way are appended to the users/servers tables.

    Args:
        tablename: name of the log table to reduce (interpolated into SQL as
            an identifier — must be trusted).
        cur: open MySQL cursor.

    Side effects: creates reduced_log.<tablename>, writes/removes a temp
    file, inserts into users/servers, commits via the module-level `db`,
    and redefines time functions. Progress is logged to stderr.
    """
    print >>sys.stderr, "Reducing general_log.{0} and storing into reduced_log".format(tablename)
    print >>sys.stderr, "Selecting results..."
    cur.execute("USE reduced_log")
    print_and_execute("SELECT user, userid FROM users", cur)
    users = dict(cur.fetchall())
    usernum = max(users.values()) + 1 if users.values() else 0  # first open usernum
    newusers = []
    print_and_execute("SELECT server, serverid FROM servers", cur)
    servers = dict(cur.fetchall())
    servernum = max(servers.values()) + 1 if servers.values() else 0  # first open servernum
    newservers = []
    cur.execute('USE general_log')
    print_and_execute("""SELECT * FROM {0}
                         WHERE command_type IN ('Execute', 'Query')""".format(tablename), cur)
    print >>sys.stderr, "Selected results, cleaning queries and writing temp file..."
    temp_filename = '{0}_reduced.tmp'.format(tablename)
    # Fix: context manager guarantees the temp file is closed before LOAD DATA
    # even if cleaning raises (original used a bare open()/close()).
    with open(temp_filename, 'w') as outfile:
        for event_time, user_host, thread_id, server_id, command_type, query in cur:
            cleaned_query = clean(query, reserved_words)
            # Clean the query some more: remove numlists, replace constants
            cleaned_query = numlist_re.sub(numlist_sub_fcn, cleaned_query)
            try:
                user, server, cleaned_query = reducer.accept(user_host, cleaned_query)
            except TypeError:
                # reducer.accept returned None for rows it rejects; skip them.
                continue
            cleaned_query, vals = repl_constants(cleaned_query)
            vals = ' ~ '.join(vals)
            if user not in users:
                users[user] = usernum
                newusers.append(user)
                usernum += 1
            if server not in servers:
                servers[server] = servernum
                newservers.append(server)
                servernum += 1
            if cleaned_query.startswith('INSERT INTO'):
                query_type = 'INSERT'
                if values_re.search(cleaned_query):
                    #TODO: count number of rows inserted. Nontrivial because of parens, commas, quotes, etc.
                    cleaned_query = insert_re.match(cleaned_query).group(0) + ' <values>'
                    vals = ''
            elif cleaned_query.startswith('SELECT'):
                query_type = 'SELECT'
            elif cleaned_query.startswith('CREATE TABLE'):
                # Replacing schemas with length + hash doesn't help much.
                # There aren't many create table statements (~1%)
                # cleaned_query = re.sub(r'\(.*\)',
                #     lambda x: '<schema len={0}, hash={1}>'.format(x.group().count(',')+1, x.group().__hash__()),
                #     cleaned_query)
                query_type = 'CREATE_TABLE'
            elif cleaned_query.startswith('SET'):
                query_type = 'SET'
            elif cleaned_query.startswith('LOAD DATA'):
                query_type = 'LOAD'
            elif cleaned_query.startswith('ALTER'):
                query_type = 'ALTER'
            else:
                query_type = 'OTHER'
            #we ignore server_id because it's always 0...
            cleaned_query = repr(cleaned_query)[1:-1]  #deal with \n and others
            final = event_time, users[user], servers[server], thread_id, query_type, cleaned_query, vals
            print >>outfile, '\t'.join(str(s) for s in final)
    print >>sys.stderr, "Wrote temp file, loading data into {0} table...".format(tablename)
    cur.execute("USE reduced_log")
    cur.execute("""CREATE TABLE {0} (event_time DATETIME,
                                     userid INT,
                                     serverid INT,
                                     thread_id INT(11),
                                     query_type ENUM{1},
                                     query MEDIUMTEXT,
                                     vals MEDIUMTEXT,
                                     INDEX (userid),
                                     INDEX (serverid),
                                     INDEX (event_time)
                   )""".format(tablename, querytypes))
    cur.execute("LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}".format(temp_filename, tablename))
    os.remove(temp_filename)
    print >>sys.stderr, "Loaded data and removed temp file. Adding into users table..."
    # Fix: parameterized inserts — the original interpolated user/server
    # strings directly into SQL, which breaks (or injects) on names
    # containing quotes. Identifiers above still use .format, which the
    # driver cannot parameterize.
    for user in newusers:
        cur.execute("INSERT INTO users VALUES (%s, %s)", (user, users[user]))
    print >>sys.stderr, "Added into users table, adding into servers table..."
    for server in newservers:
        cur.execute("INSERT INTO servers VALUES (%s, %s)", (server, servers[server]))
    db.commit()
    print >>sys.stderr, "Added into servers table. Defining time functions..."
    # This redefines the time fcns for every table reduced, but that's a small cost
    define_time_functions(cur)
    print >>sys.stderr, "Defined time functions. Reduction complete"
# Titanic (Kaggle): train a DecisionTreeClassifier on a reduced feature set
# and predict on the test split. Data is read from a hard-coded local path.
import myutils
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

filepath = "E:\\PyProjects\\kaggle\\titanic\\titanic\\"
train_data_ori = pd.read_csv(filepath + "train.csv", delimiter=',')
test_data_ori = pd.read_csv(filepath + "test.csv")
# NOTE(review): myutils.clean appears to mutate the frames in place
# (return value unused) — presumably filling/encoding features; confirm.
myutils.clean(train_data_ori)
myutils.clean(test_data_ori)
# train_data = tfr.read_tfrecord('./train.tfrecord', 2, epochs=1, batch_size=891)
# # test_data = tfr.read_tfrecord('./test.tfrecord', 2, epochs=1, batch_size=418)
# Drop identifier/unused columns; the remaining frame is assumed to start
# with the label column ('Survived') followed by the feature columns —
# the iloc slices below depend on that ordering. TODO confirm.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'SibSp', "Parch"]
train_data = train_data_ori.drop(drop_elements, axis=1)
# train_data.iloc[:, 1:9] = myutils.input_normalization(train_data.iloc[:, 1:9])
test_data = test_data_ori.drop(drop_elements, axis=1)
# test_data = myutils.input_normalization(test_data)
decision_tree = DecisionTreeClassifier()
# Features are columns 1:9, target is column 0 (as a single-column frame).
decision_tree.fit(train_data.iloc[:, 1:9], train_data.iloc[:, 0:1])
y_pred = decision_tree.predict(test_data)
# Training-set accuracy (not a held-out score), rounded to 2 decimals.
acc_decision_tree = round(
    decision_tree.score(train_data.iloc[:, 1:9], train_data.iloc[:, 0:1]) * 100, 2)
print(acc_decision_tree)
# Flatten and force integer class labels for submission output.
y_pred = y_pred.flatten()
y_pred = np.round(y_pred).astype(int)
        buffer_size=10000)  # better >= the number of data
    dataset = dataset.prefetch(buffer_size=batch_size)  # need to confirm
    # Fixed-size batches; incomplete final batch is dropped.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.repeat(epochs)
    # iterator = dataset.make_one_shot_iterator()
    # data_next = iterator.get_next()
    return dataset


if __name__ == '__main__':
    # Load the Titanic CSVs from a hard-coded local path and serialize the
    # training split to a TFRecord file via the tfrecord() helper above.
    filepath = "E:\\PyProjects\\kaggle\\titanic\\titanic\\"
    train_data = pd.read_csv(filepath + "train.csv")
    test_data = pd.read_csv(filepath + "test.csv")
    # NOTE(review): myutils.clean appears to mutate the frames in place
    # (return value unused) — confirm against its definition.
    myutils.clean(train_data)
    myutils.clean(test_data)
    # Keep only the columns used downstream (label present only for train).
    train_data = pd.DataFrame({
        "Sex": train_data.Sex,
        "Pclass": train_data.Pclass,
        "Survived": train_data.Survived
    })
    test_data = pd.DataFrame({
        "Sex": test_data.Sex,
        "Pclass": test_data.Pclass
    })
    print(train_data.shape)  # (891, 3)
    print(test_data.shape)  # (418, 2)
    tfrecord(train_data, train_data.shape[0], './train.tfrecord')