# NOTE(review): the line below is a script fragment whose line breaks were
# lost; it cannot be parsed as Python until they are restored.  Its logical
# statements, in order:
#   * Imports numpy as np.  os and sys are used as well, but their imports
#     are not visible in this fragment -- TODO confirm they appear earlier.
#   * home_dir is the directory three levels above this file, and
#     home_dir + "/lib" is placed first on sys.path so that session_parser
#     can be imported as sp.
#   * session_generator = sp.parse_from_file(train_path), with train_path set
#     to home_dir + '/data/train'.  The commented-out train_path assignments
#     point at smaller files kept for testing.
#   * limit = 3: skips greater than or equal to this value are aggregated.
#   * counts and lengths are each a list of 10 integer zero arrays of length
#     limit + 1 (the sums per position per number of skips, and the
#     corresponding lengths).
#   * The `while True: try:` loop prints a progress message whenever
#     session_count is a multiple of 10**6.  The remainder of the loop body
#     and the matching except clause are cut off in this fragment.
# The print statement uses Python 2 syntax.
import numpy as np this_file_path = os.path.realpath(__file__) # this file's path home_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file_path))) sys.path.insert(0, home_dir + "/lib") # for importing functions import session_parser as sp # For testing #train_path = home_dir + '/data/train_head_10k' #train_path = home_dir + '/data/train_head_million' #train_path = home_dir + '/data/train_sample_10k' # For real train_path = home_dir + '/data/train' session_generator = sp.parse_from_file(train_path) session_count = 0 # Skips greater than or equal to this value will be aggregated limit = 3 # 2D array stores the sums for each position for each 'number of skips' counts = [np.zeros(limit + 1, dtype=int) for i in range(10)] # 2D array stores the corresponding lengths lengths = [np.zeros(limit + 1, dtype=int) for i in range(10)] while True: try: # Print at every millionth session if session_count % (10**6) == 0: print "...reading the {0}th session".format(session_count)
# NOTE(review): the line below is a script fragment whose line breaks were
# lost; it cannot be parsed as Python until they are restored.  Its logical
# statements, in order:
#   * The tail of a merge loop: for every key in `dict`, dict[key] is added to
#     findict[key] when the key is already present, otherwise the entry is
#     created.  The enclosing definition is not visible here.  The variable
#     name `dict` shadows the builtin of the same name.
#   * home_dir is the directory two levels above this file, and
#     home_dir + "/script" is placed first on sys.path so that session_parser
#     can be imported as sp.  os and sys are used, but their imports are not
#     visible in this fragment -- TODO confirm they appear earlier.
#   * session_generator = sp.parse_from_file(train_path), with train_path set
#     to home_dir + '/data/train_sample'.
#   * The `while True: try:` loop prints a progress message whenever
#     session_count is a multiple of 10**6, fetches the next session with
#     session_generator.next(), passes session.queries to queryParse (defined
#     outside this fragment) and increments session_count.  The except
#     clause, presumably handling StopIteration, is cut off in this fragment.
# The print statement and the .next() iterator method are Python 2 idioms.
for key in dict: if key in findict: findict[key] += dict[key] else: findict[key] = dict[key] this_file_path = os.path.realpath(__file__) # this file's path home_dir = os.path.dirname(os.path.dirname(this_file_path)) sys.path.insert(0, home_dir + "/script") # for importing functions import session_parser as sp train_path = home_dir + '/data/train_sample' session_generator = sp.parse_from_file(train_path) session_count = 0 while True: try: # Print at every millionth session if session_count % (10 ** 6) == 0: print "...reading the {0}th session".format(session_count) # next() raises the StopIteration exception when hitting the end session = session_generator.next() queryParse(session.queries) session_count += 1
# Processing the complete test data takes about 122 seconds.
import os
import sys

import pandas as pd

# The project root sits three directory levels above this script; its lib/
# folder goes first on the module search path so session_parser resolves.
this_file_path = os.path.realpath(__file__)
home_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(this_file_path)))
sys.path.insert(0, home_dir + "/lib")
import session_parser as sp

# For quick experiments, swap in the small header file instead:
# test_path = home_dir + '/data/test_head'
# print "WARNING, this script is using a header file, not the real file."
test_path = home_dir + '/data/test'  # the real test set
session_generator = sp.parse_from_file(test_path)


def _load_means(csv_path, column):
    """Read a results CSV and return its attribute named `column`.

    The first three lines of the file are comments and are skipped.
    """
    frame = pd.read_csv(
        csv_path,
        sep=",",
        skipinitialspace=True,
        header='infer',
        skiprows=3,
    )
    return getattr(frame, column)


# Stored results for the skipped and the global variants.
skipped_means = _load_means(
    home_dir + '/data/results/skipped_means.csv', 'skipped_means')
global_means = _load_means(
    home_dir + '/data/results/global_means.csv', 'global_means')

# Output file for the predictions: strategy 2, goal 1, produced by the
# version of the algorithm that still had a bug.  The bug has since been
# fixed, so this run should be repeated with the corrected algorithm.
results = open(home_dir + '/data/prediction/s2_goal1_with_bug', 'w')
results.write("SessionID,URLID\n")

# Session counter, starting from zero.
session_count = 0
# A full pass over the test data completes in roughly 122 seconds.
import os
import sys

import pandas as pd

# Resolve the project root (three directories up from this file) and make
# its lib/ directory importable before pulling in the session parser.
this_file_path = os.path.realpath(__file__)
home_dir = os.path.dirname(
    os.path.dirname(
        os.path.dirname(this_file_path)
    )
)
sys.path.insert(0, home_dir + "/lib")
import session_parser as sp

# A small header file is available for testing:
# test_path = home_dir + '/data/test_head'
# print "WARNING, this script is using a header file, not the real file."
test_path = home_dir + '/data/test'  # real input
session_generator = sp.parse_from_file(test_path)

# Both result files share one layout; their first three lines are comments,
# hence skiprows=3.
_CSV_OPTIONS = dict(
    sep=",",
    skipinitialspace=True,
    header='infer',
    skiprows=3,
)
skipped_means = pd.read_csv(
    home_dir + '/data/results/skipped_means.csv', **_CSV_OPTIONS
).skipped_means
global_means = pd.read_csv(
    home_dir + '/data/results/global_means.csv', **_CSV_OPTIONS
).global_means

# Prediction output for strategy 2 / goal 1, generated by the buggy version
# of the algorithm.  That bug is fixed now, so this should be re-run with
# the fixed algorithm.
results = open(home_dir + '/data/prediction/s2_goal1_with_bug', 'w')
results.write("SessionID,URLID\n")

# Session counter, starting from zero.
session_count = 0