def process_data(self, input_dir):
    """Load the "ole" result for every query/location pair that has a file.

    Walks ``input_dir`` recursively and, for each combination of
    ``self.queries`` and ``self.locations``, asks ``get_filename`` for the
    expected file name (whitespace is stripped from both the query and the
    location first). When a file with that name exists anywhere under
    ``input_dir``, the pair is appended to ``self.queries_data`` as
    ``{"q": query, "l": location}`` and the value returned by
    ``get_result_from_csv_file`` is appended to ``self.results["ole"]`` at
    the same position.

    Both ``self.queries_data`` and ``self.results`` are reset on every call.

    Args:
        input_dir: Root directory searched (recursively) for result files.
    """
    self.queries_data = []
    self.results = {"ole": []}

    # Index files by bare name for O(1) lookup. setdefault keeps the first
    # path seen for a name, so duplicates in different sub-directories
    # resolve to the earliest one in os.walk order.
    path_by_name = {}
    for root, _, file_names in os.walk(input_dir):
        for file_name in file_names:
            path_by_name.setdefault(file_name, os.path.join(root, file_name))

    for query in self.queries:
        topic_name = "".join(query.split())
        for location in self.locations:
            location_name = "".join(location.split())
            expected_name = get_filename("ole", topic_name, location_name)
            path = path_by_name.get(expected_name)
            if path is None:
                # No result file for this query/location pair.
                continue
            self.queries_data.append({"q": query, "l": location})
            self.results["ole"].append(get_result_from_csv_file(path))
def format_data(self, input_dir, train_percent=70):
    """Collect "ole" results, split them into train/test sets and write both.

    Walks ``input_dir`` recursively and, for each combination of
    ``self.queries`` and ``self.locations``, asks ``get_filename`` for the
    expected file name (whitespace is stripped from both parts first). Every
    pair whose file exists is appended to ``self.queries_data`` and the value
    returned by ``get_fields_from_csv_file`` is appended to
    ``self.results["ole"]`` at the same position.

    Every query's whitespace-stripped name is registered in
    ``Rank.topic_hash`` with the next value of ``Rank.topic_ctr``, whether
    or not any file exists for it.

    The collected entries are then shuffled and split: the first
    ``train_percent`` percent (rounded down) become ``self.train_data`` and
    the rest ``self.test_data``. Each entry is built with
    ``Rank.get_data_obj(query, result, index)``, where ``index`` is the
    entry's position in ``self.queries_data``. The two sets are passed to
    ``write_to_file`` under the names ``"train.dat"`` and ``"test.dat"``.

    Args:
        input_dir: Root directory searched (recursively) for result files.
        train_percent: Percentage (0-100) of entries placed in the training
            set. Defaults to 70.
    """
    self.queries_data = []
    self.results = {"ole": []}

    # Index files by bare name for O(1) lookup. setdefault keeps the first
    # path seen for a name, so duplicates in different sub-directories
    # resolve to the earliest one in os.walk order.
    path_by_name = {}
    for root, _, file_names in os.walk(input_dir):
        for file_name in file_names:
            path_by_name.setdefault(file_name, os.path.join(root, file_name))

    for query in self.queries:
        topic_name = "".join(query.split())
        if topic_name not in Rank.topic_hash:
            Rank.topic_hash[topic_name] = Rank.topic_ctr
            Rank.topic_ctr += 1
        for location in self.locations:
            location_name = "".join(location.split())
            # TODO: include epsilon data also later.
            expected_name = get_filename("ole", topic_name, location_name)
            path = path_by_name.get(expected_name)
            if path is None:
                # No result file for this query/location pair.
                continue
            self.queries_data.append({"q": query, "l": location})
            self.results["ole"].append(get_fields_from_csv_file(path))

    self.data = []

    # shuffle() needs a mutable sequence; a bare range() is immutable on
    # Python 3, so materialise it as a list first.
    indexes = list(range(len(self.queries_data)))
    shuffle(indexes)

    # Floor division keeps the split point an integer on Python 2 and 3;
    # int() also covers a float train_percent.
    train_count = int(len(indexes) * train_percent // 100)
    train_indexes = indexes[:train_count]
    test_indexes = indexes[train_count:]

    ole_results = self.results["ole"]
    self.train_data = [
        Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
        for i in train_indexes
    ]
    self.test_data = [
        Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
        for i in test_indexes
    ]

    write_to_file("train.dat", self.train_data)
    write_to_file("test.dat", self.test_data)