def show_training_set_histogram(path, config):
    """Plot a histogram of one training-set column.

    The column to plot is taken from config['histogram']['column']; the
    first line of the file at *path* is treated as the header.
    """
    column_name = config['histogram']['column']
    with open(path) as fp:
        # First line carries the column mapping; the rest are data rows.
        header = trainingset.normalize_header(next(fp))
        hist_values = [
            float(trainingset.TrainingSetRow(
                trainingset.normalize_line(raw), header).get(column_name))
            for raw in fp
        ]
    plot_hist(hist_values, config, column_name)
def show_scatter_2d_plot(path, config):
    """Show a 2-D scatter plot of two training-set columns.

    Axis columns come from config['scatter']['x_axis'] and
    config['scatter']['y_axis']; the first file line is the header.
    """
    x_key = config['scatter']['x_axis']
    y_key = config['scatter']['y_axis']
    with open(path) as fp:
        header = trainingset.normalize_header(next(fp))
        # Parse each data row once, keeping (x, y) pairs together.
        points = [
            (float(row.get(x_key)), float(row.get(y_key)))
            for row in (
                trainingset.TrainingSetRow(trainingset.normalize_line(raw), header)
                for raw in fp
            )
        ]
    x_data = [x for x, _ in points]
    y_data = [y for _, y in points]
    plt.plot(x_data, y_data, '.', label='observation')
    plt.xlabel(x_key)
    plt.ylabel(y_key)
    plt.legend()
    plt.show()
def show_scatter_3d_plot(path, config):
    """Show a 3-D scatter plot of three training-set columns.

    Axis columns come from config['scatter'] keys 'x_axis', 'y_axis' and
    'z_axis'; the first line of the file at *path* is the header.
    """
    with open(path) as fp:
        header = trainingset.normalize_header(next(fp))
        x_key = config['scatter']['x_axis']
        y_key = config['scatter']['y_axis']
        z_key = config['scatter']['z_axis']
        x_data = []
        y_data = []
        z_data = []
        for line in fp:
            row = trainingset.TrainingSetRow(
                trainingset.normalize_line(line), header)
            x_data.append(float(row.get(x_key)))
            y_data.append(float(row.get(y_key)))
            z_data.append(float(row.get(z_key)))
    fig = plt.figure()
    # FIX: Figure.gca(projection='3d') was deprecated in Matplotlib 3.4 and
    # removed in 3.6; add_subplot(projection='3d') is the supported way to
    # create a 3-D axes.
    ax = fig.add_subplot(projection='3d')
    ax.scatter(x_data, y_data, z_data, label='observations')
    ax.set_xlabel(x_key)
    ax.set_ylabel(y_key)
    ax.set_zlabel(z_key)
    ax.legend()
    plt.show()
def setUp(self):
    """Build a TrainingSetRow fixture from a canned header/line pair."""
    super().setUp()
    sample_header = '#0->on_sale 1->bm25_score'
    sample_line = '2 qid:0 0:0 1:4.1 #red jeans->1120'
    self.test_row = trainingset.TrainingSetRow(sample_line, sample_header)
if __name__ == "__main__":
    # Join training-set rows with click-stream data and index them into Solr
    # in batches of config['indexing']['batch_size'].
    with open('config.json') as json_data_file:
        config = json.load(json_data_file)
    url = get_solr_url(config)
    batch_size = config['indexing']['batch_size']
    click_stream_file_path = config['file']['click_stream']
    training_set_file_path = config['file']['training_set']
    with open(training_set_file_path) as training_set_file:
        click_data_dict = read_click_data_dict(click_stream_file_path)
        header = trainingset.normalize_header(next(training_set_file))
        docs_batch = []
        indexed = 0
        batch_num = 0
        for line in training_set_file:
            row = trainingset.TrainingSetRow(
                trainingset.normalize_line(line), header)
            doc_for_index = row.get_as_dict()
            qd_key = row.get_qd_pair()
            # Only index rows that can be joined with click data; skip
            # (but report) the rest.
            if qd_key in click_data_dict:
                doc_for_index.update(click_data_dict[qd_key])
            else:
                print('Not joined by key for={}'.format(qd_key))
                continue
            docs_batch.append(doc_for_index)
            indexed += 1
            if len(docs_batch) >= batch_size:
                index_docs(docs_batch, url)
                docs_batch.clear()
                batch_num += 1
                print('Batch #{}. Total indexed {} docs.'.format(
                    batch_num, indexed))
        # FIX: flush the trailing partial batch — previously any leftover
        # docs (fewer than batch_size after the last full batch) were
        # silently dropped.
        if docs_batch:
            index_docs(docs_batch, url)
            batch_num += 1
            print('Batch #{}. Total indexed {} docs.'.format(
                batch_num, indexed))