def prepare(self):
    """Export the train/prediction records to CSV and convert them to LibFM.

    Reads ``self.headers``, ``self.train_records`` and
    ``self.records_to_predict``. The selected fields are written to
    ``self.csv_train_file`` and ``self.csv_test_file``, and the LibFM
    converter is then run twice over both CSV files, once per output
    suffix (``.no_context.libfm`` and ``.context.libfm``).

    Progress messages are printed with a year/month/day timestamp.
    """
    # Timestamps are logged as year/month/day-hour:minute:second.
    time_format = "%Y/%m/%d-%H:%M:%S"
    print('prepare: %s' % time.strftime(time_format))

    contextual_train_set = ETLUtils.select_fields(
        self.headers, self.train_records)
    contextual_test_set = ETLUtils.select_fields(
        self.headers, self.records_to_predict)

    ETLUtils.save_csv_file(
        self.csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(
        self.csv_test_file, contextual_test_set, self.headers)
    # Only CSV files are written here; the message text is kept unchanged
    # so existing log consumers keep matching it.
    print('Exported CSV and JSON files: %s' % time.strftime(time_format))

    csv_files = [self.csv_train_file, self.csv_test_file]
    num_cols = len(self.headers)
    print('num_cols', num_cols)

    # NOTE(review): presumably 0 is the target column, [1, 2] the
    # categorical columns and the fourth argument the columns to drop --
    # confirm against libfm_converter.csv_to_libfm.
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, num_cols), ',', has_header=True,
        suffix='.no_context.libfm')
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ',', has_header=True,
        suffix='.context.libfm')

    print('Exported LibFM files: %s' % time.strftime(time_format))
def run(self, dataset, output_folder, train_records, test_records,
        train_reviews=None, test_reviews=None):
    """Run the full cycle and export its output as CSV and LibFM files.

    :param dataset: name fragment inserted into the output file names
    :param output_folder: prefix prepended to the output file names (it is
        concatenated as-is, so it must already end with a path separator)
    :param train_records: passed through to ``self.full_cycle``
    :param test_records: passed through to ``self.full_cycle``
    :param train_reviews: passed through to ``self.full_cycle``
    :param test_reviews: passed through to ``self.full_cycle``

    The two sets returned by ``self.full_cycle`` are saved as
    ``yelp_<dataset>_context_shuffled_train5.csv`` and
    ``yelp_<dataset>_context_shuffled_test5.csv`` and then converted to
    LibFM twice, once per output suffix.
    """
    # Timestamps are logged as year/month/day-hour:minute:second.
    time_format = "%Y/%m/%d-%H:%M:%S"

    contextual_train_set, contextual_test_set = self.full_cycle(
        train_records, test_records, train_reviews, test_reviews
    )
    print("Prepared data: %s" % time.strftime(time_format))

    file_prefix = output_folder + "yelp_" + dataset
    csv_train_file = file_prefix + "_context_shuffled_train5.csv"
    csv_test_file = file_prefix + "_context_shuffled_test5.csv"

    ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)
    # Only CSV files are written here; the message text is kept unchanged
    # so existing log consumers keep matching it.
    print("Exported CSV and JSON files: %s" % time.strftime(time_format))

    csv_files = [csv_train_file, csv_test_file]
    num_cols = len(self.headers)
    print("num_cols", num_cols)

    # NOTE(review): presumably 0 is the target column, [1, 2] the
    # categorical columns and the fourth argument the columns to drop --
    # confirm against libfm_converter.csv_to_libfm.
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, num_cols), ",", has_header=True,
        suffix=".no_context.libfm"
    )
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ",", has_header=True,
        suffix=".context.libfm"
    )

    print("Exported LibFM files: %s" % time.strftime(time_format))
def test_csv_to_libfm(self):
    """Convert the sample CSV and compare the result with the stored file.

    The generated file is removed before the conversion (in case an earlier
    run left it behind) and again after the comparison succeeds.
    """
    source_path = folder + 'yelp.csv_train_0'
    reference_path = source_path + ".libfm"
    result_path = reference_path + "_test"

    def discard_result():
        # Delete the generated file when it exists
        if os.path.isfile(result_path):
            os.remove(result_path)

    discard_result()

    columns_to_delete = []
    csv_to_libfm(
        source_path, result_path, 2,
        delete_columns=columns_to_delete, delimiter=',', has_header=True)

    files_match = filecmp.cmp(result_path, reference_path)
    self.assertTrue(files_match)

    discard_result()
def prepare(self):
    """Build the headers, flatten context words and export CSV/LibFM files.

    ``self.headers`` is rebuilt from the number of sense groups. When
    ``Constants.USE_CONTEXT`` is ``True`` every train and prediction record
    is updated in place with the mapping stored under
    ``Constants.CONTEXT_WORDS_FIELD``. Both record lists are then reduced to
    the header fields, saved to ``self.csv_train_file`` and
    ``self.csv_test_file``, and converted with the ``.libfm`` suffix.
    """
    stamp_format = "%Y/%m/%d-%H:%M:%S"
    print('prepare: %s' % time.strftime(stamp_format))

    self.headers = build_headers(len(self.sense_groups))

    if Constants.USE_CONTEXT is True:
        # Promote the context-word entries to top-level record keys
        for entry in self.train_records:
            entry.update(entry[Constants.CONTEXT_WORDS_FIELD])
        for entry in self.records_to_predict:
            entry.update(entry[Constants.CONTEXT_WORDS_FIELD])

        # NOTE(review): the nesting was reconstructed from a
        # whitespace-collapsed source; this filter is assumed to apply
        # only when context is enabled -- confirm against the original.
        if Constants.FM_REVIEW_TYPE:
            wanted_classes = [Constants.FM_REVIEW_TYPE]
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                wanted_classes)

    ETLUtils.keep_fields(self.headers, self.train_records)
    ETLUtils.keep_fields(self.headers, self.records_to_predict)

    ETLUtils.save_csv_file(
        self.csv_train_file, self.train_records, self.headers)
    ETLUtils.save_csv_file(
        self.csv_test_file, self.records_to_predict, self.headers)
    print('Exported CSV and JSON files: %s' % time.strftime(stamp_format))

    exported_files = [self.csv_train_file, self.csv_test_file]
    print('num_cols', len(self.headers))

    libfm_converter.csv_to_libfm(
        exported_files, 0, [1, 2], [], ',', has_header=True,
        suffix='.libfm')
    print('Exported LibFM files: %s' % time.strftime(stamp_format))
def test_csv_to_libfm(self):
    """Check that csv_to_libfm reproduces the stored ``.libfm`` file.

    Any stale output file is removed first; the freshly generated file is
    removed again once the comparison has passed.
    """
    csv_path = folder + 'yelp.csv_train_0'
    expected_path = csv_path + ".libfm"
    generated_path = expected_path + "_test"

    # Start from a clean slate
    if os.path.isfile(generated_path):
        os.remove(generated_path)

    conversion_options = {
        'delete_columns': [],
        'delimiter': ',',
        'has_header': True,
    }
    csv_to_libfm(csv_path, generated_path, 2, **conversion_options)

    self.assertTrue(filecmp.cmp(generated_path, expected_path))

    # Clean up the generated file
    if os.path.isfile(generated_path):
        os.remove(generated_path)
def main_converter():
    """Export the train and prediction JSON records as CSV and LibFM files.

    Loads ``TRAIN_RECORDS_FILE`` and ``RECORDS_TO_PREDICT_FILE``, keeps only
    the ``stars``, ``user_id`` and ``business_id`` fields, saves both sets
    as CSV under ``GENERATED_FOLDER`` and runs ``csv_to_libfm`` on the pair.
    """
    headers = ['stars', 'user_id', 'business_id']

    train_csv_path = (
        GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET +
        's_shuffled_train.csv'
    )
    test_csv_path = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # Load both inputs before projecting them onto the header fields
    raw_train = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    raw_test = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    selected_train = ETLUtils.select_fields(headers, raw_train)
    selected_test = ETLUtils.select_fields(headers, raw_test)

    ETLUtils.save_csv_file(train_csv_path, selected_train, headers)
    ETLUtils.save_csv_file(test_csv_path, selected_test, headers)

    csv_to_libfm(
        [train_csv_path, test_csv_path], 0, [1, 2], [], ',',
        has_header=True)
def prepare_records_for_libfm(self):
    """Write the train/prediction records as CSV and convert them to LibFM.

    ``self.headers`` is rebuilt from ``self.context_rich_topics``. When
    ``Constants.USE_CONTEXT`` is ``True`` and the review type is SPECIFIC or
    GENERIC, the train records are first filtered by predicted class. Each
    CSV row holds the ``basic_headers`` values followed, when context is
    enabled, by one topic value per context-rich topic.

    Side effects: ``self.train_records``, ``self.context_topics_map`` and
    ``self.context_rich_topics`` are set to ``None`` once they have been
    written out, so they must be populated on entry.
    """
    stamp_format = "%Y/%m/%d-%H:%M:%S"
    print('prepare_records_for_libfm: %s' % time.strftime(stamp_format))

    self.headers = build_headers(self.context_rich_topics)

    filtered_types = (
        Constants.REVIEW_TYPE == Constants.SPECIFIC or
        Constants.REVIEW_TYPE == Constants.GENERIC
    ) if Constants.USE_CONTEXT is True else False
    if filtered_types:
        self.train_records = ETLUtils.filter_records(
            self.train_records, Constants.PREDICTED_CLASS_FIELD,
            [Constants.REVIEW_TYPE])

    with open(self.csv_train_file, 'w') as train_file:
        train_writer = csv.writer(train_file)
        train_writer.writerow(self.headers)  # header row

        for record in self.train_records:
            values = [record[name] for name in basic_headers]
            if Constants.USE_CONTEXT is True:
                # Train records carry their own topic values
                values.extend(
                    record[Constants.CONTEXT_TOPICS_FIELD][
                        'topic' + str(topic[0])]
                    for topic in self.context_rich_topics
                )
            train_writer.writerow(values)

    # Release the train records before building the second file
    self.train_records = None
    gc.collect()

    with open(self.csv_test_file, 'w') as test_file:
        test_writer = csv.writer(test_file)
        test_writer.writerow(self.headers)  # header row

        for record in self.records_to_predict:
            values = [record[name] for name in basic_headers]
            if Constants.USE_CONTEXT is True:
                # Prediction records look their topics up by review id
                values.extend(
                    self.context_topics_map[
                        record[Constants.REVIEW_ID_FIELD]][
                        'topic' + str(topic[0])]
                    for topic in self.context_rich_topics
                )
            test_writer.writerow(values)

    self.context_topics_map = None
    self.context_rich_topics = None
    gc.collect()

    print('Exported CSV and JSON files: %s' % time.strftime(stamp_format))

    exported_files = [self.csv_train_file, self.csv_test_file]
    print('num_cols', len(self.headers))

    libfm_converter.csv_to_libfm(
        exported_files, 0, [1, 2], [], ',', has_header=True,
        suffix='.libfm')
    print('Exported LibFM files: %s' % time.strftime(stamp_format))
def prepare_records_for_libfm(self):
    """Dump the train/prediction records to CSV and convert them to LibFM.

    ``self.headers`` is rebuilt from ``self.context_rich_topics``. If the
    review type is SPECIFIC or GENERIC the train records are filtered by
    predicted class, whether or not context is enabled. Rows contain the
    ``basic_headers`` values and, when ``Constants.USE_CONTEXT`` is
    ``True``, one topic value per context-rich topic.

    Side effects: ``self.train_records``, ``self.context_topics_map`` and
    ``self.context_rich_topics`` are reset to ``None`` after being written.
    """
    time_format = "%Y/%m/%d-%H:%M:%S"

    def topic_key(topic):
        # Topic values are stored under 'topic<id>'; the id is the first
        # element of each context-rich topic entry
        return 'topic' + str(topic[0])

    def base_row(record):
        # Values of the basic (non-context) columns, in header order
        row = []
        for header in basic_headers:
            row.append(record[header])
        return row

    print('prepare_records_for_libfm: %s' % time.strftime(time_format))

    self.headers = build_headers(self.context_rich_topics)

    if Constants.REVIEW_TYPE == Constants.SPECIFIC or \
            Constants.REVIEW_TYPE == Constants.GENERIC:
        class_filter = [Constants.REVIEW_TYPE]
        self.train_records = ETLUtils.filter_records(
            self.train_records, Constants.PREDICTED_CLASS_FIELD,
            class_filter)

    with open(self.csv_train_file, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(self.headers)

        for record in self.train_records:
            row = base_row(record)
            if Constants.USE_CONTEXT is True:
                for topic in self.context_rich_topics:
                    # Topic values live on the train record itself
                    record_topics = record[Constants.CONTEXT_TOPICS_FIELD]
                    row.append(record_topics[topic_key(topic)])
            writer.writerow(row)

    # The train records are no longer needed; free them early
    self.train_records = None
    gc.collect()

    with open(self.csv_test_file, 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(self.headers)

        for record in self.records_to_predict:
            row = base_row(record)
            if Constants.USE_CONTEXT is True:
                for topic in self.context_rich_topics:
                    # Topic values are looked up by the record's review id
                    review_id = record[Constants.REVIEW_ID_FIELD]
                    mapped_topics = self.context_topics_map[review_id]
                    row.append(mapped_topics[topic_key(topic)])
            writer.writerow(row)

    self.context_topics_map = None
    self.context_rich_topics = None
    gc.collect()

    print('Exported CSV and JSON files: %s' % time.strftime(time_format))

    csv_files = [self.csv_train_file, self.csv_test_file]
    print('num_cols', len(self.headers))

    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ',', has_header=True, suffix='.libfm')
    print('Exported LibFM files: %s' % time.strftime(time_format))