def make_train_csv_dict(train_csv):
    """Index training records by their author id.

    Args:
        train_csv: iterable of records, each indexable by 'author_id'.

    Returns:
        dict mapping each record's 'author_id' value to the record itself.
        When several records share an id, the last one wins.
    """
    train_dict = dict()
    for author in train_csv:
        train_dict[author['author_id']] = author
    return train_dict


def make_clean_paper_author(confirmed_pairs, paper_author_pairs, output_file_path):
    """Write the selected paper/author records to a CSV file.

    The file starts with the header ``PaperId,AuthorId,Name,Affiliation``
    followed by one row per entry of ``confirmed_pairs``, in that order.

    Args:
        confirmed_pairs: iterable of indices/keys into ``paper_author_pairs``.
        paper_author_pairs: indexable collection of records providing the
            keys 'paper_id', 'author_id', 'author_name' and
            'author_affiliation'.
        output_file_path: path of the file to create (truncated if present).
    """
    # Function-scope import so this block does not rely on a module-level
    # ``import csv`` being present.
    import csv

    # Plain 'w' is enough: the file is only ever written, never read back.
    with open(output_file_path, 'w') as f:
        # csv.writer quotes fields containing commas, quotes or newlines
        # (common in names/affiliations); formatting them with a bare '%s'
        # produced rows with the wrong number of columns.
        # lineterminator='\n' keeps the same line endings ``print`` emitted.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['PaperId', 'AuthorId', 'Name', 'Affiliation'])
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            # Keep the original %d / %s conversions so values that need no
            # quoting are rendered exactly as before.
            writer.writerow([
                '%d' % (pair['paper_id'],),
                '%d' % (pair['author_id'],),
                '%s' % (pair['author_name'],),
                '%s' % (pair['author_affiliation'],),
            ])


if __name__ == '__main__':
    # Exactly five command-line arguments are required (see the usage text).
    if len(sys.argv) != 1 + 5:
        usage = "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file" % (sys.argv[0], )
        # write() instead of the ``print >>`` statement: same output, and
        # valid syntax on both Python 2 and Python 3.
        sys.stderr.write(usage + '\n')
        sys.exit(-1)

    # NOTE(review): loader, load_labels, get_author_in_train,
    # get_confirmed_pairs and filter_paper_author_csv are not defined in
    # this block; they are expected to be provided elsewhere in the file.
    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    sys.stdout.write('Data loaded!' + '\n')

    confirmed_pairs = get_confirmed_pairs(get_author_in_train(train_csv),
                                          train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)
from python_util.config import *


def get_internal_author_ids(internal_train_csv):
    """Collect the author ids found in the given training records.

    Args:
        internal_train_csv: iterable of records, each indexable by
            'author_id'.

    Returns:
        dict whose keys are the 'author_id' values seen; every value is 1,
        so the dict serves as a membership set.
    """
    author_ids = dict()
    for rec in internal_train_csv:
        author_ids[rec['author_id']] = 1
    return author_ids


if __name__ == '__main__':
    # Exactly three command-line arguments are required (see the usage text).
    if len(sys.argv) != 1 + 3:
        print >> sys.stderr, "Usage : %s feature_file internal_train_output internal_valid_output" % (sys.argv[0],)
        sys.exit(-1)

    # NOTE(review): DATA, loader and sys are presumably supplied by the
    # star import above or by imports not visible here — confirm.
    train_csv = loader.load_train_csv(DATA['Train'])
    internal_train_csv = loader.load_train_csv(DATA['internal_train'])
    internal_valid_csv = loader.load_test_csv(DATA['internal_valid'])

    author_papers = dict()
    # Read the whole feature file (first argument) and split it on '\n'.
    # NOTE(review): the original indentation was lost; confirm whether the
    # statements after the read were meant to sit inside this with-block.
    with open(sys.argv[1], 'r') as f:
        lines = f.read().split('\n')

    pair_count = 0
    for rec in train_csv:
        author_id = rec['author_id']
        # Confirmed ids first, then deleted ids, concatenated in that order.
        paper_ids = rec['confirmed_paper_ids'] + rec['deleted_paper_ids']
        paper_id_count = len(paper_ids)
        # Start an empty per-author dict.
        # NOTE(review): the two output paths named in the usage text are
        # not used in the visible code, so the script presumably continues
        # beyond this point — confirm.
        author_papers[author_id] = dict()
def get_internal_author_ids(internal_train_csv):
    """Collect the author ids found in the given training records.

    Args:
        internal_train_csv: iterable of records, each indexable by
            'author_id'.

    Returns:
        dict whose keys are the 'author_id' values seen; every value is 1,
        so the dict serves as a membership set.
    """
    author_ids = dict()
    for rec in internal_train_csv:
        author_ids[rec['author_id']] = 1
    return author_ids


if __name__ == '__main__':
    # Exactly three command-line arguments are required (see the usage text).
    if len(sys.argv) != 1 + 3:
        print >> sys.stderr, "Usage : %s feature_file internal_train_output internal_valid_output" % (
            sys.argv[0], )
        sys.exit(-1)

    # NOTE(review): DATA, loader and sys are not defined in this block;
    # they are expected to come from imports elsewhere in the file.
    train_csv = loader.load_train_csv(DATA['Train'])
    internal_train_csv = loader.load_train_csv(DATA['internal_train'])
    internal_valid_csv = loader.load_test_csv(DATA['internal_valid'])

    author_papers = dict()
    # Read the whole feature file (first argument) and split it on '\n'.
    # NOTE(review): the original indentation was lost; confirm whether the
    # statements after the read were meant to sit inside this with-block.
    with open(sys.argv[1], 'r') as f:
        lines = f.read().split('\n')

    pair_count = 0
    for rec in train_csv:
        author_id = rec['author_id']
        # Confirmed ids first, then deleted ids, concatenated in that order.
        paper_ids = rec['confirmed_paper_ids'] + rec['deleted_paper_ids']
        paper_id_count = len(paper_ids)
        # Start an empty per-author dict before walking this author's papers.
        author_papers[author_id] = dict()
        # NOTE(review): the body of this inner loop is not part of this
        # block; it must follow immediately after this header.
        for idx, paper_id in enumerate(paper_ids):
def make_clean_paper_author(confirmed_pairs, paper_author_pairs, output_file_path):
    """Write the selected paper/author records to a CSV file.

    The file starts with the header ``PaperId,AuthorId,Name,Affiliation``
    followed by one row per entry of ``confirmed_pairs``, in that order.

    Args:
        confirmed_pairs: iterable of indices/keys into ``paper_author_pairs``.
        paper_author_pairs: indexable collection of records providing the
            keys 'paper_id', 'author_id', 'author_name' and
            'author_affiliation'.
        output_file_path: path of the file to create (truncated if present).
    """
    # Function-scope import so this block does not rely on a module-level
    # ``import csv`` being present.
    import csv

    # Plain 'w' is enough: the file is only ever written, never read back.
    with open(output_file_path, 'w') as f:
        # csv.writer quotes fields containing commas, quotes or newlines
        # (common in names/affiliations); formatting them with a bare '%s'
        # produced rows with the wrong number of columns.
        # lineterminator='\n' keeps the same line endings ``print`` emitted.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['PaperId', 'AuthorId', 'Name', 'Affiliation'])
        for idx in confirmed_pairs:
            pair = paper_author_pairs[idx]
            # Keep the original %d / %s conversions so values that need no
            # quoting are rendered exactly as before.
            writer.writerow([
                '%d' % (pair['paper_id'],),
                '%d' % (pair['author_id'],),
                '%s' % (pair['author_name'],),
                '%s' % (pair['author_affiliation'],),
            ])


if __name__ == '__main__':
    # Exactly five command-line arguments are required (see the usage text).
    if len(sys.argv) != 1 + 5:
        usage = "Usage : %s Train.csv PaperAuthor.csv Test.csv label output_file" % (
            sys.argv[0], )
        # write() instead of the ``print >>`` statement: same output, and
        # valid syntax on both Python 2 and Python 3.
        sys.stderr.write(usage + '\n')
        sys.exit(-1)

    # NOTE(review): loader, make_train_csv_dict, load_labels,
    # get_author_in_train, get_confirmed_pairs and filter_paper_author_csv
    # are not defined in this block; they are expected to be provided
    # elsewhere in the file.
    train_csv = loader.load_train_csv(sys.argv[1])
    train_csv_dict = make_train_csv_dict(train_csv)
    paper_author_csv = loader.load_paper_author_csv(sys.argv[2])
    test_csv = loader.load_test_csv(sys.argv[3])
    labels = load_labels(sys.argv[4])
    sys.stdout.write('Data loaded!' + '\n')

    confirmed_pairs = get_confirmed_pairs(get_author_in_train(train_csv),
                                          train_csv_dict, test_csv, labels)
    pair_idxs = filter_paper_author_csv(paper_author_csv, confirmed_pairs)
    make_clean_paper_author(pair_idxs, paper_author_csv, sys.argv[5])
    sys.exit(0)