예제 #1
0
def train_feature_gen():
    """Write the training feature file to the path given in ``sys.argv[1]``.

    Reads the CSV at ``DATA['Train']``. Every id listed in a row's
    ``ConfirmedPaperIds`` column becomes an ``(AuthorId, paper_id)`` pair
    labelled ``1``; every id in ``DeletedPaperIds`` becomes a pair labelled
    ``-1``. The four ``features.*`` extractors are run over all pairs, and
    one space-separated line per pair is written: the label followed by the
    benchmark, years-related, string-distance and co-author values, in that
    order.

    NOTE(review): each extractor is assumed to return one iterable of values
    per pair, in the same order as ``ap_pairs`` -- confirm against
    ``features``.
    """
    ap_pairs = []
    labels = []
    # The csv module expects files opened with newline='' so that it handles
    # line endings (including ones inside quoted fields) itself.
    with open(DATA['Train'], 'r', newline='') as f:
        for row in csv.DictReader(f):
            author_id = row['AuthorId']
            # Confirmed pairs of a row are emitted before its deleted pairs.
            for column, label in (('ConfirmedPaperIds', 1),
                                  ('DeletedPaperIds', -1)):
                # Splitting on a single space yields empty tokens for an
                # empty field or repeated spaces; drop them.
                paper_ids = [pid for pid in row[column].strip().split(' ')
                             if pid]
                ap_pairs.extend((author_id, pid) for pid in paper_ids)
                labels.extend([label] * len(paper_ids))

    f_bcmk = features.benchmark(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], paper_csv=DATA['Paper'])
    f_yrre = features.years_related(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], paper_csv=DATA['Paper'])
    f_strdist = features.string_distances(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], author_csv=DATA['Author'])
    f_coauth = features.coauthor_related(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'])

    # Column order of the output line after the label.
    feature_sets = (f_bcmk, f_yrre, f_strdist, f_coauth)
    with open(sys.argv[1], 'w') as g:
        for i, label in enumerate(labels):
            fields = [label]
            # Index (rather than zip) so a feature list that is shorter than
            # the label list raises IndexError instead of truncating output.
            for feats in feature_sets:
                fields.extend(feats[i])
            g.write(' '.join(map(str, fields)) + '\n')
예제 #2
0
def valid_feature_gen():
    """Write the validation feature file to the path given in ``sys.argv[2]``.

    Reads the CSV at ``DATA['Valid']`` and turns every id in a row's
    ``PaperIds`` column into an ``(AuthorId, paper_id)`` pair with the
    placeholder label ``0``. The four ``features.*`` extractors are run over
    all pairs, and one space-separated line per pair is written: the label
    followed by the benchmark, years-related, string-distance and co-author
    values, in that order.

    Empty tokens (from an empty ``PaperIds`` field or repeated spaces) are
    dropped, so no pair with an empty paper id is produced.

    NOTE(review): each extractor is assumed to return one iterable of values
    per pair, in the same order as ``ap_pairs`` -- confirm against
    ``features``.
    """
    ap_pairs = []
    # The csv module expects files opened with newline='' so that it handles
    # line endings (including ones inside quoted fields) itself.
    with open(DATA['Valid'], 'r', newline='') as f:
        for row in csv.DictReader(f):
            # ''.split(' ') returns [''], so filter out empty tokens rather
            # than emitting a bogus (author, '') pair.
            paper_ids = [pid for pid in row['PaperIds'].strip().split(' ')
                         if pid]
            ap_pairs.extend((row['AuthorId'], pid) for pid in paper_ids)
    # No ground truth is read here: every row carries the label 0.
    labels = [0] * len(ap_pairs)

    f_bcmk = features.benchmark(ap_pairs=ap_pairs,
                                paper_author_csv=DATA['PaperAuthor'],
                                paper_csv=DATA['Paper'])
    f_yrre = features.years_related(ap_pairs=ap_pairs,
                                    paper_author_csv=DATA['PaperAuthor'],
                                    paper_csv=DATA['Paper'])
    f_strdist = features.string_distances(ap_pairs=ap_pairs,
                                          paper_author_csv=DATA['PaperAuthor'],
                                          author_csv=DATA['Author'])
    f_coauth = features.coauthor_related(ap_pairs=ap_pairs,
                                         paper_author_csv=DATA['PaperAuthor'])

    # Column order of the output line after the label.
    feature_sets = (f_bcmk, f_yrre, f_strdist, f_coauth)
    with open(sys.argv[2], 'w') as g:
        for i, label in enumerate(labels):
            fields = [label]
            # Index (rather than zip) so a feature list that is shorter than
            # the label list raises IndexError instead of truncating output.
            for feats in feature_sets:
                fields.extend(feats[i])
            g.write(' '.join(map(str, fields)) + '\n')
예제 #3
0
def valid_feature_gen():
    """Generate validation features and write them to ``sys.argv[2]``.

    Each row of the CSV at ``DATA['Valid']`` contributes one
    ``(AuthorId, paper_id)`` pair per id in its space-separated ``PaperIds``
    column; all pairs get the placeholder label ``0``. Output is one line
    per pair: the label, then the values from ``features.benchmark``,
    ``features.years_related``, ``features.string_distances`` and
    ``features.coauthor_related``, all separated by single spaces.

    Empty tokens (from an empty ``PaperIds`` field or repeated spaces) are
    skipped, so no pair with an empty paper id is produced.

    NOTE(review): each extractor is assumed to return one iterable of values
    per pair, in the same order as ``ap_pairs`` -- confirm against
    ``features``.
    """
    ap_pairs = []
    labels = []
    # csv readers should be given files opened with newline='' so the module
    # can interpret line endings (and quoted embedded newlines) itself.
    with open(DATA['Valid'], 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # split(' ') on an empty string gives [''], which would become a
            # bogus (author, '') pair; keep only non-empty ids.
            paper_ids = [pid for pid in row['PaperIds'].strip().split(' ') if pid]
            ap_pairs += [(row['AuthorId'], pid) for pid in paper_ids]
            labels += [0] * len(paper_ids)

    f_bcmk = features.benchmark(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], paper_csv=DATA['Paper'])
    f_yrre = features.years_related(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], paper_csv=DATA['Paper'])
    f_strdist = features.string_distances(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'], author_csv=DATA['Author'])
    f_coauth = features.coauthor_related(ap_pairs=ap_pairs, paper_author_csv=DATA['PaperAuthor'])

    with open(sys.argv[2], 'w') as g:
        for i, label in enumerate(labels):
            # Indexing each feature list keeps the IndexError on a length
            # mismatch instead of silently truncating the output.
            values = [label, *f_bcmk[i], *f_yrre[i], *f_strdist[i], *f_coauth[i]]
            g.write(' '.join(str(v) for v in values) + '\n')