Пример #1
0
def get_nauthors():
    """Write the number of authors of each candidate paper as a feature.

    Training pairs go to ``nauthors.train.dat`` as ``aid pid n_authors T|F``
    where ``T`` marks papers listed for the author in the confirmed mapping
    and ``F`` the remaining (deleted) ones.  Validation pairs go to
    ``nauthors.valid.dat`` as ``aid pid n_authors``.

    Raises:
        KeyError: if an author of the confirmed mapping is missing from the
            deleted mapping, or a paper id is missing from the per-paper
            author mapping.
    """
    sys.stderr.write('Reading data...\n')
    # Only the second return value (indexed by paper id below) is needed.
    _, authors = get_author_papers()
    # The training set is loaded once; it used to be read twice in a row.
    confirmed, deleted = get_train()

    sys.stderr.write('Calculating scores for train set...\n')
    with open('nauthors.train.dat', 'w') as outf:
        for aid in confirmed:
            # Paper ids are used as dict keys below, so they are hashable
            # and a set gives constant-time label lookups.
            confirmed_set = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                tf = 'T' if pid in confirmed_set else 'F'
                outf.write(
                    '%s %s %s %s\n' % (aid, pid, len(authors[pid]), tf))

    sys.stderr.write('Calculating scores for validation set...\n')
    validation = get_valid()
    with open('nauthors.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                outf.write('%s %s %s\n' % (aid, pid, len(authors[pid])))
Пример #2
0
def _count_collaborations(authors):
    """Count, for every pair of authors, the papers listing both of them.

    Keys are ``(a, b)`` tuples with the smaller author id first.
    """
    counts = {}
    for paper_authors in authors.values():
        for i, first in enumerate(paper_authors):
            for second in paper_authors[i + 1:]:
                # Always use the lowest author id as the first key element.
                if first < second:
                    key = (first, second)
                else:
                    key = (second, first)
                counts[key] = counts.get(key, 0) + 1
    return counts


def _sum_coauthors(aid, paper_authors, num_collaborations):
    """Sum the collaboration counts between *aid* and each listed author."""
    total = 0
    for author in paper_authors:
        if author < aid:
            key = (author, aid)
        elif aid < author:
            key = (aid, author)
        else:
            # The author is not counted as a co-author of themselves.
            continue
        # A pair never seen on a common paper contributes zero instead of
        # aborting the whole run with a KeyError.
        total += num_collaborations.get(key, 0)
    return total


def get_sum_coauthors():
    """Write, per candidate paper, the summed co-author collaboration count.

    For each (author, paper) pair the score is the sum, over the authors
    listed for the paper, of the number of papers that author shares with
    the candidate author.  Training pairs go to ``sumcoauthors.train.dat``
    as ``aid pid score T|F``; validation pairs go to
    ``sumcoauthors.valid.dat`` as ``aid pid score``.

    Raises:
        KeyError: if an author is missing from the deleted mapping or a
            paper id is missing from the per-paper author mapping.
    """
    sys.stderr.write('Reading data...\n')
    _, authors = get_author_papers()

    # Create the number of collaborations between 2 authors
    num_collaborations = _count_collaborations(authors)

    sys.stderr.write('Calculating scores for train set...\n')
    confirmed, deleted = get_train()
    with open('sumcoauthors.train.dat', 'w') as outf:
        for aid in confirmed:
            confirmed_set = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                score = _sum_coauthors(aid, authors[pid], num_collaborations)
                tf = 'T' if pid in confirmed_set else 'F'
                outf.write('%s %s %s %s\n' % (aid, pid, score, tf))

    sys.stderr.write('Calculating scores for validation set...\n')
    validation = get_valid()
    with open('sumcoauthors.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                score = _sum_coauthors(aid, authors[pid], num_collaborations)
                outf.write('%s %s %s\n' % (aid, pid, score))
Пример #3
0
def _venue_count(counts, venue, aid, pid):
    """Return the count stored for *aid* at the venue of paper *pid*.

    When any of the lookups raises ``KeyError`` the ``-1`` entry of
    ``counts[aid]`` is returned instead.
    """
    try:
        return counts[aid][venue[pid]]
    except KeyError:
        return counts[aid][-1]


def get_nvenue():
    """Write per-pair journal and conference counts as features.

    Training pairs go to ``nvenue.train.dat`` as ``aid pid sj sc T|F`` and
    validation pairs to ``nvenue.valid.dat`` as ``aid pid sj sc``, where
    ``sj`` / ``sc`` are looked up in the tables returned by
    ``build_author_venue_count`` for the journal / conference of the paper.

    Raises:
        KeyError: if an author is missing from the deleted mapping, or the
            ``-1`` fallback entry cannot be looked up for an author.
    """
    sys.stderr.write('Reading venue info...\n')
    journal, conference = get_venue()
    sys.stderr.write('Reading author-paper info...\n')
    papers, _ = get_author_papers()

    sys.stderr.write('Counting papers in journals...\n')
    aid2journal = build_author_venue_count(papers, journal)
    sys.stderr.write('Counting papers in conferences...\n')
    aid2conference = build_author_venue_count(papers, conference)

    sys.stderr.write('Training set...\n')
    confirmed, deleted = get_train()
    with open('nvenue.train.dat', 'w') as outf:
        for aid in confirmed:
            for pid in confirmed[aid] + deleted[aid]:
                # Every pid comes from one of the two lists, so whatever is
                # not confirmed is deleted; the former third branch raised
                # an undefined name and could never be reached.
                tf = 'T' if pid in confirmed[aid] else 'F'
                sj = _venue_count(aid2journal, journal, aid, pid)
                sc = _venue_count(aid2conference, conference, aid, pid)
                outf.write('%s %s %s %s %s\n' % (aid, pid, sj, sc, tf))

    sys.stderr.write('Validation set...\n')
    validation = get_valid()
    with open('nvenue.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                sj = _venue_count(aid2journal, journal, aid, pid)
                sc = _venue_count(aid2conference, conference, aid, pid)
                outf.write('%s %s %s %s\n' % (aid, pid, sj, sc))
Пример #4
0
def get_affiliation_score():
    """Write the affiliation alignment score of each author-paper pair.

    The score is ``affil_align_score`` applied to the affiliation recorded
    for the pair in ``get_affspapers()`` and the author's entry in
    ``get_affsauthors()``.  Training pairs go to ``affiliation.train.dat``
    as ``aid pid score T|F``; validation pairs go to
    ``affiliation.valid.dat`` as ``aid pid score``.

    Raises:
        KeyError: if an author or pair is missing from one of the tables.
    """
    sys.stderr.write('Loading author reference affiliations...\n')
    affil_auth = get_affsauthors()
    sys.stderr.write('Loading author-paper table...\n')
    affil_paper = get_affspapers()

    # NOTE: an additional paper-vs-paper score (mean alignment against the
    # author's other papers, plus the max of both scores) was once computed
    # here; it was disabled and is intentionally not emitted.

    sys.stderr.write('Creating training...\n')
    confirmed, deleted = get_train()
    with open('affiliation.train.dat', 'w') as outf:
        tot = len(confirmed)
        for count, aid in enumerate(confirmed, 1):
            sys.stderr.write('%d / %d\n' % (count, tot))
            confirmed_set = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                tf = 'T' if pid in confirmed_set else 'F'
                scorea = affil_align_score(affil_paper[aid][pid],
                                           affil_auth[aid])
                outf.write('%s %s %s %s\n' % (aid, pid, scorea, tf))

    sys.stderr.write('Creating validation...\n')
    validation = get_valid()
    with open('affiliation.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                scorea = affil_align_score(affil_paper[aid][pid],
                                           affil_auth[aid])
                outf.write('%s %s %s\n' % (aid, pid, scorea))
Пример #5
0
def compute_venue_score():
    """Write the venue score of each author-paper pair.

    For training authors the probability tables are recomputed without the
    author in question (``ps_without_author``) before scoring; validation
    authors are scored with the tables built from the full training set.
    Only scores greater than -0.5 are written.  Training pairs go to
    ``venue.train.dat`` as ``aid pid score T|F``; validation pairs go to
    ``venue.valid.dat`` as ``aid pid score``.

    Raises:
        KeyError: if an author of the confirmed mapping is missing from the
            deleted mapping.
    """
    venue = get_venue()

    confirmed, deleted = get_train()
    P1All, P2All, norm1All, norm2All = get_ps(confirmed, venue)
    tot = len(confirmed)
    with open('venue.train.dat', 'w') as outf:
        for count, aid in enumerate(confirmed, 1):
            sys.stderr.write('%d / %d\n' % (count, tot))

            P1, P2 = ps_without_author(
                aid, confirmed, venue, P1All, P2All, norm1All, norm2All
                )
            candidates = confirmed[aid] + deleted[aid]
            # The former save / clear / restore of confirmed[aid] had no
            # statement between the clear and the restore, so it was a
            # no-op and has been dropped.

            for p1 in candidates:
                tf = 'T' if p1 in confirmed[aid] else 'F'
                others = [p for p in candidates if p != p1]
                s = get_score(p1, others, venue, P1, P2)
                # NOTE(review): presumably get_score signals "no score"
                # with a value <= -0.5 - confirm against its definition.
                if s > -.5:
                    outf.write('%s %s %s %s\n' % (aid, p1, s, tf))

    validation = get_valid()
    tot = len(validation)
    with open('venue.valid.dat', 'w') as outf:
        for count, aid in enumerate(validation, 1):
            sys.stderr.write('%d / %d\n' % (count, tot))

            candidates = validation[aid]
            for p1 in candidates:
                others = [p for p in candidates if p != p1]
                s = get_score(p1, others, venue, P1All, P2All)
                if s > -.5:
                    outf.write('%s %s %s\n' % (aid, p1, s))
Пример #6
0
def _write_name_score(outf, paperName, baseName, aid, pid, label=None):
    """Write one name-score line for (*aid*, *pid*), skipping on KeyError.

    The line is ``aid pid sFull sInit`` followed by *label* when given.
    """
    try:
        sFull, sInit = name_align_score(paperName[aid][pid], baseName[aid])
    except KeyError:
        # Deliberate best effort: pairs without name data produce no line.
        # NOTE(review): a KeyError raised inside name_align_score is
        # swallowed here as well, exactly as before.
        return
    fields = [aid, pid, sFull, sInit]
    if label is not None:
        fields.append(label)
    outf.write(' '.join(str(field) for field in fields) + '\n')


def get_name_score():
    """Write the name alignment scores of each author-paper pair.

    ``name_align_score`` is applied to the name recorded for the pair in
    ``get_paper_name()`` and the author's entry in ``get_base_name()``.
    Training pairs go to ``name.train.dat`` as ``aid pid sFull sInit T|F``
    (all confirmed papers of an author first, then the deleted ones);
    validation pairs go to ``name.valid.dat`` as ``aid pid sFull sInit``.
    Pairs whose lookup raises ``KeyError`` are silently omitted.

    Raises:
        KeyError: if an author of the confirmed mapping is missing from the
            deleted mapping.
    """
    sys.stderr.write('Loading author-paper table...\n')
    paperName = get_paper_name()
    sys.stderr.write('Loading author reference names...\n')
    baseName = get_base_name()

    confirmed, deleted = get_train()
    with open('name.train.dat', 'w') as outf:
        for aid in confirmed:
            # Fetch both lists up front so that a missing deleted entry
            # fails before anything is written for this author.
            kept, removed = confirmed[aid], deleted[aid]
            for pid in kept:
                _write_name_score(outf, paperName, baseName, aid, pid, 'T')
            for pid in removed:
                _write_name_score(outf, paperName, baseName, aid, pid, 'F')

    validation = get_valid()
    with open('name.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                _write_name_score(outf, paperName, baseName, aid, pid)
Пример #7
0
def _year_scores(year, pids):
    """Return ``(pid, score)`` pairs, standardising years within *pids*.

    The score is ``(year - mean) / std`` over the years of *pids*; a NaN
    result (e.g. zero spread) is replaced by ``-100``.
    """
    if not pids:
        # Nothing to score; also avoids taking statistics of an empty list.
        return []
    paper_years = [year[p] for p in pids]
    # Loop-invariant statistics: computed once per author instead of once
    # per paper.
    centre = mean(paper_years)
    spread = std(paper_years)
    scored = []
    for pid in pids:
        score = (year[pid] - centre) / spread
        if isnan(score):
            score = -100
        scored.append((pid, score))
    return scored


def get_year_score():
    """Write the standardised publication year of each author-paper pair.

    Each paper's year is standardised against the years of all candidate
    papers of the same author.  Training pairs go to ``year.train.dat`` as
    ``aid pid score T|F``; validation pairs go to ``year.valid.dat`` as
    ``aid pid score``.

    Raises:
        KeyError: if an author is missing from the deleted mapping or a
            paper id is missing from the year table.
    """
    sys.stderr.write('Loading publication years...\n')
    year = get_year()

    sys.stderr.write('Creating training...\n')
    confirmed, deleted = get_train()
    with open('year.train.dat', 'w') as outf:
        tot = len(confirmed)
        for count, aid in enumerate(confirmed, 1):
            sys.stderr.write('%d / %d\n' % (count, tot))
            confirmed_set = set(confirmed[aid])
            candidates = confirmed[aid] + deleted[aid]
            for pid, score in _year_scores(year, candidates):
                tf = 'T' if pid in confirmed_set else 'F'
                outf.write('%s %s %s %s\n' % (aid, pid, score, tf))

    sys.stderr.write('Creating validation...\n')
    validation = get_valid()
    with open('year.valid.dat', 'w') as outf:
        for aid in validation:
            for pid, score in _year_scores(year, validation[aid]):
                outf.write('%s %s %s\n' % (aid, pid, score))
Пример #8
0
def compute_title_score():
    """Print the mean title alignment score of each training pair.

    For every training paper that has a title, the score is the mean of
    ``title_align_score`` against every other titled candidate paper of
    the same author.  Lines of the form ``aid pid score T|F`` are written
    to standard error, all confirmed papers of an author first, then the
    deleted ones.

    NOTE(review): results go to stderr rather than to a data file, unlike
    the sibling feature writers - confirm this is intended.

    Raises:
        KeyError: if an author of the confirmed mapping is missing from the
            deleted mapping.
    """
    confirmed, deleted = get_train()
    title = get_titles()

    for aid in confirmed:
        candidates = confirmed[aid] + deleted[aid]

        for pids, tf in ((confirmed[aid], 'T'), (deleted[aid], 'F')):
            for pid in pids:
                if pid not in title:
                    continue
                # With no other titled paper the list is empty and the
                # mean is whatever `mean` returns for an empty input.
                score = mean(
                    [title_align_score(title[pid], title[pid2])
                     for pid2 in candidates
                     if pid2 != pid and pid2 in title]
                    )
                sys.stderr.write('%s %s %s %s\n' % (aid, pid, score, tf))
Пример #9
0
import sys
import gc
from copy import deepcopy
from random import choice
from numpy import log, mean, std
from scipy.misc import factorial
from scipy.sparse import coo_matrix
import scipy.sparse.linalg as sp

from common import get_author_papers, get_train, get_valid

# Script entry point: writes the number of authors of every training
# (author, paper) pair to nauthors.train.dat as "aid pid n_authors T|F",
# then loads the validation set.
if __name__ == '__main__':
    sys.stderr.write('Reading data...\n')
    papers, authors = get_author_papers()
    # Loaded once; the training set used to be read a second time below.
    confirmed, deleted = get_train()

    sys.stderr.write('Calculating scores for train set...\n')
    # The context manager closes the file even if a lookup below fails.
    with open('nauthors.train.dat', 'w') as outf:
        for aid in confirmed:
            allPapers = confirmed[aid] + deleted[aid]
            for p1 in allPapers:
                tf = 'T' if p1 in confirmed[aid] else 'F'
                outf.write(
                    '%s %s %s %s\n' % (aid, p1, len(authors[p1]), tf))

    sys.stderr.write('Calculating scores for validation set...\n')
    # NOTE(review): the validation set is loaded but not used in the
    # visible part of this script - the listing may be truncated.
    validation = get_valid()
Пример #10
0
def _select_authors(table, start, nauthors, stem):
    """Return the author ids to process and the matching output file name.

    With ``start`` set, the ids are the ``start``-th chunk (1-based) of
    ``nauthors`` authors and the file name carries the chunk bounds;
    otherwise all authors are returned and the name is ``<stem>.dat``.
    """
    aids = list(table)
    if start is None:
        return aids, '%s.dat' % stem
    naids = len(aids)
    lo = min(naids, nauthors * (start - 1))
    hi = min(naids, nauthors * start)
    return aids[lo:hi], '%s_%d-%d.dat' % (stem, lo, hi)


def get_kw_score(start=None, nauthors=100):
    """Write keyword features of each author-paper pair.

    Two features are produced per pair: ``scorea``, the mean of
    ``kw_align_score`` between the paper's keywords and those of every
    other candidate paper of the same author, and ``scoren``, the number
    of keywords of the paper.  Training lines are
    ``aid pid scorea scoren T|F``; validation lines omit the label.

    Args:
        start: 1-based chunk number, or ``None`` to process all authors.
            Converted with ``int()``, so a numeric string is accepted.
        nauthors: number of authors per chunk; only used when ``start``
            is given.  Converted with ``int()``.

    Output goes to ``keywords.train.dat`` / ``keywords.valid.dat``, or to
    ``keywords.train_<lo>-<hi>.dat`` / ``keywords.valid_<lo>-<hi>.dat``
    when a chunk is requested.

    Raises:
        ValueError: if ``start`` or ``nauthors`` cannot be converted.
        KeyError: if an author is missing from the deleted mapping or a
            paper id is missing from the keyword table.
    """
    if start is not None:
        # Validate the chunk arguments before any data is loaded.
        start = int(start)
        nauthors = int(nauthors)

    kws = get_kws()

    sys.stderr.write('Training set...\n')
    confirmed, deleted = get_train()
    aids, outFileName = _select_authors(
        confirmed, start, nauthors, 'keywords.train')
    tot = len(aids)
    with open(outFileName, 'w') as outf:
        for count, aid in enumerate(aids, 1):
            sys.stderr.write('%d / %d\n' % (count, tot))
            confirmed_set = set(confirmed[aid])
            candidates = confirmed[aid] + deleted[aid]
            for pid in candidates:
                tf = 'T' if pid in confirmed_set else 'F'
                scoren = len(kws[pid])
                # NOTE(review): for an author with a single candidate the
                # list is empty; if `mean` is numpy.mean this yields nan.
                scorea = mean(
                    [kw_align_score(kws[pid], kws[pid2])
                     for pid2 in candidates if pid2 != pid]
                    )
                outf.write(
                    '%s %s %s %s %s\n' % (aid, pid, scorea, scoren, tf))

    sys.stderr.write('Valid set...\n')
    validation = get_valid()
    aids, outFileName = _select_authors(
        validation, start, nauthors, 'keywords.valid')
    with open(outFileName, 'w') as outf:
        for aid in aids:
            candidates = validation[aid]
            for pid in candidates:
                scoren = len(kws[pid])
                scorea = mean(
                    [kw_align_score(kws[pid], kws[pid2])
                     for pid2 in candidates if pid2 != pid]
                    )
                outf.write('%s %s %s %s\n' % (aid, pid, scorea, scoren))