Пример #1
0
def get_npapers():
    """Write the "number of papers per author" feature for train and validation.

    Loads the author/paper data with ``get_author_papers()`` and the candidate
    (author, paper) pairs with ``get_train()`` and ``get_valid()``, then writes
    one space-separated line per pair:

    * ``npapers.train.dat``: ``author_id paper_id n_papers label`` where
      ``label`` is ``T`` if the paper is in the author's confirmed list and
      ``F`` otherwise.
    * ``npapers.valid.dat``: ``author_id paper_id n_papers``.

    ``n_papers`` is ``len(papers[author_id])``.  Progress messages are written
    to stderr.

    Raises:
        KeyError: if an author id taken from the train/validation data is
            missing from ``papers`` or (train only) from ``deleted``.
    """
    sys.stderr.write('Reading data...\n')
    # Only the author -> papers mapping is needed for this feature.
    papers, _ = get_author_papers()

    sys.stderr.write('Calculating scores for train set...\n')
    confirmed, deleted = get_train()
    # ``with`` closes the file even when a lookup below raises.
    with open('npapers.train.dat', 'w') as outf:
        for aid in confirmed:
            # A set makes the label test O(1) per paper instead of a list scan.
            confirmed_ids = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                label = 'T' if pid in confirmed_ids else 'F'
                row = (aid, pid, len(papers[aid]), label)
                outf.write(' '.join(str(field) for field in row) + '\n')

    sys.stderr.write('Calculating scores for validation set...\n')
    validation = get_valid()
    with open('npapers.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                row = (aid, pid, len(papers[aid]))
                outf.write(' '.join(str(field) for field in row) + '\n')
Пример #2
0
def get_sum_coauthors():
    """Write the "summed co-author collaborations" feature for train and validation.

    First counts, over every author list in the ``authors`` mapping returned
    by ``get_author_papers()``, how often each pair of authors appears
    together.  Then, for each candidate (author, paper) pair, sums those
    counts between the candidate author and every *other* author listed on
    the paper, and writes one space-separated line per pair:

    * ``sumcoauthors.train.dat``: ``author_id paper_id sum label`` where
      ``label`` is ``T`` if the paper is in the author's confirmed list and
      ``F`` otherwise.
    * ``sumcoauthors.valid.dat``: ``author_id paper_id sum``.

    Progress messages are written to stderr.

    Raises:
        KeyError: if a paper id is missing from ``authors``, an author id is
            missing from ``deleted`` (train only), or the candidate author
            never appeared together with one of the paper's listed authors.
    """
    # Imported locally so this function stays self-contained.
    from itertools import combinations

    sys.stderr.write('Reading data...\n')
    # Only the paper -> authors mapping is needed for this feature.
    _, authors = get_author_papers()

    # Number of shared author lists for every pair of authors.  The key is
    # always (smaller id, larger id) so each pair has exactly one entry.
    num_collaborations = {}
    for coauthors in authors.values():
        for first, second in combinations(coauthors, 2):
            pair = (first, second) if first < second else (second, first)
            num_collaborations[pair] = num_collaborations.get(pair, 0) + 1

    def collaboration_total(aid, pid):
        """Sum the collaboration counts between ``aid`` and the authors of ``pid``."""
        total = 0
        for coauthor in authors[pid]:
            # Entries equal to ``aid`` match neither branch and are skipped.
            if coauthor < aid:
                total += num_collaborations[(coauthor, aid)]
            elif aid < coauthor:
                total += num_collaborations[(aid, coauthor)]
        return total

    sys.stderr.write('Calculating scores for train set...\n')
    confirmed, deleted = get_train()
    # ``with`` closes the file even when a lookup below raises.
    with open('sumcoauthors.train.dat', 'w') as outf:
        for aid in confirmed:
            # A set makes the label test O(1) per paper instead of a list scan.
            confirmed_ids = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                total = collaboration_total(aid, pid)
                label = 'T' if pid in confirmed_ids else 'F'
                row = (aid, pid, total, label)
                outf.write(' '.join(str(field) for field in row) + '\n')

    sys.stderr.write('Calculating scores for validation set...\n')
    validation = get_valid()
    with open('sumcoauthors.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                row = (aid, pid, collaboration_total(aid, pid))
                outf.write(' '.join(str(field) for field in row) + '\n')
Пример #3
0
def _venue_count(author_counts, paper_venue, aid, pid):
    """Return ``author_counts[aid]`` looked up at the venue of paper ``pid``.

    When the lookup raises ``KeyError`` (for example because ``pid`` has no
    entry in ``paper_venue``), the entry at ``-1`` is returned instead.
    NOTE(review): ``-1`` is presumably the bucket for papers without a known
    venue -- confirm against ``build_author_venue_count``.
    """
    try:
        return author_counts[aid][paper_venue[pid]]
    except KeyError:
        return author_counts[aid][-1]


def get_nvenue():
    """Write the per-venue paper-count features for train and validation.

    Loads the paper -> journal and paper -> conference mappings with
    ``get_venue()``, builds per-author venue counts with
    ``build_author_venue_count()``, and writes one space-separated line per
    candidate (author, paper) pair:

    * ``nvenue.train.dat``: ``author_id paper_id journal_count
      conference_count label`` where ``label`` is ``T`` if the paper is in
      the author's confirmed list and ``F`` otherwise.
    * ``nvenue.valid.dat``: ``author_id paper_id journal_count
      conference_count``.

    Each count is the author's entry for the paper's journal / conference,
    with the fallback described in ``_venue_count``.  Progress messages are
    written to stderr.

    Raises:
        KeyError: if an author id is missing from ``deleted`` (train only) or
            the fallback lookup in ``_venue_count`` also fails.
    """
    sys.stderr.write('Reading venue info...\n')
    journal, conference = get_venue()
    sys.stderr.write('Reading author-paper info...\n')
    # Only the author -> papers mapping is needed to build the counts.
    papers, _ = get_author_papers()

    sys.stderr.write('Counting papers in journals...\n')
    aid2journal = build_author_venue_count(papers, journal)
    sys.stderr.write('Counting papers in conferences...\n')
    aid2conference = build_author_venue_count(papers, conference)

    sys.stderr.write('Training set...\n')
    confirmed, deleted = get_train()
    # ``with`` closes the file even when a lookup below raises.
    with open('nvenue.train.dat', 'w') as outf:
        for aid in confirmed:
            # A set makes the label test O(1) per paper instead of a list scan.
            confirmed_ids = set(confirmed[aid])
            for pid in confirmed[aid] + deleted[aid]:
                # Every pid comes from one of the two lists, so anything not
                # confirmed is necessarily a deleted paper.
                label = 'T' if pid in confirmed_ids else 'F'
                sj = _venue_count(aid2journal, journal, aid, pid)
                sc = _venue_count(aid2conference, conference, aid, pid)
                row = (aid, pid, sj, sc, label)
                outf.write(' '.join(str(field) for field in row) + '\n')

    sys.stderr.write('Validation set...\n')
    validation = get_valid()
    with open('nvenue.valid.dat', 'w') as outf:
        for aid in validation:
            for pid in validation[aid]:
                sj = _venue_count(aid2journal, journal, aid, pid)
                sc = _venue_count(aid2conference, conference, aid, pid)
                row = (aid, pid, sj, sc)
                outf.write(' '.join(str(field) for field in row) + '\n')
Пример #4
0
import sys
import gc
from copy import deepcopy
from random import choice
from numpy import log, mean, std
from scipy.misc import factorial
from scipy.sparse import coo_matrix
import scipy.sparse.linalg as sp

from common import get_author_papers, get_train, get_valid

if __name__ == '__main__':
    print >> sys.stderr, 'Reading data...'
    papers, authors = get_author_papers()
    confirmed, deleted = get_train()

    print >> sys.stderr, 'Calculating scores for train set...'
    confirmed, deleted = get_train()
    outf = open('nauthors.train.dat', 'w')
    for aid in confirmed:
        allPapers = confirmed[aid] + deleted[aid]
        for p1 in allPapers:
            if p1 in confirmed[aid]:
                tf = 'T'
            else:
                tf = 'F'
            print >> outf, aid, p1, len(authors[p1]), tf
    outf.close()

    print >> sys.stderr, 'Calculating scores for validation set...'
    validation = get_valid()