Пример #1
0
def paper_downloading(years, groups_num, group_num, directory_mag_data,
                      field_of_study):
    """Download and pickle the paper entities for one group of years.

    ``years`` is cut into ``groups_num`` consecutive chunks of equal length
    (the last chunk may be shorter) and only chunk number ``group_num``
    (counted from 1) is processed. For every year of that chunk the
    entities are requested page by page through ``evaluate``; each
    non-empty page is stored with ``save_pkl_file`` under the name
    ``paper_entities_<year>_<offset>``. A page whose ``.pkl`` file is
    already present in ``directory_mag_data`` is reported and skipped.

    The module-level names ``year_num`` (indexed by year; its value bounds
    the offsets) and ``step`` (distance between two offsets) are read here.

    Args:
        years: Indexable sequence of years.
        groups_num: Number of chunks ``years`` is divided into.
        group_num: Which chunk to process, starting at 1.
        directory_mag_data: Directory holding the pickled pages.
        field_of_study: Field name; underscores are turned into spaces
            before it is handed to ``evaluate``.
    """
    chunk_size = math.ceil(len(years) / groups_num)
    first_index = (group_num - 1) * chunk_size
    # Exclusive upper bound, clipped so the last chunk may be shorter.
    stop_index = min(group_num * chunk_size, len(years))

    for index in range(first_index, stop_index):
        year = years[index]
        count = year_num[year]
        for offset in np.arange(0, count, step):
            pkl_path = os.path.join(
                directory_mag_data,
                'paper_entities_{}_{}.pkl'.format(year, offset))
            if os.path.exists(pkl_path):
                # This page was stored by an earlier run.
                print('paper_entities_{}_{}'.format(year, offset),
                      'already exists')
                continue

            paper_entities = evaluate(field_of_study.replace('_', ' '), year,
                                      count, offset)
            if paper_entities:
                save_pkl_file(directory_mag_data,
                              'paper_entities_{}_{}'.format(year, offset),
                              paper_entities)
Пример #2
0
import sys
sys.path.append('..')
from utils.pkl_io import open_pkl_file, save_pkl_file
from utils.directories import *
import argparse

if __name__ == '__main__':
    # The field of study given on the command line selects the directories.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        type=str,
                        default='physics',
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()

    year_affIds = open_pkl_file(directories.directory_data, 'year_affIds')
    years = sorted(year_affIds.keys())

    # year_cul_affIds[year] is the union of year_affIds over all years up to
    # and including `year`; the earliest year simply reuses its own entry.
    year_cul_affIds = {years[0]: year_affIds[years[0]]}
    for earlier_year, later_year in zip(years, years[1:]):
        seen_so_far = year_cul_affIds[earlier_year]
        year_cul_affIds[later_year] = seen_so_far.union(
            year_affIds[later_year])

    save_pkl_file(directories.directory_data, 'year_cul_affIds',
                  year_cul_affIds)
Пример #3
0
def within_institution_scaling(property_x, property_y, filepath, filename,
                               field_of_study):
    """Fit ``property_y`` against ``property_x`` separately per institution.

    Every affiliation listed in the ``affIds`` pickle is opened with
    ``open_affiliation``. Affiliations whose ``year_size`` values span less
    than 50 (max - min) are ignored. For the remaining ones the yearly
    ``[x, y, year]`` points are collected (years with ``x < 2`` or
    ``y == 0`` are dropped) and handed to ``linear_regression``; fits whose
    slope is NaN are discarded.

    Outputs:
        * a regression plot (``line_plot``) for every fitted affiliation
          whose name contains ``'Harvard'``;
        * a histogram (``histogram_plot``) of all fitted slopes;
        * ``<filename>.xlsx`` in ``filepath`` with one summary sheet and one
          sheet of raw points per fitted affiliation;
        * a pickle (``save_pkl_file``) mapping affiliation name to
          ``(slope, r2)``.

    Args:
        property_x: Name of the affiliation attribute used as x. The
            attribute is indexed by year.
        property_y: Name of the affiliation attribute used as y. Iterating
            it yields the years.
        filepath: Directory for the ``.xlsx`` file and the pickle.
        filename: Base name (without extension) of both output files.
        field_of_study: Passed to ``Directory`` and ``open_affiliation``.
    """
    directories = Directory(field_of_study)

    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)

    affIds = open_pkl_file(directories.directory_dataset_description, 'affIds')
    affId_x_y = {}
    # affId -> aff_name, remembered here so that no affiliation has to be
    # opened a second time further down.
    affId_affname = {}

    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        sizes = np.array(list(affiliation.year_size.values()))
        if sizes.max() - sizes.min() < 50:
            continue
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)

        x_y_year = []
        for year in Y:
            x = X[year]
            if x < 2:  # eliminate the biased data, e.g. institution size < 2 (institution size = 1)
                continue
            y = Y[year]
            if y == 0:  # log(0) is undefined, so these points are dropped
                continue
            x_y_year.append([x, y, year])

        affId_x_y[affId] = x_y_year
        affId_affname[affId] = affiliation.aff_name

    affId_alpha_and_R2 = []  # rows of [aff_name, slope, r2]
    affId_alpha_and_R2_dict = {}

    # do the log-log linear regression for each institution
    valid_affIds = []
    for affId, x_y_year in affId_x_y.items():
        print('within:', property_y, property_x, affId)
        slope, r2, p_value, intercept, std_err = linear_regression(x_y_year)
        if np.isnan(slope):
            continue
        valid_affIds.append(affId)
        aff_name = affId_affname[affId]

        affId_alpha_and_R2.append([aff_name, slope, r2])
        affId_alpha_and_R2_dict[aff_name] = (slope, r2)

        # make the plots (linear regression)
        if 'Harvard' in aff_name:
            x_y = np.asarray(x_y_year)
            xlabel = property_x
            ylabel = '{} in {}'.format(property_y, aff_name)
            fig_filename = '{}_vs_{}_in_{}_within'.format(
                property_y, property_x, aff_name)
            line_plot(x_y[:, 0], x_y[:, 1], slope, intercept, r2, xlabel,
                      ylabel, fig_filepath, fig_filename)

    hist_filename = '{}_vs_{}_hist_within'.format(property_y, property_x)
    affId_alpha_and_R2 = np.asarray(affId_alpha_and_R2)
    # The builtin ``float`` replaces the ``np.float`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    histogram_plot(
        np.asarray(affId_alpha_and_R2[:, 1], dtype=float),
        r'$\alpha$ of {} vs {} (with)'.format(property_y, property_x),
        fig_filepath, hist_filename)

    # save the data
    with pd.ExcelWriter(os.path.join(filepath, filename + '.xlsx')) as writer:
        # NOTE(review): the first column holds affiliation names although its
        # header says 'year'; left unchanged so existing readers of the
        # workbook keep working.
        pd.DataFrame(affId_alpha_and_R2).to_excel(
            writer,
            sheet_name='alpha_and_R2_in_each_aff',
            header=['year', 'alpha', 'R2'],
            index=False)
        for affId in valid_affIds:
            # Sheet names are cut to 29 characters (Excel caps them at 31).
            # NOTE(review): two long names sharing the same first 29
            # characters would map to the same sheet name.
            aff_name = affId_affname[affId][:29]
            pd.DataFrame(affId_x_y[affId]).to_excel(
                writer,
                sheet_name=aff_name,
                header=[property_x, property_y, 'year'],
                index=False)

    save_pkl_file(filepath, filename, affId_alpha_and_R2_dict)
    # NOTE(review): from here on the statements read like the body of a
    # separate command-line script (they rely on a `parser` object created
    # elsewhere) - confirm where this section really belongs.
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()

    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')
    # affId -> list of paperIds credited to that affiliation
    affId_paperIds = {}
    num = 0
    for paperId in paperIds:
        paper = open_paper(paperId, args.fos)
        num += 1
        # progress report every 1000 papers
        if num % 1000 == 0:
            print(num, '/', len(paperIds), ',',
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # authors already handled for this paper; a repeated authorId is
        # skipped, so each author contributes at most once per paper
        authorIds = set()
        for author in paper.authors:
            authorId = author.authorId
            if authorId in authorIds:
                continue
            authorIds.add(authorId)
            affId = author.affId
            if affId not in affId_paperIds:
                affId_paperIds[affId] = []
            # appended once per distinct author, so the same paperId can
            # occur several times in one affiliation's list
            affId_paperIds[affId].append(paperId)

    # store the mapping and, separately, the list of its keys
    save_pkl_file(directories.directory_dataset_description, 'affId_paperIds',
                  affId_paperIds)
    save_pkl_file(directories.directory_dataset_description, 'affIds',
                  list(affId_paperIds.keys()))
    return True


if __name__ == '__main__':
    # Collect the ids of all valid papers found in the pickled pages of the
    # mag-data directory and store them as the 'paperIds' pickle.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()

    valid_paperIds = set()
    num = 0
    for filename in os.listdir(directories.directory_mag_data):
        # open_pkl_file expects the name without its '.pkl' suffix. Entries
        # that are not pickle files (hidden files, sub-directories, ...) are
        # skipped instead of having four characters blindly cut off.
        pkl_name, extension = os.path.splitext(filename)
        if extension != '.pkl':
            continue
        paper_entities = open_pkl_file(directories.directory_mag_data,
                                       pkl_name)
        for paper_entity in paper_entities:
            num += 1
            # progress report every 1000 entities (valid or not)
            if num % 1000 == 0:
                print(num, time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime()))
            if not is_valid_paper(paper_entity):
                continue
            valid_paperIds.add(paper_entity['Id'])
    save_pkl_file(directories.directory_dataset_description, 'paperIds',
                  valid_paperIds)
Пример #6
0
def cross_institution_scaling(property_x, property_y, filepath, filename,
                              field_of_study):
    """Fit ``property_y`` against ``property_x`` across institutions per year.

    The ``[x, y]`` points of all affiliations from the ``affIds`` pickle are
    pooled by year (points with ``x < 2`` or ``y == 0`` are left out). Each
    year, newest first, is passed to ``linear_regression``; years whose
    slope is NaN are discarded.

    Outputs:
        * a regression plot (``line_plot``) for 2017, or for 2012 and 2010
          when ``property_y`` contains ``'impact'``;
        * a curve (``curve_plot``) of the slope over the fitted years;
        * ``<filename>.xlsx`` in ``filepath`` with a summary sheet and one
          sheet of raw points per year;
        * a pickle (``save_pkl_file``) mapping year to ``[slope, r2]``.

    Args:
        property_x: Name of the affiliation attribute used as x. The
            attribute is indexed by year.
        property_y: Name of the affiliation attribute used as y. Iterating
            it yields the years.
        filepath: Directory for the ``.xlsx`` file and the pickle.
        filename: Base name (without extension) of both output files.
        field_of_study: Passed to ``Directory`` and ``open_affiliation``.
    """
    directories = Directory(field_of_study)

    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)

    # Pool the points of every institution under their year.
    affIds = open_pkl_file(directories.directory_dataset_description, 'affIds')
    year_x_y = {}
    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)

        for year in Y:
            x = X[year]
            if x < 2:
                # biased data, e.g. institution size < 2 (institution size = 1)
                continue
            y = Y[year]
            if y == 0:
                # log(0) is undefined, so points with y = 0 are left out
                continue
            year_x_y.setdefault(year, []).append([x, y])

    # One log-log linear regression per year, newest year first.
    years = sorted(year_x_y, reverse=True)
    fitted_rows = []  # rows of [year, slope, r2]
    year_alpha_and_r2_dict = {}
    for year in years:
        print('cross:', property_y, property_x, year)
        points = np.asarray(year_x_y[year])
        slope, r2, p_value, intercept, std_err = linear_regression(points)
        if np.isnan(slope):
            continue
        fitted_rows.append([year, slope, r2])
        year_alpha_and_r2_dict[year] = [slope, r2]

        # Only a few hand-picked years get a regression plot.
        if 'impact' in property_y:
            wants_plot = year in [2012, 2010]
        else:
            wants_plot = year == 2017
        if wants_plot:
            fig_filename = '{}_vs_{}_in_{}_plots_cross'.format(
                property_y, property_x, year)
            line_plot(points[:, 0], points[:, 1], slope, intercept, r2,
                      property_x, '{} in {}'.format(property_y, year),
                      fig_filepath, fig_filename)

    year_alpha_and_r2 = np.asarray(fitted_rows)
    curve_filename = '{}_vs_{}_curve_cross'.format(property_y, property_x)
    curve_title = r'$\alpha$ of {} vs {} (cross)'.format(
        property_y, property_x)
    curve_plot(year_alpha_and_r2[:, 0], year_alpha_and_r2[:, 1], 'year',
               curve_title, fig_filepath, curve_filename)

    workbook_path = os.path.join(filepath, filename + '.xlsx')
    with pd.ExcelWriter(workbook_path) as writer:
        summary = pd.DataFrame(year_alpha_and_r2)
        summary.to_excel(writer,
                         sheet_name='exponent_and_R2_in_each_year',
                         header=['year', 'alpha', 'R2'],
                         index=False)
        # Raw points of every year, including years whose fit was discarded.
        for year in years:
            raw_points = pd.DataFrame(year_x_y[year])
            raw_points.to_excel(writer,
                                sheet_name=str(year),
                                header=[property_x, property_y],
                                index=False)

    save_pkl_file(filepath, filename, year_alpha_and_r2_dict)
Пример #7
0
        # NOTE(review): this section starts inside a loop whose header is not
        # visible; `filename`, `num`, `paperId_year`, `paperId_references`
        # and `affId_affnames` are expected to be set up before it.
        paper_entities = open_pkl_file(directories.directory_mag_data,
                                       filename[0:-4])
        for paper_entity in paper_entities:
            if not is_valid_paper(paper_entity):
                continue
            num += 1
            # progress report every 1000 valid papers
            if num % 1000 == 0:
                print(num, time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime()))
            paperId = paper_entity['Id']
            year = paper_entity['Y']
            # an entity without an 'RId' key gets an empty reference list
            references = paper_entity['RId'] if 'RId' in paper_entity else []
            paperId_year[paperId] = year
            paperId_references[paperId] = references

            # remember every (affiliation id, affiliation name) pair seen
            for author in paper_entity['AA']:
                affId = author['AfId']
                aff_name = author['DAfN']
                affId_affnames.add((affId, aff_name))

    # NOTE(review): the rows written here are the (affId, aff_name) pairs,
    # yet the file is called 'paperId_year.csv' - confirm the intended name.
    pd.DataFrame(list(affId_affnames)).to_csv(os.path.join(
        directories.directory_dataset_description, 'paperId_year.csv'),
                                              index=False)

    save_pkl_file(directories.directory_dataset_description, 'paperId_year',
                  paperId_year)
    save_pkl_file(directories.directory_dataset_description,
                  'paperId_references', paperId_references)
    save_pkl_file(directories.directory_dataset_description, 'affId_affnames',
                  affId_affnames)
    # NOTE(review): the statements below rely on a `parser` object created
    # elsewhere; they build the mapping cited paper -> set of citing papers.
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()

    paperId_year = open_pkl_file(directories.directory_dataset_description,
                                 'paperId_year')
    paperId_references = open_pkl_file(
        directories.directory_dataset_description, 'paperId_references')
    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')

    num = 0
    # cited paperId -> set of paperIds that cite it
    cited_paper_citing_papers = {}
    for citing_paperId in paperIds:
        num += 1
        # progress report every 1000 papers
        if num % 1000 == 0:
            print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        cited_paperIds = paperId_references[citing_paperId]
        for cited_paperId in cited_paperIds:
            # references without a known year are ignored
            # (presumably papers outside the dataset - verify)
            if cited_paperId not in paperId_year:
                continue
            # citations made more than 10 years after the cited paper's
            # year are not counted
            if paperId_year[citing_paperId] - paperId_year[cited_paperId] > 10:
                continue
            if cited_paperId not in cited_paper_citing_papers:
                cited_paper_citing_papers[cited_paperId] = set()
            cited_paper_citing_papers[cited_paperId].add(citing_paperId)

    save_pkl_file(directories.directory_dataset_description,
                  'cited_paper_citing_papers', cited_paper_citing_papers)
Пример #9
0
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    # Record, for every author, the earliest year among that author's papers.
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()

    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')
    # authorId -> smallest paper.year seen so far for that author
    authorId_first_year = {}
    num = 0
    for paperId in paperIds:
        paper = open_paper(paperId, args.fos)
        num += 1
        # progress report every 1000 papers
        if num % 1000 == 0:
            print(num, '/', len(paperIds), ',',
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        year = paper.year
        for author in paper.authors:
            authorId = author.authorId
            if authorId not in authorId_first_year:
                authorId_first_year[authorId] = year
            else:
                # keep the earlier of the stored year and this paper's year
                authorId_first_year[authorId] = min(
                    authorId_first_year[authorId], year)

    save_pkl_file(directories.directory_dataset_description,
                  'authorId_first_year', authorId_first_year)
"""
This script builds the sequence of (author, institution) pairs; each pair appears only once.
"""
import sys
sys.path.append('..')
from utils.directories import *
from utils.pkl_io import open_pkl_file, save_pkl_file
import time
from ordered_set import OrderedSet

if __name__ == '__main__':
    # Both sequences are read by the same position, so entry i of one is
    # paired with entry i of the other.
    authorId_sequence = open_pkl_file(directory_urn_model,
                                      'ordered_authorId_sequence')
    affId_sequence = open_pkl_file(directory_urn_model,
                                   'ordered_affId_sequence')

    # Unique (authorId, affId) pairs collected in an OrderedSet.
    authorId_affId_sequence = OrderedSet()

    num = 0
    for num, author_entry in enumerate(authorId_sequence, start=1):
        # progress report every 1000 entries
        if num % 1000 == 0:
            print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # Only the first field of each entry (the id) is used.
        authorId = author_entry[0]
        affId = affId_sequence[num - 1][0]
        authorId_affId_sequence.add((authorId, affId))

    authorId_affId_sequence = list(authorId_affId_sequence)
    print(len(authorId_affId_sequence))
    save_pkl_file(directory_urn_model, 'authorId_affId_sequence',
                  authorId_affId_sequence)
    # NOTE(review): this section expects `directories`, `paperIds`, `num`,
    # `authorId_sequence` and `affId_sequence` to exist already; their setup
    # is not visible here. It appends one (id, date) entry per distinct
    # author of every selected paper to both sequences.
    for filename in os.listdir(directories.directory_mag_data):
        paper_entities = open_pkl_file(directories.directory_mag_data,
                                       filename[0:-4])
        for paper_entity in paper_entities:
            num += 1
            # progress report every 1000 entities
            if num % 1000 == 0:
                print(num, time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime()))
            paperId = paper_entity['Id']
            # only papers contained in paperIds are used
            if paperId not in paperIds:
                continue
            date = paper_entity['D']
            authors = paper_entity['AA']
            # a repeated authorId within one paper is skipped
            authorIds = set()
            for author in authors:
                authorId = author['AuId']
                if authorId in authorIds:
                    continue
                authorIds.add(authorId)
                affId = author['AfId']
                # both lists grow in lockstep and share the same date
                authorId_sequence.append((authorId, date))
                affId_sequence.append((affId, date))

    # Both lists are sorted by date only. list.sort is stable, so entries
    # appended together should still share the same position afterwards.
    authorId_sequence.sort(key=lambda t: t[1])
    affId_sequence.sort(key=lambda t: t[1])

    save_pkl_file(directories.directory_urn_model, 'ordered_authorId_sequence',
                  authorId_sequence)
    save_pkl_file(directories.directory_urn_model, 'ordered_affId_sequence',
                  affId_sequence)