def paper_downloading(years, groups_num, group_num, directory_mag_data, field_of_study):
    """Download paper entities for this worker's share of `years`.

    The year list is split into `groups_num` contiguous chunks and worker
    `group_num` (1-based) processes only its own chunk.  Within a year, the
    results are paged `step` records at a time; pages whose pickle already
    exists on disk are skipped, so interrupted runs can be resumed.
    """
    chunk = math.ceil(len(years) / groups_num)
    first = (group_num - 1) * chunk
    last = min(group_num * chunk - 1, len(years) - 1)
    for year in years[first:last + 1]:
        count = year_num[year]
        for offset in np.arange(0, count, step):
            tag = 'paper_entities_{}_{}'.format(year, offset)
            # Resume support: an existing pickle means this page is done.
            if os.path.exists(os.path.join(directory_mag_data, tag + '.pkl')):
                print(tag, 'already exists')
                continue
            paper_entities = evaluate(field_of_study.replace('_', ' '),
                                      year, count, offset)
            if not paper_entities:
                continue
            save_pkl_file(directory_mag_data, tag, paper_entities)
import sys
sys.path.append('..')
from utils.pkl_io import open_pkl_file, save_pkl_file
from utils.directories import *
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos', default='physics', type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    year_affIds = open_pkl_file(directories.directory_data, 'year_affIds')
    years = sorted(year_affIds)
    # Cumulative union: the set for a year contains every affiliation id seen
    # in that year or any earlier one.
    year_cul_affIds = {years[0]: year_affIds[years[0]]}
    for prev_year, cur_year in zip(years, years[1:]):
        year_cul_affIds[cur_year] = year_cul_affIds[prev_year].union(
            year_affIds[cur_year])
    save_pkl_file(directories.directory_data, 'year_cul_affIds', year_cul_affIds)
def within_institution_scaling(property_x, property_y, filepath, filename, field_of_study):
    """Log-log scaling of `property_y` against `property_x` within each institution.

    For every affiliation whose size range spans at least 50, the per-year
    (x, y) pairs are regressed on log-log axes.  The exponents and R^2 values
    are plotted, written to an Excel workbook under `filepath`, and pickled.
    """
    directories = Directory(field_of_study)
    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)
    affIds = open_pkl_file(directories.directory_dataset_description, 'affIds')

    # Collect per-institution (x, y, year) triples, dropping biased points.
    affId_x_y = {}
    affId_affname = []
    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        sizes = np.array(list(affiliation.year_size.values()))
        if sizes.max() - sizes.min() < 50:
            continue
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)
        x_y_year = []
        for year in Y:
            x = X[year]
            if x < 2:
                # eliminate the biased data, e.g. institution size < 2 (institution size = 1)
                continue
            y = Y[year]
            if y == 0:
                # log0 is not defined, which will be eliminated
                continue
            x_y_year.append([x, y, year])
        affId_x_y[affId] = x_y_year
        affId_affname.append([affId, affiliation.aff_name])
    affId_affname.sort(key=lambda t: t[1])

    affId_alpha_and_R2 = [['affiliation', 'exponent', 'R2']]
    affId_alpha_and_R2_dict = {}
    # do the log-log linear regression for each institution
    valid_affIds = []
    for affId in affId_x_y:
        print('within:', property_y, property_x, affId)
        x_y = np.asarray(affId_x_y[affId])
        slope, r2, p_value, intercept, std_err = linear_regression(
            affId_x_y[affId])
        if np.isnan(slope):
            continue
        valid_affIds.append(affId)
        # FIX: open the affiliation once and reuse it; it was re-opened three
        # more times below for the same affId.
        affiliation = open_affiliation(affId, field_of_study)
        aff_name = affiliation.aff_name
        affId_alpha_and_R2.append([aff_name, slope, r2])
        affId_alpha_and_R2_dict[aff_name] = (slope, r2)
        # make the plots (linear regression); NaN slopes were skipped above,
        # so the original `not np.isnan(slope)` re-check is redundant here.
        if 'Harvard' in aff_name:
            xlabel = property_x
            ylabel = '{} in {}'.format(property_y, aff_name)
            fig_filename = '{}_vs_{}_in_{}_within'.format(
                property_y, property_x, aff_name)
            line_plot(x_y[:, 0], x_y[:, 1], slope, intercept, r2, xlabel,
                      ylabel, fig_filepath, fig_filename)

    hist_filename = '{}_vs_{}_hist_within'.format(property_y, property_x)
    affId_alpha_and_R2 = np.asarray(affId_alpha_and_R2[1:])
    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement.  Also fixed the label typo
    # '(with)' -> '(within)' to match the '(cross)' label in the sibling plot.
    histogram_plot(
        np.asarray(affId_alpha_and_R2[:, 1], dtype=float),
        r'$\alpha$ of {} vs {} (within)'.format(property_y, property_x),
        fig_filepath, hist_filename)

    # save the data
    with pd.ExcelWriter(os.path.join(filepath, filename + '.xlsx')) as writer:
        # FIX: the first column of this sheet is the affiliation name, not the
        # year (see affId_alpha_and_R2 above); the old header said 'year'.
        pd.DataFrame(affId_alpha_and_R2).to_excel(
            writer, sheet_name='alpha_and_R2_in_each_aff',
            header=['affiliation', 'alpha', 'R2'], index=False)
        for affId in valid_affIds:
            affiliation = open_affiliation(affId, field_of_study)
            aff_name = affiliation.aff_name
            # Excel caps sheet names at 31 characters; truncate defensively.
            aff_name = aff_name[:min(29, len(aff_name))]
            pd.DataFrame(affId_x_y[affId]).to_excel(
                writer, sheet_name=aff_name,
                header=[property_x, property_y, 'year'], index=False)
    save_pkl_file(filepath, filename, affId_alpha_and_R2_dict)
# Map every affiliation id to the papers it (co-)authored; each author is
# counted at most once per paper so multi-affiliation duplicates are skipped.
# NOTE(review): `parser` is created above this chunk (outside this view).
args = parser.parse_args()
print(args.fos)
directories = Directory(args.fos)
directories.refresh()
paperIds = open_pkl_file(directories.directory_dataset_description, 'paperIds')
affId_paperIds = {}
for num, paperId in enumerate(paperIds, start=1):
    paper = open_paper(paperId, args.fos)
    if num % 1000 == 0:
        print(num, '/', len(paperIds), ',',
              time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    seen_authorIds = set()
    for author in paper.authors:
        if author.authorId in seen_authorIds:
            continue
        seen_authorIds.add(author.authorId)
        affId_paperIds.setdefault(author.affId, []).append(paperId)
save_pkl_file(directories.directory_dataset_description, 'affId_paperIds',
              affId_paperIds)
save_pkl_file(directories.directory_dataset_description, 'affIds',
              list(affId_paperIds.keys()))
# NOTE(review): this chunk opens with the tail (`return True`) of a validity
# check function whose definition lies above this view (presumably the
# `is_valid_paper` used below) — it cannot be reformatted safely without the
# missing enclosing `def`.  The rest is a script that scans every pickle in
# directory_mag_data and collects the Ids of papers passing is_valid_paper
# into the 'paperIds' pickle.  Code left byte-identical.
return True if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--fos', default='physics', type=str, choices=('physics', 'cs', 'sociology', 'math'), help='field of study') args = parser.parse_args() print(args.fos) directories = Directory(args.fos) directories.refresh() valid_paperIds = set() num = 0 for filename in os.listdir(directories.directory_mag_data): paper_entities = open_pkl_file(directories.directory_mag_data, filename[0:-4]) for paper_entity in paper_entities: num += 1 if num % 1000 == 0: print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) if not is_valid_paper(paper_entity): continue valid_paperIds.add(paper_entity['Id']) save_pkl_file(directories.directory_dataset_description, 'paperIds', valid_paperIds)
def cross_institution_scaling(property_x, property_y, filepath, filename, field_of_study):
    """Log-log scaling of `property_y` against `property_x` across institutions.

    Pools the per-year (x, y) pairs of all affiliations, fits a log-log
    regression for each year, plots the exponent curve over time, and writes
    the exponents plus the raw data to an Excel workbook and a pickle.
    """
    directories = Directory(field_of_study)
    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)
    affIds = open_pkl_file(directories.directory_dataset_description, 'affIds')

    # Pool every institution's (x, y) points, keyed by year.
    year_x_y = {}
    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)
        for year in Y:
            x = X[year]
            if x < 2:
                # eliminate the biased data, e.g. institution size < 2 (institution size = 1)
                continue
            y = Y[year]
            if y == 0:
                # log0 is not defined, so we eliminate the data with y = 0
                continue
            year_x_y.setdefault(year, []).append([x, y])

    # do the log-log linear regression for each year
    years = sorted(year_x_y, reverse=True)
    year_alpha_and_r2 = [['year', 'alpha', 'R2']]
    year_alpha_and_r2_dict = {}
    for year in years:
        print('cross:', property_y, property_x, year)
        x_y = np.asarray(year_x_y[year])
        slope, r2, p_value, intercept, std_err = linear_regression(x_y)
        if np.isnan(slope):
            continue
        year_alpha_and_r2.append([year, slope, r2])
        year_alpha_and_r2_dict[year] = [slope, r2]
        # make the plots (linear regression) for a few representative years
        if ('impact' not in property_y and year == 2017) or (
                'impact' in property_y and year in [2012, 2010]):
            xlabel = property_x
            ylabel = '{} in {}'.format(property_y, year)
            fig_filename = '{}_vs_{}_in_{}_plots_cross'.format(
                property_y, property_x, year)
            line_plot(x_y[:, 0], x_y[:, 1], slope, intercept, r2,
                      xlabel, ylabel, fig_filepath, fig_filename)

    year_alpha_and_r2 = np.asarray(year_alpha_and_r2[1:])
    curve_filename = '{}_vs_{}_curve_cross'.format(property_y, property_x)
    curve_plot(year_alpha_and_r2[:, 0], year_alpha_and_r2[:, 1], 'year',
               r'$\alpha$ of {} vs {} (cross)'.format(property_y, property_x),
               fig_filepath, curve_filename)

    # save the data
    with pd.ExcelWriter(os.path.join(filepath, filename + '.xlsx')) as writer:
        pd.DataFrame(year_alpha_and_r2).to_excel(
            writer, sheet_name='exponent_and_R2_in_each_year',
            header=['year', 'alpha', 'R2'], index=False)
        for year in years:
            pd.DataFrame(year_x_y[year]).to_excel(
                writer, sheet_name=str(year),
                header=[property_x, property_y], index=False)
    save_pkl_file(filepath, filename, year_alpha_and_r2_dict)
# Extract the year, reference list and (affId, name) pairs from every valid
# paper entity in this batch, then persist the lookup tables.
# NOTE(review): the names consumed here (num, paperId_year, paperId_references,
# affId_affnames, directories, filename) are initialised above this chunk.
paper_entities = open_pkl_file(directories.directory_mag_data, filename[0:-4])
for paper_entity in paper_entities:
    if not is_valid_paper(paper_entity):
        continue
    num += 1
    if num % 1000 == 0:
        print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    paperId = paper_entity['Id']
    paperId_year[paperId] = paper_entity['Y']
    # Papers without references get an empty list.
    paperId_references[paperId] = paper_entity.get('RId', [])
    for author in paper_entity['AA']:
        affId_affnames.add((author['AfId'], author['DAfN']))
# NOTE(review): this CSV holds (affId, name) pairs but is written to
# 'paperId_year.csv' — the filename looks wrong; confirm against consumers
# before renaming.
pd.DataFrame(list(affId_affnames)).to_csv(os.path.join(
    directories.directory_dataset_description, 'paperId_year.csv'), index=False)
save_pkl_file(directories.directory_dataset_description, 'paperId_year',
              paperId_year)
save_pkl_file(directories.directory_dataset_description, 'paperId_references',
              paperId_references)
save_pkl_file(directories.directory_dataset_description, 'affId_affnames',
              affId_affnames)
# Build cited_paper -> {citing papers}, restricted to citations made within
# ten years of the cited paper's publication.
# NOTE(review): `parser` is created above this chunk (outside this view).
args = parser.parse_args()
print(args.fos)
directories = Directory(args.fos)
directories.refresh()
paperId_year = open_pkl_file(directories.directory_dataset_description,
                             'paperId_year')
paperId_references = open_pkl_file(
    directories.directory_dataset_description, 'paperId_references')
paperIds = open_pkl_file(directories.directory_dataset_description, 'paperIds')
cited_paper_citing_papers = {}
for num, citing_paperId in enumerate(paperIds, start=1):
    if num % 1000 == 0:
        print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    for cited_paperId in paperId_references[citing_paperId]:
        # Cited papers with unknown publication year cannot be windowed.
        if cited_paperId not in paperId_year:
            continue
        # Only count citations arriving within a 10-year window.
        if paperId_year[citing_paperId] - paperId_year[cited_paperId] > 10:
            continue
        cited_paper_citing_papers.setdefault(cited_paperId,
                                             set()).add(citing_paperId)
save_pkl_file(directories.directory_dataset_description,
              'cited_paper_citing_papers', cited_paper_citing_papers)
# NOTE(review): this chunk opens mid-call — the `parser.add_argument('--fos',`
# that these keyword arguments belong to lies above this view — so it cannot
# be reformatted safely without the missing opening tokens.  The rest is a
# script that records, for every author, the earliest publication year seen
# across their papers, pickled as 'authorId_first_year'.  Code left
# byte-identical.
default='physics', type=str, choices=('physics', 'cs', 'sociology', 'math'), help='field of study') args = parser.parse_args() print(args.fos) directories = Directory(args.fos) directories.refresh() paperIds = open_pkl_file(directories.directory_dataset_description, 'paperIds') authorId_first_year = {} num = 0 for paperId in paperIds: paper = open_paper(paperId, args.fos) num += 1 if num % 1000 == 0: print(num, '/', len(paperIds), ',', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) year = paper.year for author in paper.authors: authorId = author.authorId if authorId not in authorId_first_year: authorId_first_year[authorId] = year else: authorId_first_year[authorId] = min( authorId_first_year[authorId], year) save_pkl_file(directories.directory_dataset_description, 'authorId_first_year', authorId_first_year)
""" This script creates (author, institution) sequence and each (author, institution) can appear only once. """ import sys sys.path.append('..') from utils.directories import * from utils.pkl_io import open_pkl_file, save_pkl_file import time from ordered_set import OrderedSet if __name__ == '__main__': authorId_sequence = open_pkl_file(directory_urn_model, 'ordered_authorId_sequence') affId_sequence = open_pkl_file(directory_urn_model, 'ordered_affId_sequence') authorId_affId_sequence = OrderedSet() # (authorId, affId) num = 0 for i in range(len(authorId_sequence)): num += 1 if num % 1000 == 0: print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) authorId = authorId_sequence[i][0] affId = affId_sequence[i][0] authorId_affId_sequence.add((authorId, affId)) authorId_affId_sequence = list(authorId_affId_sequence) print(len(authorId_affId_sequence)) save_pkl_file(directory_urn_model, 'authorId_affId_sequence', authorId_affId_sequence)
# Build date-ordered (authorId, date) and (affId, date) event sequences from
# every valid paper; each author contributes at most one event per paper.
# NOTE(review): num, paperIds, authorId_sequence, affId_sequence and
# directories are initialised above this chunk.
for filename in os.listdir(directories.directory_mag_data):
    for paper_entity in open_pkl_file(directories.directory_mag_data,
                                      filename[0:-4]):
        num += 1
        if num % 1000 == 0:
            print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        if paper_entity['Id'] not in paperIds:
            continue
        date = paper_entity['D']
        seen_authorIds = set()
        for author in paper_entity['AA']:
            authorId = author['AuId']
            if authorId in seen_authorIds:
                continue
            seen_authorIds.add(authorId)
            authorId_sequence.append((authorId, date))
            affId_sequence.append((author['AfId'], date))
# Order both sequences chronologically by the paper date.
authorId_sequence.sort(key=lambda t: t[1])
affId_sequence.sort(key=lambda t: t[1])
save_pkl_file(directories.directory_urn_model, 'ordered_authorId_sequence',
              authorId_sequence)
save_pkl_file(directories.directory_urn_model, 'ordered_affId_sequence',
              affId_sequence)