# Entry point: download MAG paper records for one field of study, splitting
# the list of years across a pool of worker threads.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    # year -> paper count; only the keys (years) are used to partition work.
    year_num = open_pkl_file(directories.directory_dataset_description,
                             'year_papernum')
    years = list(year_num.keys())
    rd.shuffle(years)  # randomize so each worker gets a balanced mix of years
    thread_num = 20
    threads = []
    for i in range(thread_num):
        # Workers are numbered 1..thread_num; each presumably picks its share
        # of `years` from (years, thread_num, worker_id) — TODO confirm in
        # paper_downloading.
        threads.append(
            threading.Thread(target=paper_downloading,
                             args=(years, thread_num, i + 1,
                                   directories.directory_mag_data,
                                   directories.field_of_study)))
    for t in threads:
        # FIX: Thread.setDaemon() is deprecated; assign the attribute instead.
        t.daemon = True
        t.start()
    # FIX: without join() the main thread returns immediately and the daemon
    # workers are killed before downloading anything.
    for t in threads:
        t.join()
import time
import argparse

# NOTE(review): this chunk appears truncated — the loop body ends right after
# reading `author.affId`, and `affId_paperIds` is created but never populated
# or saved in the visible code. Confirm against the full file.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')
    # Presumably affId -> collection of paperIds — TODO confirm once the
    # truncated tail of this script is visible.
    affId_paperIds = {}
    num = 0
    for paperId in paperIds:
        paper = open_paper(paperId, args.fos)
        num += 1
        if num % 1000 == 0:
            # Progress heartbeat every 1000 papers.
            print(num, '/', len(paperIds), ',',
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # De-duplicate authors within a single paper.
        authorIds = set()
        for author in paper.authors:
            authorId = author.authorId
            if authorId in authorIds:
                continue
            authorIds.add(authorId)
            affId = author.affId
import sys
sys.path.append('..')
from utils.pkl_io import open_pkl_file, save_pkl_file
from utils.directories import *
import argparse

# Build, for every year, the cumulative set of institutions seen in that year
# or any earlier year, and persist it as 'year_cul_affIds'.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    year_affIds = open_pkl_file(directories.directory_data, 'year_affIds')
    years = sorted(year_affIds.keys())
    # Seed the accumulator with the earliest year, then fold each later year
    # into the running union of everything before it.
    year_cul_affIds = {years[0]: year_affIds[years[0]]}
    previous_year = years[0]
    for year in years[1:]:
        year_cul_affIds[year] = year_cul_affIds[previous_year].union(
            year_affIds[year])
        previous_year = year
    save_pkl_file(directories.directory_data, 'year_cul_affIds',
                  year_cul_affIds)
def cross_institution_scaling(property_x, property_y, filepath, filename,
                              field_of_study):
    """Fit a cross-institution scaling law, one fit per year.

    For every year, collects (x, y) pairs across all institutions — where x
    and y are the affiliation attributes named by `property_x` / `property_y`
    — runs a log-log linear regression per year, saves plots of selected
    years, a curve of the exponent over time, an Excel workbook of the raw
    pairs, and a pkl of {year: [alpha, R2]}.

    NOTE(review): `linear_regression` is assumed to return
    (slope, r2, p_value, intercept, std_err) in that order — confirm, the
    ordering is unconventional.
    """
    directories = Directory(field_of_study)
    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)
    affIds = open_pkl_file(directories.directory_dataset_description,
                           'affIds')
    # year -> list of [x, y] pairs pooled over all institutions.
    year_x_y = {}
    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        # X and Y are year-keyed mappings; X is assumed to cover every year
        # present in Y — TODO confirm (X[year] would raise KeyError otherwise).
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)
        for year in Y:
            x = X[year]
            if x < 2:
                continue  # eliminate the biased data, e.g. institution size < 2 (institution size = 1)
            y = Y[year]
            if y == 0:
                continue  # log0 is not defined, so we eliminate the data with y = 0
            if year not in year_x_y:
                year_x_y[year] = []
            year_x_y[year].append([x, y])
    # do the log-log linear regression for each year
    years = list(year_x_y.keys())
    years.sort(reverse=True)
    # First row is a header; it is stripped before the numpy conversion below.
    year_alpha_and_r2 = [['year', 'alpha', 'R2']]
    year_alpha_and_r2_dict = {}
    for year in years:
        print('cross:', property_y, property_x, year)
        x_y = year_x_y[year]
        x_y = np.asarray(x_y)
        slope, r2, p_value, intercept, std_err = linear_regression(x_y)
        if np.isnan(slope):
            continue  # degenerate fit (e.g. too few points) — skip the year
        year_alpha_and_r2.append([year, slope, r2])
        year_alpha_and_r2_dict[year] = [slope, r2]
        # make the plots (linear regression)
        # Only selected showcase years are plotted: 2017 for non-impact
        # properties, 2012/2010 for impact ones (hard-coded choices).
        if ('impact' not in property_y and year == 2017) or (
                'impact' in property_y and year in [2012, 2010]):
            xlabel = property_x
            ylabel = '{} in {}'.format(property_y, year)
            fig_filename = '{}_vs_{}_in_{}_plots_cross'.format(
                property_y, property_x, year)
            line_plot(x_y[:, 0], x_y[:, 1], slope, intercept, r2, xlabel,
                      ylabel, fig_filepath, fig_filename)
    year_alpha_and_r2 = np.asarray(year_alpha_and_r2[1:])
    curve_filename = '{}_vs_{}_curve_cross'.format(property_y, property_x)
    curve_plot(year_alpha_and_r2[:, 0], year_alpha_and_r2[:, 1], 'year',
               r'$\alpha$ of {} vs {} (cross)'.format(property_y, property_x),
               fig_filepath, curve_filename)
    # Save the per-year exponents plus one sheet of raw pairs per year.
    with pd.ExcelWriter(os.path.join(filepath, filename + '.xlsx')) as writer:
        pd.DataFrame(year_alpha_and_r2).to_excel(
            writer,
            sheet_name='exponent_and_R2_in_each_year',
            header=['year', 'alpha', 'R2'],
            index=False)
        for year in years:
            pd.DataFrame(year_x_y[year]).to_excel(
                writer,
                sheet_name=str(year),
                header=[property_x, property_y],
                index=False)
    save_pkl_file(filepath, filename, year_alpha_and_r2_dict)
def within_institution_scaling(property_x, property_y, filepath, filename,
                               field_of_study):
    """Fit a within-institution scaling law, one fit per institution.

    For every sufficiently large institution, collects its own (x, y, year)
    triples over time — x and y being the affiliation attributes named by
    `property_x` / `property_y` — runs a log-log linear regression per
    institution, plots Harvard as a showcase, draws a histogram of the
    exponents, writes an Excel workbook, and saves {aff_name: (alpha, R2)}.

    NOTE(review): `linear_regression` is assumed to return
    (slope, r2, p_value, intercept, std_err) in that order — confirm.
    """
    directories = Directory(field_of_study)
    fig_filepath = os.path.join(directories.directory_figures,
                                '{}_vs_{}'.format(property_y, property_x))
    make_dir(fig_filepath)
    affIds = open_pkl_file(directories.directory_dataset_description,
                           'affIds')
    affId_x_y = {}
    affId_affname = []
    for affId in affIds:
        affiliation = open_affiliation(affId, field_of_study)
        sizes = np.array(list(affiliation.year_size.values()))
        # Only institutions whose size varies by >= 50 over the observation
        # window give a meaningful within-institution fit.
        if sizes.max() - sizes.min() < 50:
            continue
        X = getattr(affiliation, property_x)
        Y = getattr(affiliation, property_y)
        x_y_year = []
        for year in Y:
            x = X[year]
            if x < 2:  # eliminate the biased data, e.g. institution size < 2 (institution size = 1)
                continue
            y = Y[year]
            if y == 0:  # log0 is not defined, which will be eliminated
                continue
            x_y_year.append([x, y, year])
        affId_x_y[affId] = x_y_year
        affId_affname.append([affId, affiliation.aff_name])
    affId_affname.sort(key=lambda t: t[1])
    # First row is a header; it is stripped before the numpy conversion below.
    affId_alpha_and_R2 = [['affiliation', 'exponent', 'R2']]
    affId_alpha_and_R2_dict = {}
    # do the log-log linear regression for each institution
    valid_affIds = []
    for affId in affId_x_y:
        print('within:', property_y, property_x, affId)
        x_y = np.asarray(affId_x_y[affId])
        slope, r2, p_value, intercept, std_err = linear_regression(
            affId_x_y[affId])
        if np.isnan(slope):
            continue  # degenerate fit (e.g. too few points) — skip
        valid_affIds.append(affId)
        # Hoisted: the original re-opened the same affiliation up to four
        # times in this iteration.
        affiliation = open_affiliation(affId, field_of_study)
        aff_name = affiliation.aff_name
        affId_alpha_and_R2.append([aff_name, slope, r2])
        affId_alpha_and_R2_dict[aff_name] = (slope, r2)
        # make the plots (linear regression) — Harvard is the showcase.
        # (The original also re-checked `not np.isnan(slope)` here, but the
        # `continue` above already guarantees it.)
        if 'Harvard' in aff_name:
            xlabel = property_x
            ylabel = '{} in {}'.format(property_y, aff_name)
            fig_filename = '{}_vs_{}_in_{}_within'.format(
                property_y, property_x, aff_name)
            line_plot(x_y[:, 0], x_y[:, 1], slope, intercept, r2, xlabel,
                      ylabel, fig_filepath, fig_filename)
    hist_filename = '{}_vs_{}_hist_within'.format(property_y, property_x)
    affId_alpha_and_R2 = np.asarray(affId_alpha_and_R2[1:])
    # FIX: np.float was removed in NumPy 1.20+; use the builtin float.
    # NOTE(review): "(with)" in the label below looks like a typo for
    # "(within)"; left as-is because it is a runtime string.
    histogram_plot(
        np.asarray(affId_alpha_and_R2[:, 1], dtype=float),
        r'$\alpha$ of {} vs {} (with)'.format(property_y, property_x),
        fig_filepath, hist_filename)
    # save the data
    with pd.ExcelWriter(os.path.join(filepath, filename + '.xlsx')) as writer:
        pd.DataFrame(affId_alpha_and_R2).to_excel(
            writer,
            sheet_name='alpha_and_R2_in_each_aff',
            # FIX: the first column holds affiliation names, not years.
            header=['affiliation', 'alpha', 'R2'],
            index=False)
        for affId in valid_affIds:
            affiliation = open_affiliation(affId, field_of_study)
            aff_name = affiliation.aff_name
            # Excel sheet names are limited to 31 characters; truncate.
            aff_name = aff_name[:min(29, len(aff_name))]
            pd.DataFrame(affId_x_y[affId]).to_excel(
                writer,
                sheet_name=aff_name,
                header=[property_x, property_y, 'year'],
                index=False)
    save_pkl_file(filepath, filename, affId_alpha_and_R2_dict)
    return True


# NOTE(review): the `return True` above is the tail of a function whose
# header (presumably `is_valid_paper`) lies before this chunk — not visible.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    # Collect the IDs of all papers that pass the validity filter.
    valid_paperIds = set()
    num = 0
    for filename in os.listdir(directories.directory_mag_data):
        # filename[0:-4] strips the 4-character extension to get the pkl name.
        paper_entities = open_pkl_file(directories.directory_mag_data,
                                       filename[0:-4])
        for paper_entity in paper_entities:
            num += 1
            if num % 1000 == 0:
                # Progress heartbeat every 1000 entities.
                print(num,
                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            if not is_valid_paper(paper_entity):
                continue
            valid_paperIds.add(paper_entity['Id'])
    save_pkl_file(directories.directory_dataset_description, 'paperIds',
                  valid_paperIds)
# NOTE(review): `import sys` is not visible in this chunk — presumably it
# appears just before it in the original file; confirm.
sys.path.append('..')
from utils.pkl_io import open_pkl_file, save_pkl_file
from utils.directories import *
from utils.entity_io import open_affiliation
import argparse

# Build a mapping year -> set of affIds active (i.e. having a recorded size)
# in that year, and persist it as 'year_affIds'.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    affIds = open_pkl_file(directories.directory_data, 'affiliations').affIds
    year_affIds = {}
    for affId in affIds:
        # FIX: every other call site passes the field of study as the second
        # argument (cf. open_affiliation(affId, field_of_study) elsewhere in
        # this file); the original one-argument call would raise a TypeError.
        affiliation = open_affiliation(affId, args.fos)
        year_sizes = affiliation.year_size
        for year in year_sizes:
            if year not in year_affIds:
                year_affIds[year] = set()
            year_affIds[year].add(affId)
    save_pkl_file(directories.directory_data, 'year_affIds', year_affIds)
from utils.directories import *
import argparse

# NOTE(review): this chunk appears truncated — `cited_paper_citing_papers`
# is initialized but the accumulation body and the save step are not visible.
# `time` is used below but its import is not visible here either; confirm.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    directories.refresh()
    # paperId -> publication year.
    paperId_year = open_pkl_file(directories.directory_dataset_description,
                                 'paperId_year')
    # paperId -> iterable of referenced paperIds.
    paperId_references = open_pkl_file(
        directories.directory_dataset_description, 'paperId_references')
    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')
    num = 0
    # Presumably cited paperId -> collection of citing paperIds — TODO
    # confirm once the truncated tail of this script is visible.
    cited_paper_citing_papers = {}
    for citing_paperId in paperIds:
        num += 1
        if num % 1000 == 0:
            # Progress heartbeat every 1000 papers.
            print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        cited_paperIds = paperId_references[citing_paperId]
        for cited_paperId in cited_paperIds:
            # Skip references that point outside the dataset.
            if cited_paperId not in paperId_year:
                continue
""" This script creates (author, institution) sequence and each (author, institution) can appear only once. """ import sys sys.path.append('..') from utils.directories import * from utils.pkl_io import open_pkl_file, save_pkl_file import time from ordered_set import OrderedSet if __name__ == '__main__': authorId_sequence = open_pkl_file(directory_urn_model, 'ordered_authorId_sequence') affId_sequence = open_pkl_file(directory_urn_model, 'ordered_affId_sequence') authorId_affId_sequence = OrderedSet() # (authorId, affId) num = 0 for i in range(len(authorId_sequence)): num += 1 if num % 1000 == 0: print(num, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) authorId = authorId_sequence[i][0] affId = affId_sequence[i][0] authorId_affId_sequence.add((authorId, affId)) authorId_affId_sequence = list(authorId_affId_sequence) print(len(authorId_affId_sequence)) save_pkl_file(directory_urn_model, 'authorId_affId_sequence', authorId_affId_sequence)
from utils.pkl_io import open_pkl_file, save_pkl_file
import time
import argparse

# NOTE(review): this chunk appears truncated — the loop body continues past
# the visible code; `authorId_sequence` / `affId_sequence` are initialized
# but never appended to or saved here. Also, unlike the sibling scripts,
# there is no `directories.refresh()` call — confirm that is intentional.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--fos',
                        default='physics',
                        type=str,
                        choices=('physics', 'cs', 'sociology', 'math'),
                        help='field of study')
    args = parser.parse_args()
    print(args.fos)
    directories = Directory(args.fos)
    paperIds = open_pkl_file(directories.directory_dataset_description,
                             'paperIds')
    authorId_sequence = []  # (authorId, date)
    affId_sequence = []  # (affId, date)
    num = 0
    for filename in os.listdir(directories.directory_mag_data):
        # filename[0:-4] strips the 4-character extension to get the pkl name.
        paper_entities = open_pkl_file(directories.directory_mag_data,
                                       filename[0:-4])
        for paper_entity in paper_entities:
            num += 1
            if num % 1000 == 0:
                # Progress heartbeat every 1000 entities.
                print(num,
                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            paperId = paper_entity['Id']
            # Skip entities that did not pass the validity filter.
            if paperId not in paperIds:
                continue