import os
import gzip
import json
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd
from scipy import sparse
import scipy.sparse as spsparse
from sklearn.preprocessing import normalize
from tqdm import tqdm

# NOTE: the helper functions used below (check4columns, isin_sorted,
# dataframe2bipartite, project_bipartite_mat, groupby_count,
# field_citation_distance) and the pySciSciMetricError exception come from the
# surrounding pySciSci package and are assumed to be in scope.


def coauthorship_network(paa_df, focus_author_ids=None, focus_constraint='authors',
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the co-authorship network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the
                co-authorships between authors in the set.
            `publications` : the publication history of the `focus_author_ids' defines
                the edge set, giving the co-authorships where at least one author from
                `focus_author_ids' was involved.
            `ego` : the `focus_author_ids' defines a seed set, such that all authors
                must have co-authored at least one publication with an author from
                `focus_author_ids', but co-authorships are also found between the
                second-order author sets.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-authorship network.

    author2int : dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.
    """
    required_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                                        focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values, focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                                        focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                                              focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

    # map authors to the row/column of the adjacency matrix
    author2int = {aid: i for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))}
    Nauthors = paa_df['AuthorId'].nunique()

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        if author_list.shape[0] >= 2:
            for ia, ja in combinations(author_list, 2):
                adj_mat[author2int[ia], author2int[ja]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations', leave=True, disable=not show_progress)

    # go through all publications and apply the coauthorship edge generator
    paa_df.groupby('PublicationId')['AuthorId'].progress_apply(coauthor_cluster)

    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
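# Illustrative usage sketch (not part of the library): a toy paa_df with the
# 'AuthorId' and 'PublicationId' columns the function expects.  Authors 2 and 3
# co-author two publications, so their edge weight in the network is 2.
def _example_coauthorship_network():
    import pandas as pd

    paa_df = pd.DataFrame({'AuthorId':      [1, 2, 2, 3, 2, 3],
                           'PublicationId': [10, 10, 11, 11, 12, 12]})

    adj_mat, author2int = coauthorship_network(paa_df)

    # authors 2 and 3 co-authored publications 11 and 12
    assert adj_mat.tocsr()[author2int[2], author2int[3]] == 2
    return adj_mat, author2int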
def raostriling_interdisciplinarity(pub2ref_df, pub2field_df, focus_pub_ids=None,
                                    pub2field_norm=True, temporal=False,
                                    citation_direction='references',
                                    field_distance_metric='cosine',
                                    distance_matrix=None, show_progress=False):
    """
    Calculate the Rao-Stirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    :param pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.

    :param focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds for which to calculate the interdisciplinarity.

    :param pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.
        Normalizes the membership vector so it sums to 1 for each publication.

    :param temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    :param citation_direction : str, default `references`
        `references` : the fields are defined by a publication's references.
        `citations` : the fields are defined by a publication's citations.

    :param field_distance_metric : str, default `cosine`
        The interfield distance metric.  Valid entries come from
        sklearn.metrics.pairwise_distances: 'cosine', 'euclidean', 'l1', 'l2', etc.

    :param distance_matrix : numpy array, default None
        The precomputed field distance matrix.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'PublicationId', 'RaoStirling'
    """

    # map citing and cited to the source and target depending on which direction
    # was specified by `citation_direction'
    if citation_direction == 'references':
        pub2ref_rename_dict = {'CitedPublicationId': 'TargetId',
                               'CitingPublicationId': 'SourceId'}
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {'CitedPublicationId': 'SourceId',
                               'CitingPublicationId': 'TargetId'}
        year_col = 'CitedYear'

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # if no precomputed distance matrix was passed, compute the field citation distance
    if distance_matrix is None:
        distance_matrix = field_citation_distance(pub2ref_df, pub2field_df,
                                                  pub2field_norm, temporal,
                                                  citation_direction,
                                                  field_distance_metric,
                                                  show_progress)

    field2int = {fid: i for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))}
    pub2field_df['FieldId'] = [field2int[fid] for fid in pub2field_df['FieldId'].values]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    if focus_pub_ids is not None:
        pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['SourceId'].values, focus_pub_ids)]

    if temporal:
        years = np.sort(pub2ref_df[year_col].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    # check that the precomputed distance matrix is in the expected format and of the correct size
    if isinstance(distance_matrix, pd.DataFrame) and temporal:
        check4columns(distance_matrix, ['iFieldId', 'jFieldId', year_col, 'FieldDistance'])
        distance_matrix = distance_matrix.loc[isin_sorted(distance_matrix[year_col].values,
                                                          years)].copy(deep=True)

        distance_matrix['iFieldId'] = [field2int.get(fid, None)
                                       for fid in distance_matrix['iFieldId'].values]
        distance_matrix['jFieldId'] = [field2int.get(fid, None)
                                       for fid in distance_matrix['jFieldId'].values]
        distance_matrix.dropna(inplace=True)

        tdm = np.zeros((Nyears, Nfields, Nfields))
        for y in years:
            tdm[year2int[y]] = dataframe2bipartite(df=distance_matrix[distance_matrix[year_col] == y],
                                                   rowname='iFieldId', colname='jFieldId',
                                                   shape=(Nfields, Nfields),
                                                   weightname='FieldDistance').todense()
            tdm[year2int[y]] = tdm[year2int[y]] + tdm[year2int[y]].T

        distance_matrix = tdm

    elif isinstance(distance_matrix, pd.DataFrame) and not temporal:
        check4columns(distance_matrix, ['iFieldId', 'jFieldId', 'FieldDistance'])
        distance_matrix = distance_matrix.copy(deep=True)
        distance_matrix['iFieldId'] = [field2int.get(fid, None)
                                       for fid in distance_matrix['iFieldId'].values]
        distance_matrix['jFieldId'] = [field2int.get(fid, None)
                                       for fid in distance_matrix['jFieldId'].values]
        distance_matrix.dropna(inplace=True)

        distance_matrix = dataframe2bipartite(df=distance_matrix,
                                              rowname='iFieldId', colname='jFieldId',
                                              shape=(Nfields, Nfields),
                                              weightname='FieldDistance').todense()
        distance_matrix = distance_matrix + distance_matrix.T

    elif isinstance(distance_matrix, np.ndarray):
        if not temporal and distance_matrix.shape != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed distance_matrix is of the wrong size to compute the '
                'RaoStirling interdisciplinarity for the publications passed.')
        elif temporal and distance_matrix.shape != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed distance_matrix is of the wrong size to compute the '
                'RaoStirling interdisciplinarity for the publications and years passed.')

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby('PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [1.0 / pub2nfields[pid]
                                            for pid in pub2field_df['PublicationId'].values]

    # merge the references to the fields of the target publications
    pub2ref_df = pub2ref_df.merge(pub2field_df, how='left', left_on='TargetId',
                                  right_on='PublicationId').rename(
        columns={'FieldId': 'TargetFieldId',
                 'PubFieldContribution': 'TargetPubFieldContribution'})
    del pub2ref_df['PublicationId']

    pub2ref_df.dropna(inplace=True)

    # now we start on the Rao-Stirling calculation
    if temporal:
        rsdf = []
        for y, ydf in pub2ref_df.groupby(year_col):
            # for each year, we need to map individual publications to the rows of our matrix
            ypub2int = {pid: i for i, pid in enumerate(np.sort(ydf['SourceId'].unique()))}
            yint2pub = {i: pid for pid, i in ypub2int.items()}
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(df=ydf, rowname='SourceId',
                                                 colname='TargetFieldId',
                                                 shape=(yNpubs, Nfields),
                                                 weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication-to-field vectors are normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure
            yrsdf = pd.DataFrame()
            yrsdf['PublicationId'] = [yint2pub[i] for i in np.sort(ydf['SourceId'].unique())]
            yrsdf[year_col] = y
            yrsdf['RaoStirling'] = 0.5 * np.squeeze(np.asarray(
                ypub2field_mat.dot(spsparse.csr_matrix(distance_matrix[year2int[y]]))
                .multiply(ypub2field_mat).sum(axis=1)))

            rsdf.append(yrsdf)

        rsdf = pd.concat(rsdf)

        return rsdf

    else:
        # first map individual publications to the rows of our matrix
        pub2int = {pid: i for i, pid in enumerate(np.sort(pub2ref_df['SourceId'].unique()))}
        int2pub = {i: pid for pid, i in pub2int.items()}
        pub2ref_df['SourceId'] = [pub2int[pid] for pid in pub2ref_df['SourceId'].values]
        pub2ref_df[['SourceId', 'TargetFieldId']] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(df=pub2ref_df, rowname='SourceId',
                                            colname='TargetFieldId',
                                            shape=(Npubs, Nfields),
                                            weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication-to-field vectors are normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        distance_matrix = spsparse.csr_matrix(distance_matrix)

        # finally, we calculate the matrix representation of the RS measure
        rsdf = pd.DataFrame()
        rsdf['RaoStirling'] = 0.5 * np.squeeze(np.asarray(
            spsparse.csr_matrix.multiply(pub2field_mat.dot(distance_matrix),
                                         pub2field_mat).sum(axis=1)))
        rsdf['PublicationId'] = [int2pub[i] for i in np.sort(pub2ref_df['SourceId'].unique())]

        return rsdf
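# Illustrative usage sketch (not part of the library) with a precomputed field
# distance matrix whose rows/columns follow the sorted FieldIds.  For reference
# shares p and field distances d, the index is RS = 0.5 * sum_{j,k} d_jk * p_j * p_k.
def _example_raostirling():
    import numpy as np
    import pandas as pd

    # publication 1 cites publications 2 and 3, which sit in two different fields
    pub2ref_df = pd.DataFrame({'CitingPublicationId': [1, 1],
                               'CitedPublicationId': [2, 3]})
    pub2field_df = pd.DataFrame({'PublicationId': [2, 3], 'FieldId': [10, 20]})

    # symmetric interfield distance matrix with a zero diagonal
    distance_matrix = np.array([[0.0, 0.8],
                                [0.8, 0.0]])

    rsdf = raostriling_interdisciplinarity(pub2ref_df, pub2field_df,
                                           distance_matrix=distance_matrix)
    # p = (0.5, 0.5), so RaoStirling = 0.5 * 2 * (0.5 * 0.5 * 0.8) = 0.2
    return rsdf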
def credit_share(focus_pid, pub2ref_df, pub2author_df, temporal=False, normed=False,
                 show_progress=False):
    """
    Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.

    Parameters
    ----------
    :param focus_pid : int, str
        The focus publication id.

    :param pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2author_df : DataFrame
        A DataFrame with the author information for each Publication.

    :param temporal : bool, default False
        If True, break down the credit share by citing year, cumulative over the years.

    :param normed : bool, default False
        Normalize the sum of the credit share to 1.0.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    credit_share : numpy array
        If temporal == False:
            The credit share vector, with one entry per author of the focus publication.
        If temporal == True:
            A matrix of credit shares with a row for each author of the focus
            publication and a column for each citing year (cumulative).

    author2int : dict
        A mapping of the AuthorIds from the focus publication to the entry of the
        credit share vector, or the row of the credit share matrix (see above).

    years : numpy array
        Only returned when temporal == True: the sorted citing years corresponding
        to the columns of the credit share matrix.
    """
    # the focus publication's authors
    focus_authors = np.sort(pub2author_df.loc[pub2author_df['PublicationId'] == focus_pid]['AuthorId'].unique())
    author2int = {aid: i for i, aid in enumerate(focus_authors)}

    if focus_authors.shape[0] > 1:
        # start by getting the co-citation network around the focus publication
        adj_mat, cited2int = cocitation_network(pub2ref_df, focus_pub_ids=np.sort([focus_pid]),
                                                focus_constraint='egocited', temporal=temporal,
                                                show_progress=show_progress)

        # get the authorships for the publications in the co-citation network
        cocited_pubs = np.sort(list(cited2int.keys()))
        pa_df = pub2author_df.loc[isin_sorted(pub2author_df['PublicationId'].values, cocited_pubs)]

        if cocited_pubs.shape[0] > 0:
            # the credit allocation matrix has a row for each focus author,
            # and a column for each cocited publication (including the focus pub)
            credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]),
                                             dtype=float)

            # for each cocited publication, we count the number of authors
            # and assign to each focus author their fractional share of the credit
            # (1 divided by the number of authors)
            for cocitedid, adf in pa_df.groupby('PublicationId'):
                author2row = [author2int[aid] for aid in adf['AuthorId'].unique()
                              if author2int.get(aid, None) is not None]

                if len(author2row) > 0:
                    credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0 / adf['AuthorId'].nunique()

            if temporal:
                # temporal credit allocation - broken down by year

                # we need the temporal citations to the focus article
                focus_citations = groupby_count(
                    pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values,
                                               np.sort([focus_pid]))],
                    colgroupby='CitingYear', colcountby='CitingPublicationId',
                    count_unique=True, show_progress=False)
                focus_citations = {y: c for y, c in
                                   focus_citations[['CitingYear', 'CitingPublicationIdCount']].values}

                # when temporal is True, a temporal adjacency matrix is returned
                # where each key is the year
                years = np.sort(list(adj_mat.keys()))

                cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float)

                for iy, y in enumerate(years):
                    # set the off-diagonal to be the total co-citations from that year
                    cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()
                    # set the diagonal to be the total citations from that year
                    cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y]

                cocite_counts = cocite_counts.cumsum(axis=0)

            else:
                # just do credit allocation with the full co-citation matrix
                cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense()

                # the co-citation matrix misses the number of citations to the focus publication,
                # so explicitly calculate the number of citations to the focus publication
                cocite_counts[0, cited2int[focus_pid]] = pub2ref_df.loc[isin_sorted(
                    pub2ref_df['CitedPublicationId'].values,
                    np.sort([focus_pid]))]['CitingPublicationId'].nunique()

            # credit share is the matrix product of the credit_allocation_mat with cocite_counts
            credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T)))

            # normalize the credit share vector to sum to 1
            if normed:
                credit_share = credit_share / credit_share.sum(axis=0)

            if temporal:
                return credit_share, author2int, years
            else:
                return credit_share, author2int
        else:
            if temporal:
                years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
                return np.array([[None for y in years] for a in author2int]), author2int, years
            else:
                return np.array([None for a in author2int]), author2int

    elif focus_authors.shape[0] == 1:
        if temporal:
            years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
            return np.ones(shape=(1, years.shape[0])), author2int, years
        else:
            return np.array([1.0]), author2int
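# Illustrative usage sketch (not part of the library): publication 100 is written
# by authors 1 and 2, and is always co-cited with publication 200, which author 2
# wrote alone, so author 2 accrues the larger credit share.
def _example_credit_share():
    import pandas as pd

    pub2ref_df = pd.DataFrame({'CitingPublicationId': [300, 300, 400, 400],
                               'CitedPublicationId': [100, 200, 100, 200],
                               'CitingYear': [2019, 2019, 2020, 2020]})
    pub2author_df = pd.DataFrame({'PublicationId': [100, 100, 200],
                                  'AuthorId': [1, 2, 2]})

    shares, author2int = credit_share(100, pub2ref_df, pub2author_df, normed=True)

    # author 2 also wrote the co-cited publication 200: shares == [0.25, 0.75]
    assert shares[author2int[2]] > shares[author2int[1]]
    return shares, author2int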
def cocitation_network(pub2ref_df, focus_pub_ids=None, focus_constraint='citing',
                       temporal=False, show_progress=False):
    """
    Create the co-citation network.

    Parameters
    ----------
    :param pub2ref_df : DataFrame
        A DataFrame with the citation links between publications.

    :param focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the co-citation network.

    :param focus_constraint : str, default `citing`
        If focus_pub_ids is not None:
            `citing` : the `focus_pub_ids' defines the citation set, giving only the
                co-citations between the references of the publications from this set.
            `cited` : the `focus_pub_ids' defines the co-citation node set.
            `egocited` : the `focus_pub_ids' defines a seed set, such that all other
                publications must have been co-cited with at least one publication
                from this set.

    :param temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-citation network.
        If temporal == True:
            A dictionary with a key for each year, and values of the adjacency matrix
            for the co-citation network induced by citing publications in that year.

    pub2int : dict
        A mapping of PublicationIds to the row/column of the adjacency matrix.
    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['CitingPublicationId'].values,
                                                    focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values,
                                                    focus_pub_ids)]

        elif focus_constraint == 'egocited':
            # take all publications that cite one of the publications in `focus_pub_ids'
            focus_citing_pubs = np.sort(pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values,
                focus_pub_ids)]['CitingPublicationId'].unique())
            # then take all the links that have a citing publication from the `focus_citing_pubs'
            pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['CitingPublicationId'].values,
                                                    focus_citing_pubs)]
            del focus_citing_pubs

    pub2ref_df.drop_duplicates(subset=['CitingPublicationId', 'CitedPublicationId'],
                               inplace=True)

    if pub2ref_df.shape[0] > 0:
        # map cited publications to the rows of the bipartite adjacency matrix
        cited2int = {pid: i for i, pid in
                     enumerate(np.sort(pub2ref_df['CitedPublicationId'].unique()))}
        Ncited = pub2ref_df['CitedPublicationId'].nunique()
        pub2ref_df['CitedPublicationId'] = [cited2int[pid]
                                            for pid in pub2ref_df['CitedPublicationId'].values]

        # map citing publications to the columns of the bipartite adjacency matrix
        citing2int = {pid: i for i, pid in
                      enumerate(np.sort(pub2ref_df['CitingPublicationId'].unique()))}
        Nciting = pub2ref_df['CitingPublicationId'].nunique()
        pub2ref_df['CitingPublicationId'] = [citing2int[pid]
                                             for pid in pub2ref_df['CitingPublicationId'].values]

        if temporal:
            years = np.sort(pub2ref_df['CitingYear'].unique())

            temporal_adj = {}
            for y in years:
                bipartite_adj = dataframe2bipartite(pub2ref_df.loc[pub2ref_df['CitingYear'] == y],
                                                    'CitedPublicationId', 'CitingPublicationId',
                                                    (Ncited, Nciting))

                adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

                # remove diagonal entries
                adj_mat.setdiag(0)
                adj_mat.eliminate_zeros()

                temporal_adj[y] = adj_mat

            return temporal_adj, cited2int

        else:
            bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                                'CitingPublicationId', (Ncited, Nciting))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            return adj_mat, cited2int

    else:
        return spsparse.coo_matrix((0, 0)), {}
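# Illustrative usage sketch (not part of the library): publications 1 and 2 are
# both cited by publications 3 and 4, so they are co-cited twice.
def _example_cocitation_network():
    import pandas as pd

    pub2ref_df = pd.DataFrame({'CitingPublicationId': [3, 3, 4, 4],
                               'CitedPublicationId': [1, 2, 1, 2]})

    adj_mat, cited2int = cocitation_network(pub2ref_df)

    assert adj_mat.tocsr()[cited2int[1], cited2int[2]] == 2
    return adj_mat, cited2int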
def coauthorship_network(paa_df, focus_author_ids=None, focus_constraint='authors',
                         temporal=False, show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the co-authorship network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the
                co-authorships between authors in the set.
            `publications` : the publication history of the `focus_author_ids' defines
                the edge set, giving the co-authorships where at least one author from
                `focus_author_ids' was involved.
            `ego` : the `focus_author_ids' defines a seed set, such that all authors
                must have co-authored at least one publication with an author from
                `focus_author_ids', but co-authorships are also found between the
                second-order author sets.

    :param temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-authorship network.
        If temporal == True:
            A dictionary with a key for each year, and values of the adjacency matrix
            for the co-authorship network induced by publications in that year.

    author2int : dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.
    """
    required_columns = ['AuthorId', 'PublicationId']
    if temporal:
        required_columns.append('Year')
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                                        focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values, focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                                        focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                                              focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values, focus_author_ids)]

    paa_df.drop_duplicates(subset=['AuthorId', 'PublicationId'], inplace=True)

    # map authors to the rows of the bipartite adjacency matrix
    author2int = {aid: i for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))}
    Nauthors = paa_df['AuthorId'].nunique()
    paa_df['AuthorId'] = [author2int[aid] for aid in paa_df['AuthorId'].values]

    # map publications to the columns of the bipartite adjacency matrix
    pub2int = {pid: i for i, pid in enumerate(np.sort(paa_df['PublicationId'].unique()))}
    Npubs = paa_df['PublicationId'].nunique()
    paa_df['PublicationId'] = [pub2int[pid] for pid in paa_df['PublicationId'].values]

    if temporal:
        years = np.sort(paa_df['Year'].unique())

        temporal_adj = {}
        for y in years:
            bipartite_adj = dataframe2bipartite(paa_df.loc[paa_df['Year'] == y],
                                                'AuthorId', 'PublicationId',
                                                (Nauthors, Npubs))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            temporal_adj[y] = adj_mat

        return temporal_adj, author2int

    else:
        bipartite_adj = dataframe2bipartite(paa_df, 'AuthorId', 'PublicationId',
                                            (Nauthors, Npubs))

        adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        return adj_mat, author2int
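# Illustrative usage sketch (not part of the library) for the temporal variant:
# one co-authorship adjacency matrix is returned per publication year.
def _example_temporal_coauthorship_network():
    import pandas as pd

    paa_df = pd.DataFrame({'AuthorId':      [1, 2, 1, 3],
                           'PublicationId': [10, 10, 11, 11],
                           'Year':          [2019, 2019, 2020, 2020]})

    temporal_adj, author2int = coauthorship_network(paa_df, temporal=True)

    # authors 1 and 2 collaborate in 2019; authors 1 and 3 in 2020
    assert temporal_adj[2019].tocsr()[author2int[1], author2int[2]] == 1
    return temporal_adj, author2int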
def filter_doctypes(self, doctypes=['j', 'b', 'bc', 'c'], show_progress=False):
    """
    Filter all of the publication files, keeping only the publications of the specified doctypes.

    :param list doctypes: optional
        The list of doctypes to keep.

    :return None:
    """
    doctypes = np.sort(doctypes)

    if show_progress:
        print("Starting DocType filter.\nFiltering Publications.")

    valid_pubids = []
    pub2year = {}
    pub2doctype = {}
    Nfiles = sum('publication' in fname for fname in
                 os.listdir(os.path.join(self.path2database, 'publication')))
    for ifile in range(Nfiles):
        pubdf = pd.read_hdf(os.path.join(self.path2database, 'publication',
                                         'publication{}.hdf'.format(ifile)))
        # keep only the publications with a valid doctype and year
        pubdf = pubdf.loc[isin_sorted(pubdf['DocType'].values, doctypes)]
        pubdf.dropna(subset=['Year'], inplace=True)
        pubdf['Year'] = pubdf['Year'].astype(int)
        pubdf.to_hdf(os.path.join(self.path2database, 'publication',
                                  'publication{}.hdf'.format(ifile)), key='pub', mode='w')

        valid_pubids.extend(pubdf['PublicationId'].values)
        for pid, y, dt in pubdf[['PublicationId', 'Year', 'DocType']].values:
            pub2year[pid] = y
            pub2doctype[pid] = dt

    with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2year).encode('utf8'))

    with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
        outfile.write(json.dumps(pub2doctype).encode('utf8'))

    del pubdf

    valid_pubids = np.sort(valid_pubids)

    if show_progress:
        print("Filtering References.")
    Nfiles = sum('pub2ref' in fname for fname in
                 os.listdir(os.path.join(self.path2database, 'pub2ref')))
    for ifile in range(Nfiles):
        pub2refdf = pd.read_hdf(os.path.join(self.path2database, 'pub2ref',
                                             'pub2ref{}.hdf'.format(ifile)))
        # keep only the references where both ends are valid publications
        pub2refdf = pub2refdf.loc[isin_sorted(pub2refdf['CitedPublicationId'].values, valid_pubids)]
        pub2refdf = pub2refdf.loc[isin_sorted(pub2refdf['CitingPublicationId'].values, valid_pubids)]
        pub2refdf.to_hdf(os.path.join(self.path2database, 'pub2ref',
                                      'pub2ref{}.hdf'.format(ifile)), key='pub2ref', mode='w')

    if show_progress:
        print("Filtering Publication and Author.")
    Nfiles = sum('publicationauthoraffiliation' in fname for fname in
                 os.listdir(os.path.join(self.path2database, 'publicationauthoraffiliation')))
    for ifile in range(Nfiles):
        paa_df = pd.read_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                          'publicationauthoraffiliation{}.hdf'.format(ifile)))
        paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values, valid_pubids)]
        paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation',
                                   'publicationauthoraffiliation{}.hdf'.format(ifile)),
                      key='paa', mode='w')

    if show_progress:
        print("Finished filtering DocType.")
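# Illustrative usage sketch for filter_doctypes.  The class name below is a
# placeholder for whichever database-interface class defines this method; only
# the `path2database` attribute and the on-disk layout above are assumed:
#
#   mydb = SomeDatabase(path2database='/path/to/database')  # hypothetical class
#   mydb.filter_doctypes(doctypes=['j', 'c'], show_progress=True)
#   # only journal ('j') and conference ('c') records remain in the publication,
#   # pub2ref, and publicationauthoraffiliation files, and pub2year.json.gz /
#   # pub2doctype.json.gz record the surviving publications.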
def load_preprocessed_data(dataname, path2database, columns=None, isindict=None,
                           duplicate_subset=None, duplicate_keep='last', dropna=None,
                           keep_source_file=False, prefunc2apply=None,
                           postfunc2apply=None, show_progress=False):
    """
    Load the preprocessed DataFrame from a preprocessed directory.

    Parameters
    ----------
    :param dataname : str
        The type of preprocessed data to load.

    :param path2database : str
        The path to the database directory.

    :param columns : list, default None
        Load only this subset of columns.

    :param isindict : dict, default None
        A dictionary of the format {"ColumnName": "ListofValues"}, where "ColumnName"
        is a data column and "ListofValues" is a sorted list of valid values.  Only
        the rows whose "ColumnName" value appears in "ListofValues" are returned.

    :param duplicate_subset : list, default None
        Drop any duplicate entries as specified by this subset of columns.

    :param duplicate_keep : str, default 'last', Optional
        If duplicates are being dropped, keep the 'first' or 'last'
        (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_).

    :param dropna : list, default None, Optional
        Drop any NaN entries as specified by this subset of columns.

    :param keep_source_file : bool, default False
        Keep track of the source file the data was loaded from.

    :param prefunc2apply : callable, default None
        A function applied to each sub-DataFrame as it is loaded, before filtering.

    :param postfunc2apply : callable, default None
        A function applied to each sub-DataFrame as it is loaded, after filtering.

    :param show_progress : bool or str, default False
        If True, show a progress bar tracking the loading; if a str, it is also
        used as the progress bar description.

    Returns
    -------
    DataFrame
        The dataname DataFrame.
    """
    path2files = os.path.join(path2database, dataname)
    if not os.path.exists(path2files):
        # TODO: make a real warning
        raise NotImplementedError("First preprocess the raw data.")

    if isinstance(columns, str):
        columns = [columns]

    if isinstance(dropna, str):
        dropna = [dropna]

    if isinstance(duplicate_subset, str):
        duplicate_subset = [duplicate_subset]

    if isinstance(isindict, dict):
        isindict = {isinkey: np.sort(isinlist) for isinkey, isinlist in isindict.items()}

    FileNumbers = sorted([int(fname.replace(dataname, '').split('.')[0])
                          for fname in os.listdir(path2files) if dataname in fname])

    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress

    data_df = []
    for ifile in tqdm(FileNumbers, desc=desc, leave=True, disable=not show_progress):
        fname = os.path.join(path2files, dataname + "{}.hdf".format(ifile))
        subdf = pd.read_hdf(fname, mode='r')

        if callable(prefunc2apply):
            subdf = prefunc2apply(subdf)

        if isinstance(columns, list):
            subdf = subdf[columns]

        if isinstance(dropna, list):
            subdf.dropna(subset=dropna, inplace=True, how='any')

        if isinstance(isindict, dict):
            for isinkey, isinlist in isindict.items():
                subdf = subdf[isin_sorted(subdf[isinkey], isinlist)]

        if isinstance(duplicate_subset, list):
            subdf.drop_duplicates(subset=duplicate_subset, keep=duplicate_keep, inplace=True)

        if keep_source_file:
            subdf['filetag'] = ifile

        if callable(postfunc2apply):
            postfunc2apply(subdf)

        data_df.append(subdf)

    data_df = pd.concat(data_df)

    # duplicates may span file boundaries, so drop them once more on the combined frame
    if isinstance(duplicate_subset, list):
        data_df.drop_duplicates(subset=duplicate_subset, keep=duplicate_keep, inplace=True)

    return data_df
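# Illustrative usage sketch (not part of the library); the database path is
# hypothetical and the column/filter choices are just examples:
def _example_load_preprocessed_data():
    pub_df = load_preprocessed_data(
        dataname='publication',
        path2database='/path/to/database',            # hypothetical path
        columns=['PublicationId', 'Year'],
        isindict={'Year': list(range(2000, 2021))},   # keep only 2000-2020
        dropna=['Year'],
        show_progress='Loading publications')         # a str also sets the bar label
    return pub_df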
def cociting_network(pub2ref_df, focus_pub_ids=None, focus_constraint='citing',
                     temporal=False, show_progress=False):
    """
    Create the co-citing network.  Each node is a publication, and two publications
    are linked if they cite the same article.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the co-citing network.

    focus_constraint : str, default 'citing'
        If focus_pub_ids is not None:
        - 'citing' : the 'focus_pub_ids' defines the citation set, giving only the
          co-citations between the references of the publications from this set.
        - 'cited' : the 'focus_pub_ids' defines the cocitation node set.

    temporal : bool, default False
        Currently unused; the full co-citing network is always returned.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-citing network.

    citing2int : dict
        A mapping of citing PublicationIds to the row/column of the adjacency matrix.
    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['CitingPublicationId'].values,
                                                    focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values,
                                                    focus_pub_ids)]

    pub2ref_df.drop_duplicates(subset=['CitingPublicationId', 'CitedPublicationId'],
                               inplace=True)

    if pub2ref_df.shape[0] > 0:
        # map cited publications to the rows of the bipartite adjacency matrix
        cited2int = {pid: i for i, pid in
                     enumerate(np.sort(pub2ref_df['CitedPublicationId'].unique()))}
        Ncited = pub2ref_df['CitedPublicationId'].nunique()
        pub2ref_df['CitedPublicationId'] = [cited2int[pid]
                                            for pid in pub2ref_df['CitedPublicationId'].values]

        # map citing publications to the columns of the bipartite adjacency matrix
        citing2int = {pid: i for i, pid in
                      enumerate(np.sort(pub2ref_df['CitingPublicationId'].unique()))}
        Nciting = pub2ref_df['CitingPublicationId'].nunique()
        pub2ref_df['CitingPublicationId'] = [citing2int[pid]
                                             for pid in pub2ref_df['CitingPublicationId'].values]

        bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                            'CitingPublicationId', (Ncited, Nciting))

        # project onto the citing publications: two citing publications are linked
        # if they share a cited reference
        adj_mat = project_bipartite_mat(bipartite_adj, project_to='col')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        return adj_mat, citing2int

    else:
        return spsparse.coo_matrix((0, 0)), {}
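# Illustrative usage sketch (not part of the library): publications 3 and 4 both
# cite publication 1, so they share one reference and are linked in the co-citing
# network.
def _example_cociting_network():
    import pandas as pd

    pub2ref_df = pd.DataFrame({'CitingPublicationId': [3, 4, 4],
                               'CitedPublicationId': [1, 1, 2]})

    adj_mat, citing2int = cociting_network(pub2ref_df)

    assert adj_mat.tocsr()[citing2int[3], citing2int[4]] == 1
    return adj_mat, citing2int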