Example #1
def create_journalcitation_table(pubdf, pub2ref):
    """Build a journal citation table by attaching the year and an integer
    journal id to both the citing and cited side of each citation."""
    required_pub_columns = ['PublicationId', 'JournalId', 'Year']
    check4columns(pubdf, required_pub_columns)
    pubdf = pubdf[required_pub_columns]

    required_pub2ref_columns = ['CitingPublicationId', 'CitedPublicationId']
    check4columns(pub2ref, required_pub2ref_columns)
    pub2ref = pub2ref[required_pub2ref_columns]

    journals = np.sort(pubdf['JournalId'].unique())
    journal2int = {j: i for i, j in enumerate(journals)}
    pubdf['JournalInt'] = [journal2int[jid] for jid in pubdf['JournalId']]

    jctable = pub2ref.merge(pubdf[['PublicationId', 'Year', 'JournalInt']],
                            how='left',
                            left_on='CitingPublicationId',
                            right_on='PublicationId')
    jctable = jctable.rename(columns={'Year': 'CitingYear', 'JournalInt': 'CitingJournalInt'})
    del jctable['PublicationId']
    del jctable['CitingPublicationId']

    jctable = jctable.merge(pubdf[['PublicationId', 'Year', 'JournalInt']],
                            how='left',
                            left_on='CitedPublicationId',
                            right_on='PublicationId')
    jctable = jctable.rename(columns={'Year': 'CitedYear', 'JournalInt': 'CitedJournalInt'})
    del jctable['PublicationId']
    del jctable['CitedPublicationId']

    return jctable, {i: j for j, i in journal2int.items()}
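
A minimal usage sketch with hypothetical toy data (check4columns is assumed to be importable from pyscisci.utils, with numpy as np and pandas as pd in scope):

import pandas as pd

# hypothetical toy data: three publications across two journals
pubdf = pd.DataFrame({'PublicationId': [1, 2, 3],
                      'JournalId': ['J1', 'J1', 'J2'],
                      'Year': [2000, 2001, 2002]})
pub2ref = pd.DataFrame({'CitingPublicationId': [2, 3],
                        'CitedPublicationId': [1, 1]})

jctable, int2journal = create_journalcitation_table(pubdf, pub2ref)
print(jctable)  # one row per citation: citing/cited year and journal int
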
Example #2
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in the author's career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.  If None then the database 'AuthorId' is used.

    colcountby : str, default 'FieldId'
        The DataFrame column with Field Ids.  If None then the database 'FieldId' is used.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'AuthorId', 'TopFieldId'

    """

    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication
        pub2nfields['PublicationWeight'] = 1.0/pub2nfields[str(colcountby)+'Count']
        del pub2nfields[str(colcountby)+'Count']

        # merge counts
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # custom weighted mode: pick the field with the largest summed publication weight
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    newname_dict = zip2dict([str(colcountby), '0'], ['Top' + str(colcountby)]*2)
    return author2field.to_frame().reset_index().rename(columns=newname_dict)
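
A usage sketch with hypothetical data (the helpers check4columns, groupby_count, and zip2dict are assumed to come from pyscisci.utils, with tqdm installed):

import pandas as pd

# hypothetical toy data: author 10 publishes twice in field 100, once in field 200
pub2author_df = pd.DataFrame({'AuthorId': [10, 10, 10],
                              'PublicationId': [1, 2, 3],
                              'FieldId': [100, 100, 200]})

print(author_top_field(pub2author_df))
# expected: one row mapping AuthorId 10 to TopFieldId 100
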
Example #3
def compute_cnorm(pub2ref, pub2year):
    """
    This function calculates the cnorm for publications.

    References
    ----------
    .. [h] Ke, Q., Gates, A. J., Barabasi, A.-L. (2020): "title",
           *in submission*.
           DOI: xxx
    """
    raise NotImplementedError

    required_pub2ref_columns = ['CitingPublicationId', 'CitedPublicationId']
    check4columns(pub2ref, required_pub2ref_columns)
    pub2ref = pub2ref[required_pub2ref_columns]

    # we need the citation counts and cocitation network
    temporal_cocitation_dict = {
        y: defaultdict(set)
        for y in set(pub2year.values())
    }
    temporal_citation_dict = {
        y: defaultdict(int)
        for y in temporal_cocitation_dict.keys()
    }

    def count_cocite(cited_df):
        y = pub2year[cited_df.name]

        for citedpid in cited_df['CitedPublicationId'].values:
            temporal_citation_dict[y][citedpid] += 1
        for icitedpid, jcitedpid in combinations(
                cited_df['CitedPublicationId'].values, 2):
            temporal_cocitation_dict[y][icitedpid].add(jcitedpid)
            temporal_cocitation_dict[y][jcitedpid].add(icitedpid)

    pub2ref.groupby('CitingPublicationId', sort=False).apply(count_cocite)

    cnorm = {}
    for y in temporal_citation_dict.keys():
        # assumed completion of the truncated draft: normalize each
        # publication's citations in year y by the mean citation count that year
        mean_cites = np.mean(list(temporal_citation_dict[y].values()))
        for citedpid, year_cites in temporal_citation_dict[y].items():
            cnorm.setdefault(citedpid, {})[y] = year_cites / mean_cites

    return cnorm
Example #4
def publication_beauty(pub2ref_df,
                       colgroupby='CitedPublicationId',
                       colcountby='CitingPublicationId',
                       show_progress=False):
    """
    Calculate the sleeping beauty coefficient and awakening time for each cited publication.  See Ke et al., PNAS (2015),
    "Defining and identifying Sleeping Beauties in science," for the derivation.

    The algorithmic implementation can be found in :py:func:`metrics.beauty_coefficient`.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the temporal citing information.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with the cited publication ids.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with the citing publication ids used to count citations.

    Returns
    -------
    DataFrame
        DataFrame with the cited publication id, its sleeping beauty coefficient, and awakening year.

    """

    check4columns(pub2ref_df,
                  ['CitedPublicationId', 'CitingPublicationId', 'CitingYear'])

    tqdm.pandas(desc='Beauty', disable=not show_progress)

    df = groupby_count(pub2ref_df,
                       colgroupby=['CitedPublicationId', 'CitingYear'],
                       colcountby='CitingPublicationId',
                       count_unique=True)

    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return df.groupby(colgroupby)[colcountby + 'Count'].progress_apply(
        beauty_coefficient).to_frame().reset_index().rename(columns=newname_dict)
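
A hypothetical call, assuming beauty_coefficient and the pyscisci helpers (check4columns, groupby_count, zip2dict) are importable:

import pandas as pd

# hypothetical citations: a burst in 2010 after a dormant period
pub2ref_df = pd.DataFrame({'CitedPublicationId': [1] * 4,
                           'CitingPublicationId': [2, 3, 4, 5],
                           'CitingYear': [2000, 2010, 2010, 2010]})

beauty_df = publication_beauty(pub2ref_df)
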
Example #5
def temporal_cocited_edgedict(pub2ref, pub2year):

    required_pub2ref_columns = ['CitingPublicationId', 'CitedPublicationId']
    check4columns(pub2ref, required_pub2ref_columns)
    pub2ref = pub2ref[required_pub2ref_columns]

    year_values = sorted(list(set(pub2year.values())))

    # we need the citation counts and cocitation network
    temporal_cocitation_dict = {y: defaultdict(set) for y in year_values}
    temporal_citation_dict = {y: defaultdict(int) for y in year_values}

    def count_cocite(cited_df):
        y = pub2year[cited_df.name]

        for citedpid in cited_df['CitedPublicationId'].values:
            temporal_citation_dict[y][citedpid] += 1
        for icitedpid, jcitedpid in combinations(
                cited_df['CitedPublicationId'].values, 2):
            temporal_cocitation_dict[y][icitedpid].add(jcitedpid)
            temporal_cocitation_dict[y][jcitedpid].add(icitedpid)

    pub2ref.groupby('CitingPublicationId', sort=False).apply(count_cocite)

    # return the accumulated co-citation and citation dictionaries
    return temporal_cocitation_dict, temporal_citation_dict
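
A usage sketch with hypothetical data; pub2year must cover every citing publication id, since count_cocite looks the citing year up there:

import pandas as pd

pub2ref = pd.DataFrame({'CitingPublicationId': [4, 4, 5],
                        'CitedPublicationId': [1, 2, 1]})
pub2year = {4: 2000, 5: 2001}

cocited, cited = temporal_cocited_edgedict(pub2ref, pub2year)
# cocited[2000][1] == {2}: publications 1 and 2 were co-cited by publication 4 in 2000
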
Example #6
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the co-authorship network.

    focus_constraint : str, default 'authors'
        If focus_author_ids is not None:
            'authors' : the focus_author_ids defines the node set, giving only the co-authorships between authors in the set.
            'publications' : the publication history of the focus_author_ids defines the edge set, giving the co-authorships
                where at least one author from focus_author_ids was involved.
            'ego' : the focus_author_ids defines a seed set, such that all authors must have co-authored at least one
                publication with an author from focus_author_ids, but co-authorships are also found between the
                second-order author sets.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-authorship network

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    required_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                            focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values,
                focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

    #  map authors to the row/column of the adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    Nauthors = paa_df['AuthorId'].nunique()

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        if author_list.shape[0] >= 2:
            for ia, ja in combinations(author_list, 2):
                adj_mat[author2int[ia], author2int[ja]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations',
                leave=True,
                disable=not show_progress)

    # go through all publications and apply the coauthorship edge generator
    paa_df.groupby('PublicationId')['AuthorId'].progress_apply(
        coauthor_cluster)

    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
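
A usage sketch for the static network (toy data; check4columns and isin_sorted from pyscisci.utils, numpy as np, scipy.sparse as sparse, and tqdm are assumed in scope):

import pandas as pd

# hypothetical links: a2 co-authors one publication with a1 and one with a3
paa_df = pd.DataFrame({'AuthorId': ['a1', 'a2', 'a2', 'a3'],
                       'PublicationId': [1, 1, 2, 2]})

adj_mat, author2int = coauthorship_network(paa_df)
print(adj_mat.todense())  # symmetric 3x3 matrix with the a1-a2 and a2-a3 edges
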
Example #7
def compute_raostriling_interdisciplinarity(pub2ref_df,
                                            pub2field_df,
                                            focus_pub_ids=None,
                                            pub2field_norm=True,
                                            temporal=False,
                                            citation_direction='references',
                                            field_distance_metric='cosine',
                                            distance_matrix=None,
                                            show_progress=False):
    """
    Calculate the RaoStirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds for which to calculate interdisciplinarity.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.  Normalizes the membership
        vector so it sums to 1 for each publication.

    temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default 'references'
        'references' : the fields are defined by a publication's references.
        'citations' : the fields are defined by a publication's citations.

    field_distance_metric : str, default 'cosine'
        The interfield distance metric.  Valid entries come from sklearn.metrics.pairwise_distances:
        'cosine', 'euclidean', 'l1', 'l2', etc.

    distance_matrix : numpy array, default None
        The precomputed field distance matrix.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'PublicationId', 'RaoStirling'

    """

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])

    # to leverage matrix operations we need to map fields to the rows/cols of the matrix
    field2int = {
        fid: i
        for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))
    }
    pub2field_df['FieldId'] = [
        field2int[fid] for fid in pub2field_df['FieldId'].values
    ]
    Nfields = len(field2int)

    if temporal:
        years = np.sort(pub2ref_df['CitingYear'].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    # check that the precomputed distance matrix is the correct size
    if distance_matrix is not None:
        if not temporal and distance_matrix.shape != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications passed.'
            )
        elif temporal and distance_matrix.shape != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications and years passed.'
            )

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby(
            'PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [
        1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values
    ]

    # now we map citing and cited to the source and target depending on the direction specified by citation_direction
    if citation_direction == 'references':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'TargetId',
            'CitingPublicationId': 'SourceId'
        }
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'SourceId',
            'CitingPublicationId': 'TargetId'
        }

    pub2ref_df = pub2ref_df.rename(columns=pub2ref_rename_dict)

    # merge the references to the fields for the target fields
    pub2ref_df = pub2ref_df.merge(
        pub2field_df, how='left', left_on='TargetId',
        right_on='PublicationId').rename(
            columns={
                'FieldId': 'TargetFieldId',
                'PubFieldContribution': 'TargetPubFieldContribution'
            })
    del pub2ref_df['PublicationId']

    # we need to calculate the field-to-field distance matrix
    if distance_matrix is None:

        # merge the references to the fields for the source fields
        pub2ref_df = pub2ref_df.merge(
            pub2field_df,
            how='left',
            left_on='SourceId',
            right_on='PublicationId').rename(
                columns={
                    'FieldId': 'SourceFieldId',
                    'PubFieldContribution': 'SourcePubFieldContribution'
                })
        del pub2ref_df['PublicationId']

        # drop any citation relationships for which we don't have field information
        pub2ref_df.dropna(inplace=True)

        # we need to use integer ids to map to the matrix
        pub2ref_df[['SourceFieldId', 'TargetFieldId'
                    ]] = pub2ref_df[['SourceFieldId',
                                     'TargetFieldId']].astype(int)

        # in the field2field distance matrix, the weighted contribution from a source publication in multiple fields
        # is the product of the source and target contributions
        pub2ref_df['SourcePubFieldContribution'] = pub2ref_df[
            'SourcePubFieldContribution'] * pub2ref_df[
                'TargetPubFieldContribution']

        # differentiate between the temporal and the static RS
        if temporal:
            # make the temporal distance matrix
            distance_matrix = np.zeros((Nyears, Nfields, Nfields))

            for y, ydf in pub2ref_df.groupby('CitingYear'):
                # calculate the field representation vectors for this year only
                yfield2field_mat = dataframe2bipartite(
                    df=ydf,
                    rowname='SourceFieldId',
                    colname='TargetFieldId',
                    shape=(Nfields, Nfields),
                    weightname='SourcePubFieldContribution')

                # now compute the distance matrix for this year only
                distance_matrix[year2int[y]] = pairwise_distances(
                    yfield2field_mat, metric=field_distance_metric)

        else:
            # calculate the field representation vectors
            field2field_mat = dataframe2bipartite(
                df=pub2ref_df,
                rowname='SourceFieldId',
                colname='TargetFieldId',
                shape=(Nfields, Nfields),
                weightname='SourcePubFieldContribution')

            # now compute the distance matrix
            distance_matrix = pairwise_distances(field2field_mat,
                                                 metric=field_distance_metric)

        # we no longer need the 'SourceFieldId' or 'SourcePubFieldContribution' so cleanup
        del pub2ref_df['SourceFieldId']
        del pub2ref_df['SourcePubFieldContribution']
        pub2ref_df.drop_duplicates(
            subset=['SourceId', 'TargetId', 'TargetFieldId'], inplace=True)

    # Now we start on the RaoStirling calculation

    # drop any citation relationships for which we don't have field information
    pub2ref_df.dropna(inplace=True)

    if temporal:

        rsdf = []
        for y, ydf in pub2ref_df.groupby('CitingYear'):

            # for each year, we need to map individual publications to the rows of our matrix
            ypub2int = {
                pid: i
                for i, pid in enumerate(np.sort(ydf['SourceId'].unique()))
            }
            yint2pub = {i: pid for pid, i in ypub2int.items()}
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            ydf[['SourceId',
                 'TargetFieldId']] = ydf[['SourceId',
                                          'TargetFieldId']].astype(int)
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(
                df=ydf,
                rowname='SourceId',
                colname='TargetFieldId',
                shape=(yNpubs, Nfields),
                weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication-to-field vector is normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure
            yrsdf = 0.5 * np.squeeze(
                np.asarray(
                    ypub2field_mat.dot(
                        spsparse.csr_matrix(distance_matrix[year2int[y]])).
                    multiply(ypub2field_mat).sum(axis=1)))

            # map the integer row ids back to the original PublicationIds
            rsdf.append(
                pd.DataFrame(
                    zip([yint2pub[i] for i in np.sort(ydf['SourceId'].unique())],
                        yrsdf, [y] * yNpubs),
                    columns=['PublicationId', 'RaoStirling', 'CitingYear']))

        rsdf = pd.concat(rsdf)

        return rsdf, distance_matrix, field2int, years

    else:

        # first map individual publications to the rows of our matrix
        pub2int = {
            pid: i
            for i, pid in enumerate(np.sort(pub2ref_df['SourceId'].unique()))
        }
        int2pub = {i: pid for pid, i in pub2int.items()}
        pub2ref_df['SourceId'] = [
            pub2int[pid] for pid in pub2ref_df['SourceId'].values
        ]
        pub2ref_df[['SourceId', 'TargetFieldId'
                    ]] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(
            df=pub2ref_df,
            rowname='SourceId',
            colname='TargetFieldId',
            shape=(Npubs, Nfields),
            weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication-to-field vector is normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        # finally, we calculate the matrix representation of the RS measure
        rsdf = 0.5 * np.squeeze(
            np.asarray(
                pub2field_mat.dot(
                    spsparse.csr_matrix(distance_matrix)).multiply(
                        pub2field_mat).sum(axis=1)))

        # map the integer row ids back to the original PublicationIds
        rsdf = pd.DataFrame(zip([int2pub[i]
                                 for i in np.sort(pub2ref_df['SourceId'].unique())],
                                rsdf),
                            columns=['PublicationId', 'RaoStirling'])

        return rsdf, distance_matrix, field2int
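
A hypothetical call (dataframe2bipartite, check4columns, and isin_sorted from pyscisci.utils, pairwise_distances from sklearn.metrics, normalize from sklearn.preprocessing, and scipy.sparse as spsparse are all assumed in scope):

import pandas as pd

pub2ref_df = pd.DataFrame({'CitingPublicationId': [1, 1, 2],
                           'CitedPublicationId': [3, 4, 3]})
pub2field_df = pd.DataFrame({'PublicationId': [1, 2, 3, 4],
                             'FieldId': [100, 100, 200, 300]})

rsdf, distance_matrix, field2int = compute_raostriling_interdisciplinarity(
    pub2ref_df, pub2field_df)
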
Example #8
def raostriling_interdisciplinarity(pub2ref_df,
                                    pub2field_df,
                                    focus_pub_ids=None,
                                    pub2field_norm=True,
                                    temporal=False,
                                    citation_direction='references',
                                    field_distance_metric='cosine',
                                    distance_matrix=None,
                                    show_progress=False):
    """
    Calculate the RaoStirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds for which to calculate interdisciplinarity.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.  Normalizes the membership
        vector so it sums to 1 for each publication.

    temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default 'references'
        'references' : the fields are defined by a publication's references.
        'citations' : the fields are defined by a publication's citations.

    field_distance_metric : str, default 'cosine'
        The interfield distance metric.  Valid entries come from sklearn.metrics.pairwise_distances:
        'cosine', 'euclidean', 'l1', 'l2', etc.

    distance_matrix : numpy array, default None
        The precomputed field distance matrix.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'PublicationId', 'RaoStirling'

    """

    # now we map citing and cited to the source and target depending on the direction specified by citation_direction
    if citation_direction == 'references':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'TargetId',
            'CitingPublicationId': 'SourceId'
        }
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'SourceId',
            'CitingPublicationId': 'TargetId'
        }
        year_col = 'CitedYear'

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # if no distance matrix was passed, compute the field distances from the citation data
    if distance_matrix is None:
        distance_matrix = field_citation_distance(pub2ref_df, pub2field_df,
                                                  pub2field_norm, temporal,
                                                  citation_direction,
                                                  field_distance_metric,
                                                  show_progress)

    field2int = {
        fid: i
        for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))
    }
    pub2field_df['FieldId'] = [
        field2int[fid] for fid in pub2field_df['FieldId'].values
    ]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    if focus_pub_ids is not None:
        pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['SourceId'].values,
                                                focus_pub_ids)]

    if temporal:
        years = np.sort(pub2ref_df[year_col].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    if isinstance(distance_matrix, pd.DataFrame) and temporal:
        check4columns(distance_matrix,
                      ['iFieldId', 'jFieldId', year_col, 'FieldDistance'])

        distance_matrix = distance_matrix.loc[isin_sorted(
            distance_matrix[year_col].values, years)].copy(deep=True)

        distance_matrix['iFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['iFieldId'].values
        ]
        distance_matrix['jFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['jFieldId'].values
        ]
        distance_matrix.dropna(inplace=True)

        tdm = np.zeros((Nyears, Nfields, Nfields))
        for y in years:
            tdm[year2int[y]] = dataframe2bipartite(
                df=distance_matrix[distance_matrix[year_col] == y],
                rowname='iFieldId',
                colname='jFieldId',
                shape=(Nfields, Nfields),
                weightname='FieldDistance').todense()

            tdm[year2int[y]] = tdm[year2int[y]] + tdm[year2int[y]].T

        distance_matrix = tdm

    elif isinstance(distance_matrix, pd.DataFrame) and not temporal:
        check4columns(distance_matrix,
                      ['iFieldId', 'jFieldId', 'FieldDistance'])
        distance_matrix = distance_matrix.copy(deep=True)
        distance_matrix['iFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['iFieldId'].values
        ]
        distance_matrix['jFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['jFieldId'].values
        ]
        distance_matrix.dropna(inplace=True)
        distance_matrix = dataframe2bipartite(
            df=distance_matrix,
            rowname='iFieldId',
            colname='jFieldId',
            shape=(Nfields, Nfields),
            weightname='FieldDistance').todense()

        distance_matrix = distance_matrix + distance_matrix.T

    # np.matrix is an ndarray subclass, so this single isinstance check covers both
    elif isinstance(distance_matrix, np.ndarray):
        if not temporal and distance_matrix.shape != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications passed.'
            )
        elif temporal and distance_matrix.shape != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications and years passed.'
            )

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby(
            'PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [
        1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values
    ]

    # merge the references to the fields for the target fields
    pub2ref_df = pub2ref_df.merge(
        pub2field_df, how='left', left_on='TargetId',
        right_on='PublicationId').rename(
            columns={
                'FieldId': 'TargetFieldId',
                'PubFieldContribution': 'TargetPubFieldContribution'
            })
    del pub2ref_df['PublicationId']

    pub2ref_df.dropna(inplace=True)

    # Now we start on the RaoStirling calculation
    if temporal:

        rsdf = []
        for y, ydf in pub2ref_df.groupby(year_col):

            # for each year, we need to map individual publications to the rows of our matrix
            ypub2int = {
                pid: i
                for i, pid in enumerate(np.sort(ydf['SourceId'].unique()))
            }
            yint2pub = {i: pid for pid, i in ypub2int.items()}
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(
                df=ydf,
                rowname='SourceId',
                colname='TargetFieldId',
                shape=(yNpubs, Nfields),
                weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication-to-field vector is normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure
            yrsdf = pd.DataFrame()
            yrsdf['PublicationId'] = [
                yint2pub[i] for i in np.sort(ydf['SourceId'].unique())
            ]
            yrsdf[year_col] = y
            yrsdf['RaoStirling'] = 0.5 * np.squeeze(
                np.asarray(
                    ypub2field_mat.dot(
                        spsparse.csr_matrix(distance_matrix[year2int[y]])).
                    multiply(ypub2field_mat).sum(axis=1)))

            rsdf.append(yrsdf)

        rsdf = pd.concat(rsdf)

        return rsdf

    else:

        # first map individual publications to the rows of our matrix
        pub2int = {
            pid: i
            for i, pid in enumerate(np.sort(pub2ref_df['SourceId'].unique()))
        }
        int2pub = {i: pid for pid, i in pub2int.items()}
        pub2ref_df['SourceId'] = [
            pub2int[pid] for pid in pub2ref_df['SourceId'].values
        ]
        pub2ref_df[['SourceId', 'TargetFieldId'
                    ]] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(
            df=pub2ref_df,
            rowname='SourceId',
            colname='TargetFieldId',
            shape=(Npubs, Nfields),
            weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication-to-field vector is normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        distance_matrix = spsparse.csr_matrix(distance_matrix)

        # finally, we calculate the matrix representation of the RS measure
        rsdf = pd.DataFrame()
        rsdf['RaoStirling'] = 0.5 * np.squeeze(
            np.asarray(
                spsparse.csr_matrix.multiply(
                    pub2field_mat.dot(distance_matrix),
                    pub2field_mat).sum(axis=1)))
        rsdf['PublicationId'] = [
            int2pub[i] for i in np.sort(pub2ref_df['SourceId'].unique())
        ]

        return rsdf
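
A usage sketch; with distance_matrix=None the field distances are first computed by field_citation_distance (see the next example), so that function and the pyscisci helpers are assumed importable:

import pandas as pd

pub2ref_df = pd.DataFrame({'CitingPublicationId': [1, 1, 2],
                           'CitedPublicationId': [3, 4, 3]})
pub2field_df = pd.DataFrame({'PublicationId': [1, 2, 3, 4],
                             'FieldId': [100, 100, 200, 300]})

rsdf = raostriling_interdisciplinarity(pub2ref_df, pub2field_df)
# one RaoStirling score per citing publication
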
Example #9
def field_citation_distance(pub2ref_df,
                            pub2field_df,
                            pub2field_norm=True,
                            temporal=True,
                            citation_direction='references',
                            field_distance_metric='cosine',
                            show_progress=False):
    """
    Calculate the field distance matrix based on references or citations.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.  Normalizes the membership
        vector so it sums to 1 for each publication.

    temporal : bool, default True
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default 'references'
        'references' : the fields are defined by a publication's references.
        'citations' : the fields are defined by a publication's citations.

    field_distance_metric : str, default 'cosine'
        The interfield distance metric.  Valid entries come from sklearn.metrics.pairwise_distances:
        'cosine', 'euclidean', 'l1', 'l2', etc.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    Distance DataFrame
        if temporal is True
            DataFrame with 4 columns: iFieldId, jFieldId, Year, and FieldDistance
        if temporal is False
            DataFrame with 3 columns: iFieldId, jFieldId, FieldDistance

    """

    # now we map citing and cited to the source and target depending on the direction specified by citation_direction
    if citation_direction == 'references':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'TargetId',
            'CitingPublicationId': 'SourceId'
        }
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'SourceId',
            'CitingPublicationId': 'TargetId'
        }
        year_col = 'CitedYear'

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # to leverage matrix operations we need to map fields to the rows/cols of the matrix
    field2int = {
        fid: i
        for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))
    }
    int2field = {i: fid for fid, i in field2int.items()}
    pub2field_df['FieldId'] = [
        field2int[fid] for fid in pub2field_df['FieldId'].values
    ]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby(
            'PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [
        1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values
    ]

    distance_df = []

    # differentiate between the temporal and the static RS
    if temporal:

        for y, ydf in pub2ref_df.groupby(year_col):
            # merge the references to the fields for the source fields
            ydf = ydf.merge(
                pub2field_df,
                how='left',
                left_on='SourceId',
                right_on='PublicationId').rename(
                    columns={
                        'FieldId': 'SourceFieldId',
                        'PubFieldContribution': 'SourcePubFieldContribution'
                    })
            del ydf['PublicationId']

            ydf = ydf.merge(
                pub2field_df,
                how='left',
                left_on='TargetId',
                right_on='PublicationId').rename(
                    columns={
                        'FieldId': 'TargetFieldId',
                        'PubFieldContribution': 'TargetPubFieldContribution'
                    })
            del ydf['PublicationId']

            # drop any citation relationships for which we don't have field information
            ydf.dropna(inplace=True)

            # we need to use integer ids to map to the matrix
            ydf[['SourceFieldId',
                 'TargetFieldId']] = ydf[['SourceFieldId',
                                          'TargetFieldId']].astype(int)

            # in the field2field distance matrix, the weighted contribution from a source publication in multiple fields
            # is the product of the source and target contributions
            ydf['SourcePubFieldContribution'] = ydf[
                'SourcePubFieldContribution'] * ydf[
                    'TargetPubFieldContribution']

            # calculate the field representation vectors for this year only
            yfield2field_mat = dataframe2bipartite(
                df=ydf,
                rowname='SourceFieldId',
                colname='TargetFieldId',
                shape=(Nfields, Nfields),
                weightname='SourcePubFieldContribution')

            # now compute the distance matrix for this year only
            distance_matrix = pairwise_distances(yfield2field_mat,
                                                 metric=field_distance_metric)
            nnzrow, nnzcol = np.nonzero(distance_matrix)
            for isource, itarget in zip(nnzrow, nnzcol):
                if isource < itarget:
                    distance_df.append([
                        int2field[isource], int2field[itarget], y,
                        distance_matrix[isource, itarget]
                    ])

        distance_df = pd.DataFrame(
            distance_df,
            columns=['iFieldId', 'jFieldId', year_col, 'FieldDistance'])

    else:

        field2field_mat = spsparse.coo_matrix((Nfields, Nfields))

        # process the references in chunks of one million rows
        nref = int(pub2ref_df.shape[0] / 10.0**6) + 1
        for itab in range(nref):
            tabdf = pub2ref_df.iloc[itab * 10**6:(itab + 1) * 10**6]

            tabdf = tabdf.merge(
                pub2field_df,
                how='left',
                left_on='SourceId',
                right_on='PublicationId').rename(
                    columns={
                        'FieldId': 'SourceFieldId',
                        'PubFieldContribution': 'SourcePubFieldContribution'
                    })
            del tabdf['PublicationId']

            tabdf = tabdf.merge(
                pub2field_df,
                how='left',
                left_on='TargetId',
                right_on='PublicationId').rename(
                    columns={
                        'FieldId': 'TargetFieldId',
                        'PubFieldContribution': 'TargetPubFieldContribution'
                    })
            del tabdf['PublicationId']

            # drop any citation relationships for which we don't have field information
            tabdf.dropna(inplace=True)

            # we need to use integer ids to map to the matrix
            tabdf[['SourceFieldId', 'TargetFieldId'
                   ]] = tabdf[['SourceFieldId', 'TargetFieldId']].astype(int)

            # in the field2field distance matrix, the weighted contribution from a source publication in multiple fields
            # is the product of the source and target contributions
            tabdf['SourcePubFieldContribution'] = tabdf[
                'SourcePubFieldContribution'] * tabdf[
                    'TargetPubFieldContribution']

            # calculate the field representation vectors
            field2field_mat += dataframe2bipartite(
                df=tabdf,
                rowname='SourceFieldId',
                colname='TargetFieldId',
                shape=(Nfields, Nfields),
                weightname='SourcePubFieldContribution')

        # now compute the distance matrix
        distance_matrix = pairwise_distances(field2field_mat,
                                             metric=field_distance_metric)
        sources, targets = np.nonzero(distance_matrix)
        for isource, itarget in zip(sources, targets):
            if isource < itarget:
                distance_df.append([
                    int2field[isource], int2field[itarget],
                    distance_matrix[isource, itarget]
                ])

        distance_df = pd.DataFrame(
            distance_df, columns=['iFieldId', 'jFieldId', 'FieldDistance'])

    return distance_df
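
A usage sketch producing the long-format distance table (toy data; the pyscisci helpers plus sklearn and scipy.sparse as spsparse are assumed in scope):

import pandas as pd

pub2ref_df = pd.DataFrame({'CitingPublicationId': [1, 1, 2],
                           'CitedPublicationId': [3, 4, 3],
                           'CitingYear': [2000, 2000, 2001]})
pub2field_df = pd.DataFrame({'PublicationId': [1, 2, 3, 4],
                             'FieldId': [100, 100, 200, 300]})

distance_df = field_citation_distance(pub2ref_df, pub2field_df, temporal=True)
# columns: iFieldId, jFieldId, CitingYear, FieldDistance
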
Example #10
def cocitation_network(pub2ref_df,
                       focus_pub_ids=None,
                       focus_constraint='citing',
                       temporal=False,
                       show_progress=False):
    """
    Create the co-citation network.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the co-citation network.

    focus_constraint : str, default 'citing'
        If focus_pub_ids is not None:
            'citing' : the focus_pub_ids defines the citation set, giving only the co-citations between the references
                of the publications from this set.
            'cited' : the focus_pub_ids defines the co-citation node set.
            'egocited' : the focus_pub_ids defines a seed set, such that all other publications must have been co-cited with
                at least one publication from this set.

    temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-citation network

        If temporal == True:
            A dictionary with key for each year, and value of the adjacency matrix for the cocitation network induced
            by citing publications in that year.

    pub2int, dict
        A mapping of PublicationIds to the row/column of the adjacency matrix.

    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'egocited':
            # take all publications that cite one of the publications in `focus_pub_ids'
            focus_citing_pubs = np.sort(pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values,
                focus_pub_ids)]['CitingPublicationId'].unique())
            # then take all the links that have a citing publication from the `focus_citing_pubs'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_citing_pubs)]
            del focus_citing_pubs

    pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId'], inplace=True)

    if pub2ref_df.shape[0] > 0:
        #  map cited publications to the rows of the bipartite adj mat
        cited2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitedPublicationId'].unique()))
        }
        Ncited = pub2ref_df['CitedPublicationId'].nunique()

        pub2ref_df['CitedPublicationId'] = [
            cited2int[pid] for pid in pub2ref_df['CitedPublicationId'].values
        ]

        #  map citing publications to the columns of the bipartite adj mat
        citing2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitingPublicationId'].unique()))
        }
        Nciting = pub2ref_df['CitingPublicationId'].nunique()

        pub2ref_df['CitingPublicationId'] = [
            citing2int[pid] for pid in pub2ref_df['CitingPublicationId'].values
        ]

        if temporal:
            years = np.sort(pub2ref_df['CitingYear'].unique())

            temporal_adj = {}
            for y in years:
                bipartite_adj = dataframe2bipartite(
                    pub2ref_df.loc[pub2ref_df['CitingYear'] == y],
                    'CitedPublicationId', 'CitingPublicationId',
                    (Ncited, Nciting))

                adj_mat = project_bipartite_mat(bipartite_adj,
                                                project_to='row')

                # remove diagonal entries
                adj_mat.setdiag(0)
                adj_mat.eliminate_zeros()

                temporal_adj[y] = adj_mat

            return temporal_adj, cited2int

        else:
            bipartite_adj = dataframe2bipartite(pub2ref_df,
                                                'CitedPublicationId',
                                                'CitingPublicationId',
                                                (Ncited, Nciting))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            return adj_mat, cited2int

    else:
        # no citation links survived the filtering; return an empty network
        return spsparse.coo_matrix((0, 0)), {}
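
A usage sketch for the temporal variant (toy data; dataframe2bipartite, project_bipartite_mat, isin_sorted, and check4columns are assumed importable from pyscisci):

import pandas as pd

pub2ref_df = pd.DataFrame({'CitingPublicationId': [4, 4, 5, 5],
                           'CitedPublicationId': [1, 2, 1, 2],
                           'CitingYear': [2000, 2000, 2001, 2001]})

temporal_adj, cited2int = cocitation_network(pub2ref_df, temporal=True)
# temporal_adj[2000] holds the co-citations induced by publications citing in 2000
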
Example #11
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         temporal=False,
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the co-authorship network.

    focus_constraint : str, default 'authors'
        If focus_author_ids is not None:
            'authors' : the focus_author_ids defines the node set, giving only the co-authorships between authors in the set.
            'publications' : the publication history of the focus_author_ids defines the edge set, giving the co-authorships
                where at least one author from focus_author_ids was involved.
            'ego' : the focus_author_ids defines a seed set, such that all authors must have co-authored at least one
                publication with an author from focus_author_ids, but co-authorships are also found between the
                second-order author sets.

    temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-authorship network

        If temporal == True:
            A dictionary with key for each year, and value of the adjacency matrix for the co-authorship network induced by publications in that year.

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    required_columns = ['AuthorId', 'PublicationId']
    if temporal:
        required_columns.append('Year')
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                            focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values,
                focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

    paa_df.drop_duplicates(subset=['AuthorId', 'PublicationId'], inplace=True)

    #  map authors to the rows of the bipartite adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    Nauthors = paa_df['AuthorId'].nunique()

    paa_df['AuthorId'] = [author2int[aid] for aid in paa_df['AuthorId'].values]

    #  map publications to the columns of the bipartite adj mat
    pub2int = {
        pid: i
        for i, pid in enumerate(np.sort(paa_df['PublicationId'].unique()))
    }
    Npubs = paa_df['PublicationId'].nunique()

    paa_df['PublicationId'] = [
        pub2int[pid] for pid in paa_df['PublicationId'].values
    ]

    if temporal:
        years = np.sort(paa_df['Year'].unique())

        temporal_adj = {}
        for y in years:
            bipartite_adj = dataframe2bipartite(
                paa_df.loc[paa_df['Year'] == y], 'AuthorId', 'PublicationId',
                (Nauthors, Npubs))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            temporal_adj[y] = adj_mat

        return temporal_adj, author2int

    else:
        bipartite_adj = dataframe2bipartite(paa_df, 'AuthorId',
                                            'PublicationId', (Nauthors, Npubs))

        adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        return adj_mat, author2int
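
A usage sketch for the temporal co-authorship network (toy data; the pyscisci helpers are assumed importable):

import pandas as pd

paa_df = pd.DataFrame({'AuthorId': ['a1', 'a2', 'a2', 'a3'],
                       'PublicationId': [1, 1, 2, 2],
                       'Year': [2000, 2000, 2001, 2001]})

temporal_adj, author2int = coauthorship_network(paa_df, temporal=True)
# temporal_adj[2000] links a1-a2; temporal_adj[2001] links a2-a3
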
Example #12
def cociting_network(pub2ref_df,
                     focus_pub_ids=None,
                     focus_constraint='citing',
                     temporal=False,
                     show_progress=False):
    """
    Create the co-citing network.  Each node is a publication; two publications are linked if they cite the same article.


    Parameters
    ----------

    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications (citing/cited pairs).

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the co-citing network.

    focus_constraint : str, default 'citing'
        If focus_pub_ids is not None
            - 'citing' : the 'focus_pub_ids' defines the node set, giving only the co-citing links between
                publications in this set.
            - 'cited' : only citations to publications in 'focus_pub_ids' are used to define the co-citing links.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------

    coo_matrix
        The adjacency matrix for the co-citing network.

    pub2int, dict
        A mapping of citing PublicationIds to the row/column of the adjacency matrix.
    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values, focus_pub_ids)]

    pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId'], inplace=True)

    if pub2ref_df.shape[0] > 0:
        #  map cited publications to the rows of the bipartite adj mat
        cited2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitedPublicationId'].unique()))
        }
        Ncited = pub2ref_df['CitedPublicationId'].nunique()

        pub2ref_df['CitedPublicationId'] = [
            cited2int[pid] for pid in pub2ref_df['CitedPublicationId'].values
        ]

        #  map citing publications to the columns of the bipartite adj mat
        citing2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitingPublicationId'].unique()))
        }
        Nciting = pub2ref_df['CitingPublicationId'].nunique()

        pub2ref_df['CitingPublicationId'] = [
            citing2int[pid] for pid in pub2ref_df['CitingPublicationId'].values
        ]

        bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                            'CitingPublicationId',
                                            (Ncited, Nciting))

        adj_mat = project_bipartite_mat(bipartite_adj, project_to='col')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        # the projection is onto the columns (citing publications), so return their mapping
        return adj_mat, citing2int

    else:
        # no citation links survived the filtering; return an empty network
        return spsparse.coo_matrix((0, 0)), {}
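
A usage sketch (toy data; the returned mapping indexes the citing publications, which are the nodes of the co-citing network):

import pandas as pd

pub2ref_df = pd.DataFrame({'CitingPublicationId': [4, 4, 5],
                           'CitedPublicationId': [1, 2, 1]})

adj_mat, citing2int = cociting_network(pub2ref_df)
# publications 4 and 5 are linked because both cite publication 1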