Пример #1
0
    def define_search_sources(self, verbose=False):
        """Define .search_sources.

        A source is selected when it is listed in the scientist's main
        field, is of the same type as one of the scientist's own sources in
        that field, and is not listed in any field outside `self.fields`.
        The scientist's own sources are always included.

        Parameters
        ----------
        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        Returns
        -------
        self
            The object itself; the sorted list of (source ID, source name)
            tuples is stored in `self._search_sources`.
        """
        # Get list of source IDs of scientist's own sources
        own_source_ids, _ = zip(*self.sources)
        # Select sources in scientist's main field
        df = self.field_source
        same_field = df["asjc"] == self.main_field[0]
        # Select sources of same type as those in scientist's main field
        same_sources = same_field & df["source_id"].isin(own_source_ids)
        main_types = set(df[same_sources]["type"])
        same_type = same_field & df["type"].isin(main_types)
        source_ids = df[same_type]["source_id"].unique()
        selected = df[df["source_id"].isin(source_ids)].copy()
        # Concatenate all field codes of a source into one space-separated
        # string.  Only the "asjc" column is aggregated, so other columns
        # (which may not be summable) are never touched.
        selected["asjc"] = selected["asjc"].astype(str) + " "
        grouped = selected.groupby("source_id")["asjc"].sum().to_frame()
        # Deselect sources with alien fields
        has_alien_field = grouped["asjc"].apply(
            lambda s: any(int(x) not in self.fields for x in s.split()))
        grouped = grouped[~has_alien_field]
        sources = {(s, self.source_names.get(s)) for s in grouped.index}
        # Add own sources
        sources.update(self.sources)
        # Finalize
        self._search_sources = sorted(sources)
        # Sort the types so that the message is reproducible across runs
        types = "; ".join(sorted(main_types))
        text = "Found {} sources matching main field {} and type(s) {}".format(
            len(self._search_sources), self.main_field[0], types)
        custom_print(text, verbose)
        return self
Пример #2
0
    def define_search_group(self,
                            stacked=False,
                            verbose=False,
                            refresh=False,
                            ignore_first_id=False):
        """Define search_group.

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or you want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        refresh : bool (optional, default=False)
            Whether to refresh cached search files.

        ignore_first_id : bool (optional, default=False)
            If True, the authors in the first year of publication of the
            scientist are not selected based on their Author ID but based on
            their surname and first name.  Reset to False (with a warning)
            when `self.period` is not set.

        Returns
        -------
        self
            The object itself; the sorted author IDs are stored in
            `self._search_group`.

        Raises
        ------
        Exception
            If no search sources have been defined yet.
        """
        # Sources must be known before authors can be collected from them
        if not self.search_sources:
            raise Exception("No search sources defined.  Please run "
                            ".define_search_sources() first.")

        # Store the flag, falling back to False when no period is set
        self._ignore_first_id = ignore_first_id
        if ignore_first_id and not self.period:
            self._ignore_first_id = False
            warn("ignore_first_id set back to False: period is None or "
                 "the first year of the period is before the first year "
                 "of publication of the scientist.")

        # Query journals
        today, then, negative = search_group_from_sources(
            self=self, stacked=stacked, refresh=refresh, verbose=verbose)

        # Authors of the active year, optionally restricted to those who
        # also published around the scientist's first year
        if self._ignore_first_id:
            candidates = today
        else:
            candidates = today.intersection(then)

        # Never propose the scientist or any coauthor
        negative.update(str(i) for i in self.identifier)
        negative.update(str(i) for i in self.coauthors)

        # Finalize
        self._search_group = sorted(candidates - negative)
        custom_print("Found {:,} authors for search_group".format(
            len(self._search_group)), verbose)
        return self
Пример #3
0
    def inform_matches(self, fields=None, verbose=False, refresh=False):
        """Add information to matches to aid in selection process.

        The work is delegated to the module-level function of the same
        name; its result replaces `self._matches`.

        Parameters
        ----------
        fields : iterable (optional, default=None)
            Which information to provide.  Allowed values are "first_name",
            "surname", "first_year", "num_coauthors", "num_publications",
            "num_citations", "num_coauthors_period", "num_publications_period",
            "num_citations_period", "subjects", "affiliation_country",
            "affiliation_id", "affiliation_name", "affiliation_type",
            "language", "num_cited_refs".  If None (or empty), all
            allowed fields are used.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        refresh : bool (optional, default=False)
            Whether to refresh cached results (if they exist) or not.  The
            value is passed on unchanged to the module-level function.

        Notes
        -----
        Matches including corresponding information are available through
        property `.matches`.

        Raises
        ------
        Exception
            If no matches have been found yet.
        ValueError
            If `fields` contains invalid keywords.
        """
        # Matches must exist before they can be enriched
        if not self._matches:
            raise Exception(
                "No matches defined.  Please run .find_matches() first.")

        allowed_fields = [
            "first_name", "surname", "first_year",
            "num_coauthors", "num_publications", "num_citations",
            "num_coauthors_period", "num_publications_period",
            "num_citations_period", "subjects",
            "affiliation_country", "affiliation_id",
            "affiliation_name", "affiliation_type", "language",
            "num_cited_refs",
        ]
        if not fields:
            # Nothing requested explicitly: provide everything
            fields = allowed_fields
        else:
            unknown = [name for name in fields if name not in allowed_fields]
            if unknown:
                raise ValueError("Parameter fields contains invalid keywords: "
                                 + ", ".join(unknown))

        custom_print(
            f"Providing information for {len(self._matches):,} matches...",
            verbose)
        # Inside the method body this name resolves to the module-level
        # function, not to this method
        self._matches = inform_matches(self, fields, verbose, refresh)
Пример #4
0
    def define_search_sources(self, verbose=False):
        """Define .search_sources.

        Within the list of search sources sosia will search for matching
        scientists.  A search source is of the same main field as
        the original scientist, the same types (journal, conference
        proceeding, etc.), and must not be related to fields alien to the
        original scientist.

        Parameters
        ----------
        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        Returns
        -------
        self
            The object itself; the sorted list of (source ID, source name)
            tuples is stored in `self._search_sources`.
        """
        df = self.field_source
        # Sources in scientist's main field
        same_field = df["asjc"] == self.main_field[0]
        # Types of Scientist's sources
        own_source_ids, _ = zip(*self.sources)
        same_sources = df["source_id"].isin(own_source_ids)
        main_types = df[same_sources]["type"].unique()
        same_type = df["type"].isin(main_types)
        # Select source IDs
        selected_ids = df[same_field & same_type]["source_id"].unique()
        selected = df[df["source_id"].isin(selected_ids)].copy()
        # Concatenate all field codes of a source into one space-separated
        # string.  Only the "asjc" column is aggregated, so other columns
        # (which may not be summable) are never touched.
        selected["asjc"] = selected["asjc"].astype(int).astype(str) + " "
        grouped = (selected.groupby("source_id")["asjc"]
                           .sum()
                           .to_frame())
        # Deselect sources with alien fields
        grouped["asjc"] = grouped["asjc"].astype(str).str.split().apply(set)
        fields = {str(f) for f in self.fields}
        no_alien_field = grouped["asjc"].apply(lambda s: len(s - fields) == 0)
        grouped = grouped[no_alien_field]
        # Add source names
        sources = {(s, self.source_names.get(s)) for s in grouped.index}
        # Add own sources
        sources.update(self.sources)
        # Finalize
        self._search_sources = sorted(sources)
        text = f"Found {len(sources):,} sources matching main field "\
               f"{self.main_field[0]} and source type(s) {'; '.join(main_types)}"
        custom_print(text, verbose)
        return self
Пример #5
0
    def find_matches(self, stacked=False, verbose=False, refresh=False):
        """Find matches within search_group based on five criteria:
        1. Started publishing in about the same year
        2. Has about the same number of publications in the treatment year
        3. Has about the same number of coauthors in the treatment year
        4. Has about the same number of citations in the treatment year
        5. Works in the same field as the scientist's main field

        The search itself is delegated to the module-level function of the
        same name; the sorted result is stored in `self._matches`.

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or you want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        refresh : bool (optional, default=False)
            Whether to refresh cached results (if they exist) or not. If int
            is passed and stacked=False, results will be refreshed if they are
            older than that value in number of days.  A non-boolean value
            combined with stacked=True is replaced by False with a warning.

        Raises
        ------
        Exception
            If no search group has been defined yet.

        Notes
        -----
        Matches are available through property `.matches`.
        """
        # A search group is required to look for matches
        if not self.search_group:
            raise Exception("No search group defined.  Please run "
                            ".define_search_group() first.")
        # Stacked queries only accept a boolean refresh flag
        if stacked and not isinstance(refresh, bool):
            refresh = False
            warn("refresh parameter must be boolean when stacked=True.  "
                 "Continuing with refresh=False.")

        # Inside the method body this name resolves to the module-level
        # function, not to this method
        matches = find_matches(self, stacked, verbose, refresh)
        custom_print(
            f"Found {len(matches):,} author(s) matching all criteria",
            verbose)
        self._matches = sorted(matches)
Пример #6
0
    def define_search_group(self, stacked=False, verbose=False, refresh=False):
        """Define search_group.

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or you want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        refresh : bool (optional, default=False)
            Whether to refresh cached results (if they exist) or not.

        Returns
        -------
        self
            The object itself; the sorted author IDs are stored in
            `self._search_group`.

        Raises
        ------
        Exception
            If no search sources have been defined yet.
        """
        # Checks
        if not self.search_sources:
            text = "No search sources defined.  Please run "\
                   ".define_search_sources() first."
            raise Exception(text)

        # Query journals
        params = {
            "original": self,
            "stacked": stacked,
            "refresh": refresh,
            "verbose": verbose
        }
        search_group = search_group_from_sources(**params)

        # Remove own IDs and coauthors.  IDs are compared through their
        # string form so that the removal works no matter whether the group,
        # the identifiers or the coauthors hold integers or strings; a plain
        # set difference silently removes nothing when the types differ.
        excluded = {str(i) for i in self.identifier}
        excluded.update(str(i) for i in self.coauthors)
        search_group = {a for a in search_group if str(a) not in excluded}

        # Finalize
        self._search_group = sorted(search_group)
        text = f"Found {len(search_group):,} authors for search_group"
        custom_print(text, verbose)
        return self
Пример #7
0
def search_group_from_sources(self, stacked, verbose, refresh=False):
    """Define groups of authors based on publications from a set of sources.

    Parameters
    ----------
    self : sosia.Original
        The object of the Scientist to search information for.

    stacked : bool
        If True, query whole source-years and go through the cache helpers
        (`sources_in_cache`, `query_year`, `cache_insert`); otherwise query
        each source individually with `query_journal`.

    verbose : bool
        Whether to report on the progress of the process.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    Returns
    -------
    today, then, negative : set
        Set of authors publishing in three periods: During the year of
        treatment, during years to match on, and during years before the
        first publication.  `then` stays empty when
        `self._ignore_first_id` is set.  In the non-stacked case
        `negative` additionally contains authors that appear more often
        than the upper publication margin allows.
    """
    # Filtering variables
    min_year = self.first_year - self.year_margin
    max_year = self.first_year + self.year_margin
    if self.period:
        _margin_setter = self.publications_period
    else:
        _margin_setter = self.publications
    max_pubs = max(margin_range(len(_margin_setter), self.pub_margin))
    years = list(range(min_year, max_year + 1))
    search_years = [min_year - 1]
    if not self._ignore_first_id:
        search_years.extend(range(min_year, max_year + 1))
    search_sources, _ = zip(*self.search_sources)

    # Verbose variables
    n = len(search_sources)
    text = "Searching authors for search_group in {} sources...".format(n)
    custom_print(text, verbose)
    today = set()
    then = set()
    negative = set()

    if stacked:  # Make use of SQL cache
        # Year provided (select also based on location)
        # Get already cached sources from cache
        sources_ay = DataFrame(list(product(search_sources,
                                            [self.active_year])),
                               columns=["source_id", "year"])
        _, _search = sources_in_cache(sources_ay, refresh=refresh, afid=True)
        res = query_year(self.active_year,
                         _search.source_id.tolist(),
                         refresh,
                         verbose,
                         afid=True)
        cache_insert(res, table="sources_afids")
        sources_ay, _ = sources_in_cache(sources_ay,
                                         refresh=refresh,
                                         afid=True)
        # Authors publishing in provided year and locations
        mask = None
        if self.search_affiliations:
            mask = sources_ay.afid.isin(self.search_affiliations)
        today = flat_set_from_df(sources_ay, "auids", mask)
        # Years before active year
        # Get already cached sources from cache
        sources_ys = DataFrame(list(product(search_sources, search_years)),
                               columns=["source_id", "year"])
        _, sources_ys_search = sources_in_cache(sources_ys, refresh=refresh)
        missing_years = set(sources_ys_search.year.tolist())
        # Eventually add information for missing years to cache
        for y in missing_years:
            mask = sources_ys_search.year == y
            _sources_search = sources_ys_search[mask].source_id.tolist()
            res = query_year(y, _sources_search, refresh, verbose)
            cache_insert(res, table="sources")
        # Get full cache
        sources_ys, _ = sources_in_cache(sources_ys, refresh=False)
        # Authors publishing in year(s) of first publication
        if not self._ignore_first_id:
            # Series.between() includes both bounds by default; the boolean
            # form of `inclusive` is no longer accepted by recent pandas
            mask = sources_ys.year.between(min_year, max_year)
            then = flat_set_from_df(sources_ys, "auids", mask)
        # Authors with publications before
        mask = sources_ys.year < min_year
        negative = flat_set_from_df(sources_ys, "auids", mask)
    else:
        auth_count = []
        print_progress(0, n, verbose)
        for i, source_id in enumerate(search_sources):
            info = query_journal(source_id, [self.active_year] + years,
                                 refresh)
            today.update(info[str(self.active_year)])
            if not self._ignore_first_id:
                for y in years:
                    then.update(info[str(y)])
            # Everybody publishing before the first admissible year
            for y in range(int(min(info.keys())), min_year):
                negative.update(info[str(y)])
            # Collect author occurrences up to the active year
            for y in info:
                if int(y) <= self.active_year:
                    auth_count.extend(info[str(y)])
            print_progress(i + 1, n, verbose)
        # Authors occurring more often than the upper margin are excluded
        c = Counter(auth_count)
        negative.update({a for a, npub in c.items() if npub > max_pubs})

    return today, then, negative
Пример #8
0
def filter_pub_counts(group,
                      ybefore,
                      yupto,
                      npapers,
                      yfrom=None,
                      verbose=False):
    """Filter authors based on restrictions in the number of
    publications in different periods.

    Parameters
    ----------
    group : list
        Scopus IDs of authors to be filtered.  They are converted to int.

    ybefore : int
        Year to be used as first year. Publications on this year and before
        need to be 0.

    yupto : int
        Year up to which to count publications.

    npapers : list
        List of count of publications, minimum and maximum.

    yfrom : int (optional, default=None)
        If provided, publications are counted only after this year.
        Publications are still set to 0 before ybefore.

    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.

    Returns
    -------
    group : list of int
        Scopus IDs filtered.

    pubs_counts : list of int
        List of count of publications within the period provided for authors
        in group.

    older_authors : list of int
        Scopus IDs filtered out because have publications before ybefore.

    Notes
    -----
    It uses cached values first, and searches for more data if needed.
    Every count obtained from a query is written to the "author_size"
    cache table.
    """
    group = [int(x) for x in group]
    years_check = [ybefore, yupto]
    if yfrom:
        years_check.append(yfrom - 1)
    authors = DataFrame(list(product(group, years_check)),
                        columns=["auth_id", "year"],
                        dtype="int64")
    authors_size = author_size_in_cache(authors)
    au_skip = []
    group_tocheck = list(group)
    older_authors = []
    pubs_counts = []
    # use information in cache
    if not authors_size.empty:
        # authors that can be already removed because older
        mask = ((authors_size.year <= ybefore) & (authors_size.n_pubs > 0))
        remove = (authors_size[mask]["auth_id"].drop_duplicates().tolist())
        older_authors.extend(remove)
        au_remove = list(remove)
        # remove if number of pubs in year is in any case too small
        mask = ((authors_size.year >= yupto) &
                (authors_size.n_pubs < min(npapers)))
        remove = (authors_size[mask]["auth_id"].drop_duplicates().tolist())
        au_remove.extend(remove)
        # authors with no pubs before min year
        mask = (((authors_size.year == ybefore) & (authors_size.n_pubs == 0)))
        au_ok_miny = (authors_size[mask]["auth_id"].drop_duplicates().tolist())
        # check publications in range
        if yfrom:
            # adjust count by subtracting the count before period; keep
            # only authors for which it is possible
            mask = (authors_size.year == yfrom - 1)
            # explicit copy: the frame is modified below and must not be
            # treated as a view on authors_size
            authors_size_bef = authors_size[mask].copy()
            authors_size_bef["year"] = yupto
            authors_size_bef.columns = ["auth_id", "year", "n_pubs_bef"]
            bef_auth = set(authors_size_bef["auth_id"])
            mask = ((authors_size["auth_id"].isin(bef_auth)) &
                    (authors_size["year"] == yupto))
            authors_size = authors_size[mask]
            authors_size = authors_size.merge(authors_size_bef,
                                              "left",
                                              on=["auth_id", "year"])
            authors_size = authors_size.fillna(0)
            authors_size["n_pubs"] -= authors_size["n_pubs_bef"]
        # authors that can be already removed because of pubs count
        mask = (((authors_size.year >= yupto) &
                 (authors_size.n_pubs < min(npapers))) |
                ((authors_size.year <= yupto) &
                 (authors_size.n_pubs > max(npapers))))
        remove = (authors_size[mask]["auth_id"].drop_duplicates().tolist())
        au_remove.extend(remove)
        # authors with pubs count within the range before the given year
        mask = (((authors_size.year == yupto) &
                 (authors_size.n_pubs >= min(npapers))) &
                (authors_size.n_pubs <= max(npapers)))
        au_ok_year = authors_size[mask][["auth_id",
                                         "n_pubs"]].drop_duplicates()
        # authors ok (match both conditions)
        au_ok = list(set(au_ok_miny).intersection(set(au_ok_year["auth_id"])))
        mask = au_ok_year["auth_id"].isin(au_ok)
        pubs_counts = au_ok_year[mask]["n_pubs"].tolist()
        # authors that match only the first condition, but the second is
        # not known, can skip the first condition check.  Sets are built
        # once so that each membership test is O(1).
        removed = set(au_remove)
        removed_or_ok = removed.union(au_ok)
        au_skip = [x for x in au_ok_miny if x not in removed_or_ok]
        group = [x for x in group if x not in removed]
        skip_or_ok = set(au_skip).union(au_ok)
        group_tocheck = [x for x in group if x not in skip_or_ok]
    text = "Left with {} authors based on size information already in "\
           "cache.\n{} to check\n".format(len(group), len(group_tocheck))
    custom_print(text, verbose)
    # Verify that publications before minimum year are 0
    if group_tocheck:
        text = "Searching through characteristics of {:,} authors...".format(
            len(group_tocheck))
        custom_print(text, verbose)
        print_progress(0, len(group_tocheck), verbose)
        # Iterate over a copy because group_tocheck shrinks in the loop
        to_loop = list(group_tocheck)
        for i, au in enumerate(to_loop):
            q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, ybefore + 1)
            size = base_query("docs", q, size_only=True)
            tp = (au, ybefore, size)
            cache_insert(tp, table="author_size")
            print_progress(i + 1, len(to_loop), verbose)
            if size != 0:
                group.remove(au)
                group_tocheck.remove(au)
                older_authors.append(au)
        text = "Left with {} authors based on size information before "\
               "minium year\n Filtering based on size query before "\
               "provided year\n".format(len(group))
        custom_print(text, verbose)
    # Verify that publications before the given year fall in range
    group_tocheck.extend(au_skip)
    n = len(group_tocheck)
    if group_tocheck:
        text = "Searching through characteristics of {:,} authors".format(n)
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        for i, au in enumerate(group_tocheck):
            q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, yupto + 1)
            n_pubs_yupto = base_query("docs", q, size_only=True)
            tp = (au, yupto, n_pubs_yupto)
            cache_insert(tp, table="author_size")
            # Eventually decrease publication count; the extra query is
            # skipped when the author is below the minimum anyway
            if yfrom and n_pubs_yupto >= min(npapers):
                q = "AU-ID({}) AND PUBYEAR BEF {}".format(au, yfrom)
                n_pubs_yfrom = base_query("docs", q, size_only=True)
                tp = (au, yfrom - 1, n_pubs_yfrom)
                cache_insert(tp, table="author_size")
                n_pubs_yupto -= n_pubs_yfrom
            if n_pubs_yupto < min(npapers) or n_pubs_yupto > max(npapers):
                group.remove(au)
            else:
                pubs_counts.append(n_pubs_yupto)
            print_progress(i + 1, n, verbose)
    return group, pubs_counts, older_authors
Пример #9
0
def search_group_from_sources(original,
                              stacked=False,
                              verbose=False,
                              refresh=False):
    """Collect the authors that form the search group of a scientist.

    Parameters
    ----------
    original : sosia.Original
        The object of the Scientist to search information for.

    stacked : bool (optional, default=False)
        Whether to use fewer queries that are not reusable, or to use modular
        queries of the form "SOURCE-ID(<SID>) AND PUBYEAR IS <YYYY>".

    verbose : bool (optional, default=False)
        Whether to report on the progress of the process.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    Returns
    -------
    group : set of int
        Authors publishing in the year of treatment (restricted to
        `original.search_affiliations` when these are set) who did not
        publish in the year before the first-year window.  Unless
        `original.first_year_name_search` is set, they must also have
        published within the window around the first publication year.
    """
    # Options shared by both author retrievals
    fetch_options = {"refresh": refresh, "stacked": stacked,
                     "verbose": verbose}
    name_search = original.first_year_name_search

    search_sources, _ = zip(*original.search_sources)
    custom_print(
        f"Defining 'search_group' using up to {len(search_sources):,} sources...",
        verbose)

    # Authors publishing in the active year
    sources_today = pd.DataFrame(
        product(search_sources, [original.active_year]),
        columns=["source_id", "year"])
    auth_today = get_authors_from_sourceyear(
        sources_today, original.sql_conn, **fetch_options)
    affiliations = original.search_affiliations
    if affiliations:
        affiliation_mask = auth_today["afid"].isin(affiliations)
    else:
        affiliation_mask = None
    today = flat_set_from_df(auth_today, "auids", condition=affiliation_mask)

    # Window around the year of first publication, preceded by one extra
    # year used to detect authors who started earlier
    min_year = original.first_year - original.first_year_margin
    max_year = original.first_year + original.first_year_margin
    then_years = [min_year - 1]
    if not name_search:
        then_years += range(min_year, max_year + 1)
    sources_then = pd.DataFrame(product(search_sources, then_years),
                                columns=["source_id", "year"])
    auth_then = get_authors_from_sourceyear(
        sources_then, original.sql_conn, **fetch_options)
    in_window = auth_then["year"].between(min_year, max_year)
    then = flat_set_from_df(auth_then, "auids", condition=in_window)

    # Drop authors that were already active before the window
    too_early = auth_then["year"] < min_year
    before = flat_set_from_df(auth_then, "auids", condition=too_early)
    today.difference_update(before)

    # Compile group
    if name_search:
        group = today
    else:
        group = today.intersection(then)
    return {int(a) for a in group}
Пример #10
0
def find_matches(original, stacked, verbose, refresh):
    """Find matches within the search group.

    Candidates in `original.search_group` are screened in successive
    rounds: main field and minimum number of publications, publication
    counts, citation count, coauthor count and year of first publication,
    and (only if `original.period` is set) coauthors and citations within
    the period.  If `original.search_affiliations` is set, the remaining
    candidates are finally screened by affiliation.

    Parameters
    ----------
    original : sosia.Original()
        The object containing information for the original scientist to
        search for.  Attribute search_group needs to exist.

    stacked : bool
        Whether to combine searches in few queries or not.  Cached
        files will most likely not be reusable.  Set to True if you
        query in distinct fields or you want to minimize API key usage.

    verbose : bool
        Whether to report on the progress of the process.

    refresh : bool
        Whether to refresh cached results (if they exist) or not. If int
        is passed and stacked=False, results will be refreshed if they are
        older than that value in number of days.

    Returns
    -------
    matches : list
        Author IDs of the candidates that passed all filters.
    """
    # Variables
    _years = range(original.first_year - original.first_year_margin,
                   original.first_year + original.first_year_margin + 1)
    _npapers = margin_range(len(original.publications), original.pub_margin)
    _max_papers = max(_npapers)
    _ncits = margin_range(original.citations, original.cits_margin)
    _max_cits = max(_ncits)
    _ncoauth = margin_range(len(original.coauthors), original.coauth_margin)
    _max_coauth = max(_ncoauth)
    if original.period:
        # The upper bounds above keep referring to the full career, while
        # the ranges themselves are narrowed to the period
        _npapers = margin_range(len(original.publications_period),
                                original.pub_margin)
        _ncits = margin_range(original.citations_period, original.cits_margin)
        _ncoauth = margin_range(len(original.coauthors_period),
                                original.coauth_margin)
    text = "Searching through characteristics of "\
           f"{len(original.search_group):,} authors..."
    custom_print(text, verbose)
    conn = original.sql_conn

    # First round of filtering: minimum publications and main field
    # create df of authors
    authors = get_authors(original.search_group,
                          original.sql_conn,
                          verbose=verbose)
    same_field = authors['areas'].str.startswith(original.main_field[1])
    enough_pubs = authors['documents'].astype(int) >= int(min(_npapers))
    group = sorted(authors[same_field & enough_pubs]["auth_id"].tolist())
    text = f"Left with {len(group):,} authors with sufficient "\
           "number of publications and same main field"
    custom_print(text, verbose)

    # Second round of filtering:
    # Check having no publications before minimum year, and if 0, the
    # number of publications in the relevant period.
    params = {
        "group": group,
        "ybefore": min(_years) - 1,
        "yupto": original.year,
        "npapers": _npapers,
        "yfrom": original._period_year,
        "verbose": verbose,
        "conn": conn
    }
    group, _, _ = filter_pub_counts(**params)
    # Screen out profiles with too many publications over the full period
    if original.period:
        params.update({
            "npapers": [1, _max_papers],
            "yfrom": None,
            "group": group
        })
        group, _, _ = filter_pub_counts(**params)
    text = f"Left with {len(group):,} researchers"
    custom_print(text, verbose)

    # Third round of filtering: citations (in the FULL period)
    authors = pd.DataFrame({"auth_id": group, "year": original.year})
    auth_cits, missing = retrieve_author_info(authors, conn, "author_ncits")
    if not missing.empty:
        total = missing.shape[0]
        text = f"Counting citations of {total:,} authors..."
        custom_print(text, verbose)
        missing['n_cits'] = 0
        print_progress(0, total, verbose)
        start = 0
        # Track the row position separately from the index label: the
        # label addresses the cell, the position drives batching
        for pos, (idx, au) in enumerate(missing.iterrows()):
            n_cits = count_citations([str(au['auth_id'])], original.year + 1)
            missing.at[idx, 'n_cits'] = n_cits
            print_progress(pos + 1, total, verbose)
            # Store intermediate results so that an interrupted run does
            # not lose the counts gathered so far
            if pos % 100 == 0 or pos == total - 1:
                insert_data(missing.iloc[start:pos + 1],
                            conn,
                            table="author_ncits")
                # Next batch begins after the last row already stored
                start = pos + 1
    auth_cits = pd.concat([auth_cits, missing])
    auth_cits['auth_id'] = auth_cits['auth_id'].astype("uint64")
    # Keep if citations are in range
    custom_print("Filtering based on count of citations...", verbose)
    mask = auth_cits["n_cits"].between(min(_ncits), _max_cits)
    group = auth_cits[mask]['auth_id'].tolist()

    # Fourth round of filtering: Download publications, verify coauthors
    # (in the FULL period) and first year
    text = f"Left with {len(group):,} authors\nFiltering based on "\
           "coauthor count..."
    custom_print(text, verbose)
    authors = pd.DataFrame({"auth_id": group, "year": original.year},
                           dtype="uint64")
    _, author_year_search = retrieve_author_info(authors, conn, "author_year")

    if not author_year_search.empty:
        q = Template(f"AU-ID($fill) AND PUBYEAR BEF {original.year + 1}")
        auth_year_group = author_year_search["auth_id"].tolist()
        params = {
            "group": auth_year_group,
            "template": q,
            "refresh": refresh,
            "joiner": ") OR AU-ID(",
            "q_type": "docs",
            "verbose": verbose,
            "stacked": stacked
        }
        res = stacked_query(**params)
        res = build_dict(res, auth_year_group)
        if res:
            # res can become empty after build_dict if a au_id is old
            res = pd.DataFrame.from_dict(res, orient="index")
            res["year"] = original.year
            res = res[["year", "first_year", "n_pubs", "n_coauth"]]
            res.index.name = "auth_id"
            res = res.reset_index()
            insert_data(res, original.sql_conn, table="author_year")
    authors_year, _ = retrieve_author_info(authors, conn, "author_year")
    # Check for number of coauthors within margin
    mask = authors_year["n_coauth"].between(min(_ncoauth), _max_coauth)
    # Check for year of first publication within range
    if not original.first_year_name_search:
        same_start = authors_year["first_year"].between(
            min(_years), max(_years))
        mask = mask & same_start
    # Filter
    matches = sorted(authors_year[mask]["auth_id"].tolist())

    text = f"Left with {len(matches)} authors"
    custom_print(text, verbose)
    if original.period:
        text = "Filtering based on citations and coauthor count during period..."
        custom_print(text, verbose)
        # Further screen matches based on period cits and coauths;
        # iterate over a copy because matches is modified in the loop
        for m in list(matches):
            res = base_query("docs",
                             f"AU-ID({m})",
                             refresh=refresh,
                             fields=["eid", "author_ids", "coverDate"])
            pubs = [
                p for p in res if
                original._period_year <= int(p.coverDate[:4]) <= original.year
            ]
            coauths = set(extract_authors(pubs)) - {str(m)}
            if not (min(_ncoauth) <= len(coauths) <= max(_ncoauth)):
                matches.remove(m)
                continue
            eids_period = [p.eid for p in pubs]
            n_cits = count_citations(eids_period, original.year + 1, [str(m)])
            if not (min(_ncits) <= n_cits <= max(_ncits)):
                matches.remove(m)

    # Eventually filter on affiliations
    if original.search_affiliations:
        text = "Filtering based on affiliations..."
        custom_print(text, verbose)
        matches[:] = [
            m for m in matches if same_affiliation(original, m, refresh)
        ]
    return matches
Пример #11
0
    def inform_matches(self,
                       fields=None,
                       verbose=False,
                       refresh=False,
                       stop_words=None,
                       **tfidf_kwds):
        """Enrich the stored matches with additional information.

        Validates `fields`, delegates the work to the module-level
        function of the same name and stores its result in `._matches`.

        Parameters
        ----------
        fields : iterable (optional, default=None)
            Which information to provide.  Allowed values are "first_name",
            "surname", "first_year", "num_coauthors", "num_publications",
            "num_citations", "num_coauthors_period",
            "num_publications_period", "num_citations_period", "subjects",
            "country", "affiliation_id", "affiliation", "language",
            "reference_sim", "abstract_sim".  If None (or empty), all
            allowed values are used.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        refresh : bool (optional, default=False)
            Whether to refresh cached results (if they exist) or not. If int
            is passed and stacked=False, results will be refreshed if they are
            older than that value in number of days.

        stop_words : list (optional, default=None)
            A list of words that should be filtered in the analysis of
            abstracts.  If None uses the list of English stopwords
            by nltk, augmented with numbers and interpunctuation.

        tfidf_kwds : keywords
            Parameters to pass to TfidfVectorizer from the sklearn package
            for abstract vectorization.  See
            https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            for possible values.

        Notes
        -----
        Matches including corresponding information are available through
        property `.matches`.

        Raises
        ------
        Exception
            If no matches have been defined yet.

        ValueError
            If fields contains invalid keywords.
        """
        # Nothing to enrich unless matches were found beforehand
        if not self._matches:
            raise Exception(
                "No matches defined.  Please run .find_matches() first.")

        known = [
            "first_name", "surname", "first_year", "num_coauthors",
            "num_publications", "num_citations", "num_coauthors_period",
            "num_publications_period", "num_citations_period", "subjects",
            "country", "affiliation_id", "affiliation", "language",
            "reference_sim", "abstract_sim"
        ]
        if not fields:
            # No selection given: provide everything
            fields = known
        else:
            unknown = [name for name in fields if name not in known]
            if unknown:
                raise ValueError(
                    "Parameter fields contains invalid keywords: " +
                    ", ".join(unknown))

        custom_print("Providing additional information...", verbose)
        # Refers to the module-level function, not to this method
        self._matches = inform_matches(self, fields, stop_words, verbose,
                                       refresh, **tfidf_kwds)
Пример #12
0
def query_pubs_by_sourceyear(source_ids,
                             year,
                             stacked=False,
                             refresh=False,
                             verbose=False):
    """Get authors lists for each source in a particular year.

    Parameters
    ----------
    source_ids : list
        List of Scopus IDs of sources to search.

    year : str or int
        The year of the search.

    stacked : bool (optional, default=False)
        Whether to use fewer queries that are not reusable, or to use modular
        queries of the form "SOURCE-ID(<SID>) AND PUBYEAR IS <YYYY>".

    refresh : bool (optional, default=False)
        Whether to refresh cached files if they exist, or not.

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    res : pd.DataFrame
        One row per combination of source, year and affiliation, with the
        author IDs joined by ";" in column "auids".  An empty DataFrame
        with columns "source_id", "year", "auids" and "afid" is returned
        when there is nothing to report.
    """
    # Dummy return value
    columns = ["source_id", "year", "auids", "afid"]
    dummy = pd.DataFrame(columns=columns)

    # Search authors
    msg = f"... parsing Scopus information for {year}..."
    custom_print(msg, verbose)
    q = Template(f"SOURCE-ID($fill) AND PUBYEAR IS {year}")
    params = {
        "group": [str(x) for x in sorted(source_ids)],
        "joiner": " OR ",
        "refresh": refresh,
        "q_type": "docs",
        "template": q,
        "verbose": verbose,
        "stacked": stacked
    }
    res = stacked_query(**params)

    # Verify data is not empty
    if not res:
        return dummy
    res = pd.DataFrame(res).dropna(subset=["author_ids"])
    if res.empty:
        return dummy

    # Group data
    res = expand_affiliation(res)
    if res.empty:
        return dummy
    res["year"] = year
    # Terminate each entry so that concatenated entries stay separated
    res["author_ids"] = res["author_ids"] + ";"
    grouping_cols = ["source_id", "year", "afid"]
    # Concatenate explicitly rather than passing the builtin sum to
    # apply(), which only works while pandas replaces it with its own
    # reduction (deprecated behavior)
    res = (res.groupby(grouping_cols)["author_ids"]
              .agg("".join)
              .reset_index()
              .rename(columns={"author_ids": "auids"}))
    res["auids"] = res["auids"].str.strip(";")
    return res
Пример #13
0
    def find_matches(self,
                     stacked=False,
                     verbose=False,
                     stop_words=STOPWORDS,
                     information=True,
                     refresh=False,
                     **tfidf_kwds):
        """Find matches within search_group based on five criteria:
        1. Started publishing in about the same year
        2. Has about the same number of publications in the year of treatment
        3. Has about the same number of coauthors in the year of treatment
        4. Has about the same number of citations in the year of treatment
        5. Works in the same field as the scientist's main field

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or you want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        stop_words : list (optional, default=STOPWORDS)
            A list of words that should be filtered in the analysis of
            abstracts.  Default list is the list of english stopwords
            by nltk, augmented with numbers and interpunctuation.

        information : bool or iterable (optional, default=True)
            Whether to return additional information on the matches that may
            help in the selection process.  If an iterable of keywords is
            provided, only return information for these keywords.  Allowed
            values are "first_name", "surname", "first_year",
            "num_coauthors", "num_publications", "num_citations",
            "num_coauthors_period", "num_publications_period",
            "num_citations_period", "subjects", "country", "affiliation_id",
            "affiliation", "language", "reference_sim", "abstract_sim".
            The iterable itself is not modified.  If search_affiliations
            is set, "affiliation_id" is always requested because the final
            affiliation filter needs it.

        refresh : bool (optional, default=False)
            Whether to refresh cached search files.

        tfidf_kwds : keywords
            Parameters to pass to TfidfVectorizer from the sklearn package
            for abstract vectorization.  Not used when `information=False` or
            or when "abstract_sim" is not in `information`.  See
            https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            for possible values.

        Returns
        -------
        matches : list
            A list of Scopus IDs of scientists matching all the criteria (if
            information is False) or a list of namedtuples with the Scopus ID
            and additional information (if information is True).

        Raises
        ------
        ValueError
            If information is not bool and contains invalid keywords.
        """
        # Checks
        info_keys = [
            "first_name", "surname", "first_year", "num_coauthors",
            "num_publications", "num_citations", "num_coauthors_period",
            "num_publications_period", "num_citations_period", "subjects",
            "country", "affiliation_id", "affiliation", "language",
            "reference_sim", "abstract_sim"
        ]
        if isinstance(information, bool):
            if information:
                keywords = info_keys
            elif self.search_affiliations:
                # The affiliation filter at the end needs this field
                keywords = ["affiliation_id"]
            else:
                keywords = None
        else:
            # Work on a copy so the caller's iterable is never mutated
            keywords = list(information)
            invalid = [x for x in keywords if x not in info_keys]
            if invalid:
                text = "Parameter information contains invalid keywords: " +\
                       ", ".join(invalid)
                raise ValueError(text)
            if self.search_affiliations and "affiliation_id" not in keywords:
                keywords.append("affiliation_id")
        # Variables
        _years = range(self.first_year - self.year_margin,
                       self.first_year + self.year_margin + 1)
        # Ranges over the full career; always defined because the coauthor
        # filter below uses them even when no period is set
        _npapers_full = margin_range(len(self.publications), self.pub_margin)
        _ncits_full = margin_range(self.citations, self.cits_margin)
        _ncoauth_full = margin_range(len(self.coauthors), self.coauth_margin)
        if self.period:
            _npapers = margin_range(len(self.publications_period),
                                    self.pub_margin)
            _ncits = margin_range(self.citations_period, self.cits_margin)
            _ncoauth = margin_range(len(self.coauthors_period),
                                    self.coauth_margin)
        else:
            _npapers = _npapers_full
            _ncits = _ncits_full
            _ncoauth = _ncoauth_full
        n = len(self.search_group)
        text = "Searching through characteristics of {:,} authors".format(n)
        custom_print(text, verbose)

        # First round of filtering: minimum publications and main field
        # create df of authors
        authors = query_author_data(self.search_group, verbose=verbose)
        same_field = (authors.areas.str.startswith(self.main_field[1]))
        enough_pubs = (authors.documents.astype(int) >= int(min(_npapers)))
        group = authors[same_field & enough_pubs]["auth_id"].tolist()
        group.sort()
        n = len(group)
        text = "Left with {} authors\nFiltering based on provided "\
               "conditions...".format(n)
        custom_print(text, verbose)

        # Second round of filtering:
        # Check having no publications before minimum year, and if 0, the
        # number of publications in the relevant period.
        params = {
            "group": group,
            "ybefore": min(_years) - 1,
            "yupto": self.year,
            "npapers": _npapers,
            "yfrom": self.year_period,
            "verbose": verbose
        }
        group, _, _ = filter_pub_counts(**params)
        # Also screen out ids with too many publications over the full period
        if self.period:
            params.update({
                "npapers": [1, max(_npapers_full)],
                "yfrom": None,
                "group": group
            })
            group, _, _ = filter_pub_counts(**params)

        # Third round of filtering: citations (in the FULL period).
        authors = pd.DataFrame({"auth_id": group, "year": self.year})
        _, authors_cits_search = author_cits_in_cache(authors)
        text = "Search and filter based on count of citations\n{} to search "\
               "out of {}\n".format(len(authors_cits_search), len(group))
        custom_print(text, verbose)
        if not authors_cits_search.empty:
            authors_cits_search['n_cits'] = 0
            print_progress(0, len(authors_cits_search), verbose)
            for i, au in authors_cits_search.iterrows():
                q = "REF({}) AND PUBYEAR BEF {} AND NOT AU-ID({})".format(
                    au['auth_id'], self.year + 1, au['auth_id'])
                n = base_query("docs", q, size_only=True)
                authors_cits_search.at[i, 'n_cits'] = n
                print_progress(i + 1, len(authors_cits_search), verbose)
            cache_insert(authors_cits_search, table="author_cits_size")
        auth_cits_incache, _ = author_cits_in_cache(
            authors[["auth_id", "year"]])
        # keep if citations are in range; with a period the upper bound
        # refers to the full career (equal to max(_ncits) otherwise)
        mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                (auth_cits_incache.n_cits <= max(_ncits_full)))
        group = (auth_cits_incache[mask]['auth_id'].tolist())

        # Fourth round of filtering: Download publications, verify coauthors
        # (in the FULL period) and first year.
        n = len(group)
        text = "Left with {} authors\nFiltering based on coauthors "\
               "number...".format(n)
        custom_print(text, verbose)
        authors = pd.DataFrame({"auth_id": group, "year": self.year},
                               dtype="uint64")
        _, author_year_search = author_year_in_cache(authors)
        matches = []
        if stacked:  # Combine searches
            if not author_year_search.empty:
                q = Template(
                    "AU-ID($fill) AND PUBYEAR BEF {}".format(self.year + 1))
                auth_year_group = author_year_search.auth_id.tolist()
                params = {
                    "group": auth_year_group,
                    "res": [],
                    "template": q,
                    "refresh": refresh,
                    "joiner": ") OR AU-ID(",
                    "q_type": "docs"
                }
                if verbose:
                    params.update({"total": len(auth_year_group)})
                res, _ = stacked_query(**params)
                res = build_dict(res, auth_year_group)
                if res:
                    # res can become empty after build_dict if a au_id is old
                    res = pd.DataFrame.from_dict(res, orient="index")
                    res["year"] = self.year
                    res = res[["year", "first_year", "n_pubs", "n_coauth"]]
                    res.index.name = "auth_id"
                    res = res.reset_index()
                    cache_insert(res, table="author_year")
            author_year_cache, _ = author_year_in_cache(authors)
            if self._ignore_first_id:
                # only number of coauthors should be big enough
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = enough & notoomany
            elif self.period:
                # number of coauthors should be "big enough" and first year in
                # window
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = same_start & enough & notoomany
            else:
                # all restrictions apply
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                same_coauths = (author_year_cache.n_coauth.between(
                    min(_ncoauth), max(_ncoauth)))
                mask = same_start & same_coauths
            matches = author_year_cache[mask]["auth_id"].tolist()
        else:  # Query each author individually
            for i, au in enumerate(group):
                print_progress(i + 1, len(group), verbose)
                res = base_query("docs",
                                 "AU-ID({})".format(au),
                                 refresh=refresh)
                res = [
                    p for p in res
                    if p.coverDate and int(p.coverDate[:4]) <= self.year
                ]
                if not res:
                    # No dated publication up to the treatment year: the
                    # author cannot match and min() below would fail
                    continue
                # Filter
                min_year = int(min(p.coverDate[:4] for p in res))
                authids = [p.author_ids for p in res if p.author_ids]
                coauthor_ids = {a for p in authids for a in p.split(";")}
                n_coauth = len(coauthor_ids) - 1  # Subtract 1 for focal author
                # NOTE(review): the first two conditions compare against
                # max(_ncoauth) whereas the stacked branch above uses
                # min(_ncoauth) as lower bound, and authors passing the
                # first condition still fall through to the first-year
                # checks - confirm the intended semantics.
                if self._ignore_first_id and (n_coauth < max(_ncoauth)):
                    # only number of coauthors should be big enough
                    continue
                elif (self.period and ((n_coauth < max(_ncoauth)) or
                                       (min_year not in _years))):
                    # number of coauthors should be "big enough" and first year
                    # in window
                    continue
                elif ((len(res) not in _npapers) or (min_year not in _years)
                      or (n_coauth not in _ncoauth)):
                    continue
                matches.append(au)

        if self.period:
            text = "Left with {} authors\nFiltering based on exact period "\
                   "citations and coauthors...".format(len(matches))
            custom_print(text, verbose)
            # Further screen matches based on period cits and coauths;
            # iterate over a copy because matches is modified in the loop
            for m in list(matches):
                res = base_query("docs",
                                 "AU-ID({})".format(m),
                                 refresh=refresh,
                                 fields=["eid", "author_ids", "coverDate"])
                pubs = [
                    p for p in res if int(p.coverDate[:4]) <= self.year
                    and int(p.coverDate[:4]) >= self.year_period
                ]
                coauths = set(get_authors(pubs)) - {str(m)}
                if not (min(_ncoauth) <= len(coauths) <= max(_ncoauth)):
                    matches.remove(m)
                    continue
                eids_period = [p.eid for p in pubs]
                cits = count_citations(search_ids=eids_period,
                                       pubyear=self.year + 1,
                                       exclusion_key="AU-ID",
                                       exclusion_ids=[str(m)])
                if not (min(_ncits) <= cits <= max(_ncits)):
                    matches.remove(m)
        text = "Found {:,} author(s) matching all criteria".format(
            len(matches))
        custom_print(text, verbose)

        # Possibly add information to matches
        if keywords and len(matches) > 0:
            custom_print("Providing additional information...", verbose)
            profiles = [
                Scientist([str(a)],
                          self.year,
                          period=self.period,
                          refresh=refresh) for a in matches
            ]
            matches = inform_matches(profiles, self, keywords, stop_words,
                                     verbose, refresh, **tfidf_kwds)
        if self.search_affiliations:
            # Keep matches sharing at least one affiliation ID with the
            # requested ones
            wanted = {str(a) for a in self.search_affiliations}
            matches = [
                m for m in matches if wanted.intersection(
                    m.affiliation_id.replace(" ", "").split(";"))
            ]
        return matches
Пример #14
0
def filter_pub_counts(group,
                      conn,
                      ybefore,
                      yupto,
                      npapers,
                      yfrom=None,
                      verbose=False):
    """Filter authors based on restrictions in the number of
    publications in different periods.

    Information already returned by `retrieve_author_info()` is used
    first; authors for which that information is missing or incomplete
    are checked one by one via `auth_npubs_retrieve_insert()`.

    Parameters
    ----------
    group : list of str or int
        Scopus IDs of authors to be filtered.  IDs are converted to int.

    conn : sqlite3 connection
        Standing connection to a SQLite3 database.

    ybefore : int
        Year to be used as first year. Publications on this year and before
        need to be 0.

    yupto : int
        Year up to which to count publications.

    npapers : list
        List of count of publications, minimum and maximum.

    yfrom : int (optional, default=None)
        If provided, publications are counted only after this year.
        Publications are still set to 0 before ybefore.

    verbose : bool (optional, default=False)
        Whether to print information on the search progress.

    Returns
    -------
    group : list of int
        Scopus IDs that passed all filters, in their original order.

    pubs_counts : list of int
        Publication counts within the period provided for the authors that
        were kept.  Counts found in the database come first, followed by
        counts obtained author by author; the order therefore does not
        necessarily follow the order of `group`.

    older_authors : list of int
        Scopus IDs filtered out because they have publications in or
        before ybefore.
    """
    from itertools import product

    from pandas import DataFrame

    group = [int(x) for x in group]
    years_check = [ybefore, yupto]
    if yfrom:
        years_check.append(yfrom - 1)
    # One row per (author, year) combination to look up
    authors = DataFrame(product(group, years_check),
                        dtype="uint64",
                        columns=["auth_id", "year"])
    auth_npubs, _ = retrieve_author_info(authors, conn, "author_pubs")
    au_skip = []
    group_tocheck = set(group)
    older_authors = []
    pubs_counts = []
    # Use information in database
    if not auth_npubs.empty:
        min_papers, max_papers = min(npapers), max(npapers)
        # Remove authors based on age
        mask = ((auth_npubs["year"] <= ybefore) & (auth_npubs["n_pubs"] > 0))
        au_remove = set(auth_npubs[mask]["auth_id"].unique())
        older_authors.extend(au_remove)
        # Remove if number of pubs in year is in any case too small
        mask = ((auth_npubs["year"] >= yupto) &
                (auth_npubs["n_pubs"] < min_papers))
        au_remove.update(auth_npubs[mask]["auth_id"])
        # Authors with no pubs before min year
        mask = ((auth_npubs["year"] == ybefore) & (auth_npubs["n_pubs"] == 0))
        au_ok_miny = set(auth_npubs[mask]["auth_id"].unique())
        # Check publications in range
        if yfrom:
            # Keep authors where subtracting publications from before period
            # from publication count is possible
            mask = auth_npubs["year"] == yfrom - 1
            rename = {"n_pubs": "n_pubs_bef"}
            auth_npubs_bef = auth_npubs[mask].copy().rename(columns=rename)
            # Relabel the year so the rows line up with the yupto counts
            auth_npubs_bef["year"] = yupto
            auth_npubs = (auth_npubs.merge(auth_npubs_bef,
                                           "inner",
                                           on=["auth_id", "year"]).fillna(0))
            auth_npubs["n_pubs"] -= auth_npubs["n_pubs_bef"]
        # Remove authors because of their publication count
        mask = (((auth_npubs["year"] >= yupto) &
                 (auth_npubs["n_pubs"] < min_papers)) |
                ((auth_npubs["year"] <= yupto) &
                 (auth_npubs["n_pubs"] > max_papers)))
        remove = auth_npubs[mask]["auth_id"]
        au_remove.update(remove)
        # Authors with pubs count within the range before the given year
        mask = (((auth_npubs["year"] == yupto) &
                 (auth_npubs["n_pubs"] >= min_papers)) &
                (auth_npubs["n_pubs"] <= max_papers))
        au_ok_year = auth_npubs[mask][["auth_id", "n_pubs"]].drop_duplicates()
        # Keep authors that match both conditions
        au_ok = au_ok_miny.intersection(au_ok_year["auth_id"].unique())
        mask = au_ok_year["auth_id"].isin(au_ok)
        pubs_counts = au_ok_year[mask]["n_pubs"].tolist()
        # Skip citation check for authors that match only the first condition,
        # with the second being unknown.  The unions are built once rather
        # than once per element.
        settled = au_remove | au_ok
        au_skip = {x for x in au_ok_miny if x not in settled}
        group = [x for x in group if x not in au_remove]
        resolved = au_skip | au_ok
        group_tocheck = {x for x in group if x not in resolved}

    # Verify that publications before minimum year are 0
    if group_tocheck:
        n = len(group_tocheck)
        text = f"Obtaining information for {n:,} authors without sufficient "\
               "information in database..."
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        too_old = set()
        # Iterate over a copy because group_tocheck shrinks inside the loop
        for i, auth_id in enumerate(list(group_tocheck)):
            npubs_ybefore = auth_npubs_retrieve_insert(auth_id, ybefore, conn)
            if npubs_ybefore:
                too_old.add(auth_id)
                group_tocheck.remove(auth_id)
                older_authors.append(auth_id)
            print_progress(i + 1, n, verbose)
        # Drop every occurrence in one pass; list.remove() would only drop
        # the first one and costs a full scan per call
        group = [x for x in group if x not in too_old]
        text = f"Left with {len(group):,} authors based on publication "\
               f"information before {ybefore}"
        custom_print(text, verbose)

    # Verify that publications before the given year fall in range
    group_tocheck.update(au_skip)
    if group_tocheck:
        min_papers, max_papers = min(npapers), max(npapers)
        n = len(group_tocheck)
        text = f"Counting publications of {n:,} authors before {yupto+1}..."
        custom_print(text, verbose)
        print_progress(0, n, verbose)
        out_of_range = set()
        for i, au in enumerate(group_tocheck):
            n_pubs_yupto = auth_npubs_retrieve_insert(au, yupto, conn)
            # Eventually decrease publication count
            if yfrom and n_pubs_yupto >= min_papers:
                n_pubs_yfrom = auth_npubs_retrieve_insert(au, yfrom - 1, conn)
                n_pubs_yupto -= n_pubs_yfrom
            if n_pubs_yupto < min_papers or n_pubs_yupto > max_papers:
                out_of_range.add(au)
            else:
                pubs_counts.append(n_pubs_yupto)
            print_progress(i + 1, n, verbose)
        group = [x for x in group if x not in out_of_range]
    return group, pubs_counts, older_authors