def get_schoolar_data(author_name,
                      cache_folder="scholarly",
                      affiliation='UBC'):
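    """Fetch an author's publications from Google Scholar and enrich each
    title via the CrossRef API, keeping only matches with similarity >= 0.7.

    Results are cached as JSON under resources/ in the given cache_folder.
    Returns a tuple (final_data, from_cache).
    """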
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
        try:
            # Retrieve and fill in the author's Google Scholar record
            search_query = scholarly.search_author(
                f'{author_name} {affiliation}')
            author = scholarly.fill(next(search_query))

            # Collect the titles of the author's publications
            titles = [pub['bib']['title'] for pub in author['publications']]

            for title in titles:
                logger.info("Processing " + Fore.YELLOW + title +
                            Style.RESET_ALL)
                ret = get_publication(title)
                retries = 0
                while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                    retries += 1
                    msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                        ret["exception"], retries)
                    logger.info(Fore.RED + msg + Style.RESET_ALL)
                    sleep(3)
                    ret = get_publication(title)

                if ret['success']:
                    ret['original_title'] = title
                    final_data.append(ret)
                else:
                    logger.info(Fore.RED + '> Failed' + Style.RESET_ALL)

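            # Keep only confident CrossRef matches and rank them best-first.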
            final_data = list(
                filter(lambda k: k['result']['similarity'] >= 0.7, final_data))
            final_data = sorted(final_data,
                                key=lambda k: k['result']['similarity'],
                                reverse=True)

            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except StopIteration:
            logger.info(Fore.RED + 'no more scholar data available' +
                        Style.RESET_ALL)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except Exception as ex:
            logger.exception(str(ex))
    else:
        with open(cached, 'r') as fo:
            final_data = json.load(fo)
            from_cache = True

    return final_data, from_cache


def wikidata_keywords(author_name, keyword_authors, authors_keywords):
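    """Collect the Wikidata main subjects of an author's cached Google Scholar
    publications and record them in the keyword_authors and authors_keywords
    mappings (both are mutated in place).
    """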
    author_file = format_author(author_name)
    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "resources")

    gdata = load_google_data(author_file, resources_folder)

    for row in gdata:
        try:
            title = None
            if row['title']:
                title = next(iter(row['title']))
            if not title and 'original_title' in row:
                title = row['original_title']

            if not title:
                continue

            main_subject = get_publication_subject(
                author_name, row, title=title)  # main subject keywords
            if not main_subject:
                continue

            for keyword in main_subject:
                keyword_authors[keyword].append(author_name)
                authors_keywords[author_name].append(keyword)

        except Exception as ex:
            logger.exception(ex)


def publications_info(author_name, test=False):
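    """Merge the cached Google Scholar and CrossRef publications for an
    author, look each DOI up on Wikidata, and cache which items already exist
    there and which are missing, together with the author's Wikidata id.
    """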
    logger.info('\nProcessing' + Fore.YELLOW + f' {author_name}' + Style.RESET_ALL)

    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "..", "resources"
    )
    author_file = format_author(author_name)
    output_folder = os.path.join(resources_folder, "wikidata")
    cached = os.path.join(output_folder, author_file)

    if not os.path.isfile(cached):
        publication_lst = merge_sources(
            load_google_data(author_file, resources_folder),
            load_crossref_data(author_file, resources_folder)
        )
        logger.info("\tFetching publications on " + Fore.YELLOW + 'Wikidata' + Style.RESET_ALL)

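        # Partition publications by whether Wikidata already has an item for the DOI.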
        in_wikidata, not_in_wikidata = [], []
        for publication in publication_lst:
            data = wikidata_api.get_publication(publication["DOI"])
            if data:
                in_wikidata.append(data)
            else:
                not_in_wikidata.append(publication)

        author_wikidata_id = get_wikidata_author_id(author_name, in_wikidata)
        detailed_wikidata_lst = get_wikidata_detailed_publications(in_wikidata)

        final_data = dict(
            author=author_name,
            wikidata_id=author_wikidata_id,
            wikidata=detailed_wikidata_lst,
            missing_data=not_in_wikidata
        )

        if not test:
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        logger.info(Fore.YELLOW + '> done' + Style.RESET_ALL)
    else:
        logger.info(Fore.YELLOW + '> already in cache' + Style.RESET_ALL)


def wikidata_import(author_name, test=False):
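    """Build claim dictionaries for an author from the cached Wikidata file:
    Len (English label), P31 (instance of), P106 (occupation), P463 (member
    of) and P2561 (name). Returns an empty list when no cache is available.
    """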
    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "resources")
    author_file = format_author(author_name)
    output_folder = os.path.join(resources_folder, "wikidata")
    cached = os.path.join(output_folder, author_file)

    result = []
    if os.path.isfile(cached):
        with open(cached) as fo:
            data = json.load(fo)
            new_entry = dict(
                Len=data['author'],  # label (English)
                P31='Q5',  # instance of = human
                P106='Q1650915',  # occupation = researcher
                P463='Q106489997',  # member of = Future Water Cluster
                P2561=data['author'],  # name
            )
            result.append(new_entry)

    return result


def publications_info(author_name, test=False):
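    """Fetch an author's publications from the CrossRef API (similarity
    threshold 0.90) and cache the result as JSON under resources/crossref,
    returning the cached data when it already exists.
    """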
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", "crossref")
    cached = os.path.join(output_folder, format_author(author_name))

    if not os.path.isfile(cached):
        logger.info('Processing' + Fore.YELLOW + f' {author_name}' +
                    Style.RESET_ALL)
        logger.info("Fetching author " + Fore.YELLOW + 'on crossref' +
                    Style.RESET_ALL)
        author_data = crossref_api.get_author(author_name,
                                              max_results=1000,
                                              sim_threshold=0.90)

        with open(cached, 'w') as fo:
            json.dump(author_data, fo, indent=4, sort_keys=True)
        return author_data
    else:
        # logger.info(Fore.YELLOW + f'loaded {author_name} from cache' + Style.RESET_ALL)
        with open(cached, 'r') as fo:
            return json.load(fo)