Example #1
def get_author(authorname, numcitations=-1):
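    # Walk the author search results, switching to a fresh proxy whenever the
    # request limit is hit. When a citation count is supplied, keep the candidate
    # whose citedby value is closest to numcitations.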
    print("Searching Scholarly...")
    best_guess = None
    try:
        search_query = scholarly.search_author(authorname)

        if numcitations == -1:  # we have no citation data for this author
            while True:
                try:
                    best_guess = author_next_util(search_query)
                    break
                except StopIteration:  # end of author list reached (list has no members)
                    break
                except ZeroDivisionError:  # request limit for ip address reached
                    print("Limit Reached... setting new proxy")
                    set_new_proxy()
                    search_query = scholarly.search_author(authorname)
                except:  # timeout
                    break
        else:
            while True:
                try:

                    author = author_next_util(search_query)
                    #print(author.affiliation)
                    #print(extract_geo_from_text(author.affiliation))
                    if best_guess is None:
                        best_guess = author
                    else:
                        try:  # in case .citedby not contained
                            if abs(numcitations -
                                   author.citedby) < abs(numcitations -
                                                         best_guess.citedby):
                                best_guess = author
                        except:
                            pass
                except StopIteration:  # end of author list reached
                    break
                except ZeroDivisionError:  # request limit
                    print("Limit Reached... setting new proxy")
                    set_new_proxy()
                    search_query = scholarly.search_author(authorname)
                except:  # timeout
                    break

    except:
        return None

    return best_guess
Example #2
def fetch_citer_by_author(author):
    # first, get the author entry

    # check if item is a name or an id
    #if name
    if True:
        author_gen = scholarly.search_author(author)
    #else (if id)
    else:
        author_gen = scholarly.search_author_id(author)

    # what happens if there is more than one author?
    # check for that and throw an error
    matches = list(author_gen)

    assert (len(matches) == 1), "Author query not unique"
    author = matches[0]
    author.fill()

    # second, fetch all publications

    #authors_publications = [pub.bib['title'] for pub in author.publications]
    #print(authors_publications)

    # third, use these publications to query for citations
    citers = []
    for kk, pub in enumerate(author.publications):
        print(kk)
        # fetch the full publication entry to get citations
        pub.fill()
        citers.extend(citation.bib['title'] for citation in pub.citedby)

    return citers
Example #3
def plot_citations(author_name):
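    # Plot the affiliations of authors citing this author's first publication on
    # a world map; the sleep(45) calls throttle requests to avoid Scholar blocking.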
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')

    search_query = scholarly.search_author(author_name)
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            # pop author IDs until a non-empty one is found; skip citations without one
            author_ids = citation.bib.get('author_id', [])
            firstAuthorId = None
            while author_ids and not firstAuthorId:
                firstAuthorId = author_ids.pop()
            if not firstAuthorId:
                continue
            print(firstAuthorId)
            author = scholarly.search_author_id(firstAuthorId)
            sleep(45)
            lat, lon = get_location(author.affiliation)
            x, y = m(float(lon), float(lat))
            m.plot(x, y, marker='D')
    plt.show()
Example #4
def get_author(author_name):
    try:
        print("author_name : ",author_name)
        # Search by author name and return a generator of Author objects
        search_query = scholarly.search_author(author_name)
        # Populate the Author with information from their profile
        author = next(search_query).fill()

    except Exception as exception:
        print("exception")
        return jsonify(success=False,
                       message='author was not found',
                       status=HTTPStatus.NOT_FOUND.value,
                       detail=HTTPStatus.NOT_FOUND.description,
                       ), HTTPStatus.NOT_FOUND

    try:
        # cache author information for tests
        file_name = author_name.replace(' ', '_').lower() + ".json"
        with open("cache/" + file_name, 'w') as file:
            file.write(author.toJSON())

    except Exception as exception:
        print("exception")
        return jsonify(success=False,
                       message=str(exception),
                       status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                       detail=HTTPStatus.INTERNAL_SERVER_ERROR.description,
                       ), HTTPStatus.INTERNAL_SERVER_ERROR

    return Response(author.toJSON(), HTTPStatus.OK, mimetype='application/json')
Example #5
def quick_fetch_author(name):
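    # make_attribute is a helper defined elsewhere in this module; the mode flag
    # ('bib', 'plain', 'obj') appears to control whether the value is read from the
    # source's 'bib' dict, from the source directly, or taken as the passed object.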
    search_query = scholarly.search_author(name)
    author = scholarly.fill(next(search_query),
                            sections=['publications', 'coauthors'])
    iterator = 0

    data = {}
    publications = []
    coauthors = []

    for auth in author['coauthors']:
        coauthors.append(auth['name'])

    for pub in author['publications']:
        pub_info = {}
        make_attribute(pub_info, 'title', pub, 'bib')
        make_attribute(pub_info, 'num_citations', pub, 'plain')
        make_attribute(pub_info, 'pub_year', pub, 'bib')
        pub_info['_id'] = iterator
        iterator += 1
        publications.append(pub_info)

    make_attribute(data, 'name', author, 'plain')
    make_attribute(data, 'coauthors', coauthors, 'obj')
    make_attribute(data, 'affiliation', author, 'plain')
    make_attribute(data, 'email_domain', author, 'plain')
    make_attribute(data, 'interests', author, 'plain')
    make_attribute(data, 'citedby', author, 'plain')
    make_attribute(data, 'number_of_publications', len(publications), 'obj')
    make_attribute(data, 'publications', publications, 'obj')
    return data
Example #6
def main(author_name):
    """ Print all publications as JSON to STDOUT """
    data = {}
    data['publications'] = []

    author = scholarly.fill(next(scholarly.search_author(author_name)))
    for pub in author['publications']:
        pub_details = scholarly.fill(pub)['bib']
        data['publications'].append({
            'authors':
            reformat_coauthors(pub_details['author'].split(' and ')),
            'year':
            pub_details.get('pub_year', ''),
            'title':
            pub_details.get('title', ''),
            'journal':
            pub_details.get('journal', ''),
            'volume':
            pub_details.get('volume', ''),
            'issue':
            pub_details.get('issue', ''),
            'pages':
            pub_details.get('pages', ''),
            'citations':
            pub.get('num_citations', 0),
            'pub_url':
            pub.get('pub_url', ''),
            #'eprint_url': pub.get('pub_url', '') # seems to be same as pub_url
        })

    output_json(data)
Example #7
def find_bbe_coauthors(
        name, institution="Caltech", start=2015, verbose=True):
    search_query = scholarly.search_author(name + ", " + institution)
    author = scholarly.fill(next(search_query))

    coauthors = set()
    for i, pub in enumerate(author['publications']):
        # Make sure this is within the date range that we care about
        try:
            if int(pub['bib']['pub_year']) < start:
                # Skip this entry
                continue
            elif verbose:
                print(i, end=" ", flush=True)
        except KeyError:
            continue
            
        # Get the full data
        pub = scholarly.fill(pub)
        
        for author in pub['bib']['author'].split("and"):
            coauthors.add(author.strip())

    print("")
    return coauthors
Example #8
 def read_author_data(author_name):
     print("reading data for {0:s}...".format(author_name))
     scholarly.use_tor(9150, 9151, '')
     author = next(scholarly.search_author(author_name)).fill()
     a_data = {
         "name":
         author_name,
         "citedby":
         author.citedby,
         "citedby5y":
         author.citedby5y,
         "hindex":
         author.hindex,
         "hindex5y":
         author.hindex5y,
         "i10index":
         author.i10index,
         "i10index5y":
         author.i10index5y,
         "url_picture":
         author.url_picture,
         "pubs": [{
             "title":
             pub.bib['title'],
             "year":
             pub.bib['year'] if "year" in pub.bib else -1,
             "citedby":
             pub.citedby if hasattr(pub, "citedby") else 0,
             "link":
             pub.id_citations if hasattr(pub, "id_citations") else ""
         } for pub in author.publications]
     }
     return a_data
Example #9
def getAuthors(name, max_range=5):

    search_query = scholarly.search_author(name)
    authors_summary = []
    for i in range(0, max_range):
        result = next(search_query, None)
        if result is None:
            break
        authors_summary.append({
            "name": result.name,
            "affiliation": result.affiliation,
            "url_picture": result.url_picture,
            "id": result.id,
        })

        # these parameters aren't present in all `Author` instances,
        # so we check if they exist before adding them to the author instance
        other_params = [
            "citedby", "i10index", "hindex", "coauthors", "interests"
        ]

        for param in other_params:
            if param in dir(result):
                authors_summary[-1][param] = getattr(result, param)

    json = {"author_search_result": authors_summary}
    return json
Example #10
 def test_single_author(self):
     query = 'Steven A. Cholewiak'
     authors = [a for a in scholarly.search_author(query)]
     self.assertGreaterEqual(len(authors), 1)
     author = authors[0].fill()
     self.assertEqual(author.name, u'Steven A. Cholewiak, PhD')
     self.assertEqual(author.id, u'4bahYMkAAAAJ')
Example #11
def get_gs_data(author, titles):
    current_author = author

    search_query = scholarly.search_author(author)
    author = next(search_query).fill()

    for i in range(0, len(titles)):
        # ------------ ADD ALL INFO TO DB ------------
        cites = author.publications[i].bib['cites']
        title = author.publications[i].bib['title']
        # url = info.bib['url']
        year = author.publications[i].bib.get('year', 0)
        # authors = info.bib['author'] # => List of all authors
        # abstract = info.bib['abstract']
        # if year == 'NA':
        #     year = 0

        paper = models.Paper.objects.create(
            title=title,
            co_author=current_author,
            citations_google_scholar=int(cites),
            publication_year=int(year),
            # abstract = abstract,
            # url = url,
        )
        paper.save()
        print(f'{title}->{cites}')
Example #12
def get_schoolar_data(author_name,
                      cache_folder="scholarly",
                      affiliation='UBC'):
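    # Results are cached per author as JSON under resources/<cache_folder>;
    # later calls load the cached file instead of querying Scholar and CrossRef again.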
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):

        try:
            # Retrieve the author's data, fill-in, and print
            search_query = scholarly.search_author(
                f'{author_name} {affiliation}')
            author = scholarly.fill(next(search_query))

            # Print the titles of the author's publications
            titles = [pub['bib']['title'] for pub in author['publications']]

            final_data = []
            for title in titles:
                logger.info("Processing " + Fore.YELLOW + title +
                            Style.RESET_ALL)
                ret = get_publication(title)
                retries = 0
                while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                    retries += 1
                    msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                        ret["exception"], retries)
                    logger.info(Fore.RED + msg + Style.RESET_ALL)
                    ret = get_publication(title)
                    sleep(3)

                if ret['success']:
                    ret['original_title'] = title
                    final_data.append(ret)
                else:
                    logger.info(Fore.RED + '> Failed' + Style.RESET_ALL)

            final_data = list(
                filter(lambda k: k['result']['similarity'] >= 0.7, final_data))
            final_data = sorted(final_data,
                                key=lambda k: k['result']['similarity'],
                                reverse=True)

            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except StopIteration:
            logger.info(Fore.RED + 'no more scholar data available' +
                        Style.RESET_ALL)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except Exception as ex:
            logger.exception(str(ex))
    else:
        with open(cached, 'r') as fo:
            final_data = json.load(fo)
            from_cache = True

    return final_data, from_cache
Example #13
 def search_author(name):
     while True:
         search_query = scholarly.search_author(name)
         author = next(search_query).fill()
         if author:
             break
     return author
Example #14
 def test_search_author_filling_author_publications(self):
     """
      Download a few publications for author and check that abstracts are
      populated with lengths within the expected limits. This process
      checks the process of filling a publication that is derived
      from the author profile page.
      """
     query = 'Ipeirotis'
     authors = [a for a in scholarly.search_author(query)]
     self.assertGreaterEqual(len(authors), 1)
     author = authors[0].fill()
     # Check that we can fill without problem the first two publications
     publications = author.publications[:2]
     for i in publications:
         i.fill()
     self.assertEqual(len(publications), 2)
     abstracts_populated = [
         'abstract' in p.bib.keys() for p in publications
     ]
     # Check that all publications have the abstract field populated
     self.assertTrue(all(abstracts_populated))
     # Check that the abstracts have reasonable lengths
     abstracts_length = [len(p.bib['abstract']) for p in publications]
     abstracts_check = [1000 > n > 500 for n in abstracts_length]
     self.assertTrue(all(abstracts_check))
Example #15
def get_all_coauthors(author_name, min_year, max_year, max_coauthors,
                      include_no_year):
    """ Get a set of all coauthors """
    author = scholarly.fill(next(scholarly.search_author(author_name)))
    all_coauthors = set()
    for pub in author['publications']:
        # Evaluate if publication year is indicated (if not, ignore depending
        # on presence of --include_no_year flag)
        if 'pub_year' in pub['bib']:
            pub_year = int(pub['bib']['pub_year'])
        elif include_no_year:
            pub_year = max_year
        else:
            pub_year = min_year - 1

        # Evaluate whether publication falls within indicated timerange
        if min_year <= pub_year <= max_year:
            coauthors = scholarly.fill(pub)['bib']['author'].split(' and ')
            # Evaluate if number of coauthors meets optional threshold
            if len(coauthors) <= max_coauthors:
                for coauthor in coauthors:
                    all_coauthors.add(reformat_name(coauthor))
    return all_coauthors
Example #16
def getAuthorsPublications(name, _range=None):
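    # Walk author.publications from the end of the list and return up to
    # _range entries (default 5) as plain dicts built from each publication's bib.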
    search_query = scholarly.search_author(name)
    author = next(search_query).fill(["publications"])
    author_pubs = author.publications

    #determine range
    if _range is not None:
        try:
            _range = min(int(_range), len(author_pubs))
        except:
            json = {"message": "Invalid range argument."}
            return json
    else:
        _range = 5

    #create publications array
    pubs = []
    for i in range(0, _range):
        try:
            bib = author_pubs[len(author_pubs) - i - 1].fill().bib
        except:
            bib = author_pubs[len(author_pubs) - i - 1].bib

        pub = {
            "title": bib.get("title", "unknown"),
            "author": bib.get("author", "unknown"),
            "summary": bib.get("abstract", "Summary not provided."),
            "year": bib.get("year", "unknown"),
            "url": bib.get("url", "#")
        }
        pubs.append(pub)

    #return json object
    json = {"publications": pubs}
    return json
Example #17
def scholarlyBookAuthor():
    # try:
    query = request.form['query']
    search_query = scholarly.search_author(query)
    string = '['
    author = next(search_query).fill()
    for pub in author.publications:
        search_book = scholarly.search_pubs(pub.bib['title'])
        book = next(search_book)
        # print (book)
        url = ''
        try:
            url = book['blb']
        except:
            url = ''

        print(url)
        # print (basename(url))
    # for i in range(5):
    #     try:
    #         author = next(search_query)
    #         string += str(author) + ","
    #     except:
    #         print('')
    #
    # if (len(string) > 0):
    #     string = string[:-1]
    #     string=string+"]"

    return (str(string))
Example #18
 def test_search_author_multiple_authors(self):
     """
     As of May 12, 2020 there are at least 24 'Cattanis's listed as authors
     and Giordano Cattani is one of them
     """
     authors = [a.name for a in scholarly.search_author('cattani')]
     self.assertGreaterEqual(len(authors), 24)
     self.assertIn(u'Giordano Cattani', authors)
Example #19
 def test_search_author_single_author(self):
     query = 'Steven A. Cholewiak'
     authors = [a for a in scholarly.search_author(query)]
     self.assertGreaterEqual(len(authors), 1)
     author = authors[0].fill()
     self.assertEqual(author.name, u'Steven A. Cholewiak, PhD')
     self.assertEqual(author.id, u'4bahYMkAAAAJ')
     pub = author.publications[2].fill()
     self.assertEqual(pub.id_citations, u'4bahYMkAAAAJ:ufrVoPGSRksC')
Example #20
 def test_search_author_single_author(self):
     query = 'Steven A. Cholewiak'
     authors = [a for a in scholarly.search_author(query)]
     self.assertGreaterEqual(len(authors), 1)
     author = scholarly.fill(authors[0])
     self.assertEqual(author['name'], u'Steven A. Cholewiak, PhD')
     self.assertEqual(author['scholar_id'], u'4bahYMkAAAAJ')
     pub = scholarly.fill(author['publications'][2])
     self.assertEqual(pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC')
Example #21
 def _search_author_by_name(self, name):
     authors = []
     while True:
         try:
             authors = [a for a in scholarly.search_author(name)]
             #print("Retrieved authors")
             return authors
         except Exception as e:
             print("trying new proxy")
             self._set_new_proxy()
Example #22
def proxied_search_author(author):
    while True:
        try:
            search_query = scholarly.search_author(author)
            print("Got the results of the query")
            return search_query
        except Exception as e:
            print(e)
            print("Trying new proxy")
            set_new_proxy()
Example #23
def add_search():
    search_form = SearchForm(formdata=request.args)

    academics = []

    if search_form.search.data:
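        # islice caps the query at the first 10 matching author profiles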
        for a in islice(scholarly.search_author(search_form.search.data), 0, 10):
            academics.append(a)

    return render_template("ui/add.html", academics=academics, search_form=search_form)
Example #24
    def get(self):

        parser = reqparse.RequestParser()
        parser.add_argument('name', required=True)

        name = parser.parse_args()['name']

        search_query = scholarly.search_author(name)
        author = next(search_query).fill()
        cites_per_year = author.cites_per_year
        return {"cites_per_year": cites_per_year}
Example #25
def get_submitter_info(submitter, paper_title):
    try:
        query = scholarly.search_author(submitter)
        while True:
            author = next(query).fill()
            for pub in author.publications:
                title_match_score = SequenceMatcher(a=pub.bib["title"], b=paper_title).ratio()
                if title_match_score >= 0.9:
                    return author
    except:
        return None
Example #26
def save_csv():
    _file = open("output.csv", "w+")
    search_query = scholarly.search_author("Mayken Espinoza-Andaluz")
    author = scholarly.fill(next(search_query))
    _file.write("title|authors|year|abstract\n")
    for pub in author["publications"]:

        title = pub["bib"]["title"]
        year = pub["bib"]["pub_year"]
        abstract = pub["bib"]["title"]
        _file.write(f"{title}|{authors}|{year}|{abstract}\n")
    _file.close()
Example #27
def get_author(author,university=""):
  url_part = "https://scholar.google.co.in/citations?user="******", "+university if university!='' else ''))
  try:
    authorResult = next(authorSearch)
  except:
    return "Not Found"
  authorRaw = scholarly.fill(authorResult,sections=['basics','indices','publications'])
  authorDetails = {'name':authorRaw['name'],'affiliation':authorRaw['affiliation'],'email_domain':authorRaw['email_domain'],'interests':authorRaw['interests']
                  ,'publications':len(authorRaw['publications']),'citedby':authorRaw['citedby'],'hindex':authorRaw['hindex'],'i10index':authorRaw['i10index']
                  ,'gscholar_url':url_part+authorRaw['scholar_id']}
  return authorDetails
Example #28
def fetch_citations(author,
                    filesave="citations.json",
                    proxy="",
                    proxy_list=""):
    """ Fetch citations from google scholar using scholarly """
    if proxy != "":
        print("Setting up proxy ", proxy)
        scholarly.use_proxy(scholarly.SingleProxy(http=proxy, https=proxy))
    if proxy_list != "":
        lproxies = [line.strip() for line in open(proxy_list, 'r')]

        def proxy_gen():
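            # Called by scholarly whenever it needs a fresh proxy; raises
            # IndexError once the list is exhausted.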
            if proxy_gen.counter >= len(lproxies):
                raise IndexError("We ran out of proxies...")
            proxy = lproxies[proxy_gen.counter]
            if not proxy.startswith("http"):
                proxy = "http://" + proxy
            proxy_gen.counter += 1
            return proxy

        proxy_gen.counter = 0
        scholarly.use_proxy(proxy_gen)

    print("Looking up " + author)
    search = scholarly.search_author(author)
    author = scholarly.fill(next(search))
    publications = []

    for i, pub in enumerate(author['publications']):
        cites = pub['num_citations']  # often this gets messed up upon .fill()
        if "pub_year" in pub['bib']:
            pubyear = pub['bib'][
                "pub_year"]  # also this gets messed up upon .fill()
            pub = scholarly.fill(pub)
            pub['bib']["pub_year"] = pubyear
        else:
            pub = scholarly.fill(pub)
            if not "pub_year" in pub.bib:
                # skip publications that really don't have a year,
                # they probably are crap that was picked up by the search robot
                continue

        pub['num_citations'] = cites
        print("Fetching: " + str(i) + "/" + str(len(author['publications'])) +
              ": " + pub['bib']["title"] + " (" + str(pub['bib']["pub_year"]) +
              ")")
        pub['bib'].pop("abstract", None)
        pub.pop("source", None)
        publications.append(pub)
    f = open(filesave, "w")
    f.write(json.dumps(publications))
    f.close()
Example #29
def get_author(search):
    """
        Queries google scholar to find an author given a 
        search string. If != 0 results are found it gives an error
    """

    authors =  list(scholarly.search_author(search))
    if len(authors) > 1:
        raise ValueError(f'Found >1 authors with search string: {searc}, try something more specifc')
    elif not authors:
        raise ValueError(f'Could not find authors with search string: {search}')

    return authors[0].fill(sections=['basics', 'indices', 'publications'])
Example #30
def test_scholar():
    ''' test de la connexion google scholar '''
    pp = pprint.PrettyPrinter(2)
    tab_author = ['xxxx bidule', 'Steven A. Cholewiak']
    for author in tab_author:
        generator = scholarly.search_author(author)
        doc = get_author(generator)
        if doc is None:
            print('Author not found:' + author)
        else:
            print('Author found:' + author)
            pp.pprint(doc)
    '''