Example #1
def search_allengines(query, headers, _pages, _gs_pages, _acm_pages,
                      _els_pages, records, _title, _keyword, _abstract,
                      _search_yr, _from_yr, _to_yr_, logging_flag, data):
    # Search all engines
    try:
        # --- Engines for Title, Keyword and Abstract ---
        search_googleScholar(query, headers, _gs_pages, records, _title,
                             _keyword, _abstract, scrpr_api, _from_yr, _to_yr_,
                             logging_flag, data)
        search_msAcademic(query, headers, _pages, records, _title, _keyword,
                          _abstract, ms_api, _from_yr, _to_yr_, logging_flag,
                          data)
        search_core(query, headers, _pages, records, _title, _keyword,
                    _abstract, core_api, _search_yr, logging_flag, data)
        search_pubMed(query, headers, _pages, _title, _keyword, _abstract,
                      _from_yr, _to_yr_, logging_flag, data)
        search_acmlibrary(query, headers, _acm_pages, records, _title,
                          _keyword, _abstract, _from_yr, _to_yr_, logging_flag,
                          data)

        # --- Engines only for Keyword and Abstract ---
        search_PlosOne(query, headers, _pages, records, _title, _keyword,
                       _abstract, _from_yr, _to_yr_, logging_flag, data)
        search_academia(query, headers, _pages, records, _title, _keyword,
                        _abstract, _search_yr, logging_flag, data)
        search_scopus(query, headers, _els_pages, records, _title, _keyword,
                      _abstract, scp_api, _from_yr, _to_yr_, logging_flag,
                      data)
        search_springer(query, headers, _pages, records, _title, _keyword,
                        _abstract, spr_api, _search_yr, logging_flag, data)
        search_sciDirect(query, headers, _pages, records, _title, _keyword,
                         _abstract, sd1_api, sd2_api, _from_yr, _to_yr_,
                         logging_flag, data)
        return data
    except Exception as e:
        _, _, exception_traceback = sys.exc_info()
        filename = exception_traceback.tb_frame.f_code.co_filename
        line_number = exception_traceback.tb_lineno
        logger.writeError(e, None, "Search Exception :", logging_flag,
                          filename, line_number)
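
Note: every except block above recovers the failing file and line from sys.exc_info(). A minimal, self-contained sketch of that pattern; the locate_error helper name is ours, not the project's:

import sys

def locate_error():
    # Return (filename, line_number) for the exception currently being handled.
    _, _, tb = sys.exc_info()
    return tb.tb_frame.f_code.co_filename, tb.tb_lineno

try:
    1 / 0
except Exception as e:
    filename, line_number = locate_error()
    print(e, filename, line_number)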
Example #2
def search_engines(x, data=None):
    # Call Search Modules
    try:
        if len(x) != 0:

            # call the search function for all
            try:
                if 0 in x:
                    data = searchAllEngines.search_allengines(
                        query, headers, _pages, _gs_pages, _acm_pages,
                        _els_pages, records, _title, _keyword, _abstract,
                        _search_yr, _from_yr, _to_yr_, logging_flag, data)
                    SaveOutput.saveOutput(data, out, output_path)
            except Exception as e:
                _, _, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, "Search Exception :", logging_flag,
                                  filename, line_number)

            try:
                if 0 not in x:
                    data = searchSpecificEngine.search_engines(
                        x, query, headers, _pages, _gs_pages, _acm_pages,
                        _els_pages, records, _title, _keyword, _abstract,
                        _search_yr, _from_yr, _to_yr_, logging_flag, data)
                    SaveOutput.saveOutput(data, out, output_path)
            except Exception as e:
                _, _, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, "Specific Engine Search", logging_flag,
                                  filename, line_number)
        else:
            print('Select search engine!')
            sys.exit()

    except Exception as e:
        pass
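
Note: the selection convention above is that x holds the engine indices the user picked, with 0 meaning "all engines". A tiny self-contained illustration of just that dispatch rule; the dispatch helper is ours, not the project's:

def dispatch(x):
    # 0 anywhere in the selection means "search every engine".
    if len(x) == 0:
        print('Select search engine!')
        return None
    return 'all engines' if 0 in x else f'engines {sorted(x)}'

print(dispatch([0]))     # all engines
print(dispatch([2, 5]))  # engines [2, 5]
print(dispatch([]))      # Select search engine!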
Example #3
def search_sciDirect(query, headers, _pages, records, _title, _keyword, _abstract, sd1_api, sd2_api, _from_yr, _to_yr_,
                     logging_flag, data):
    if _pages > 3:
        _pages = 3

    if _title:
        url = 'https://www.sciencedirect.com/search/api?qs=%22' + query + '%22&apiKey=' + sd1_api

        # response object
        response = requests.get(url, headers={'User-agent': 'your bot 0.1'}, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')
        obj = json.loads(soup.text)

        print('Searching in Science Direct...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object
            for item in obj['searchResults']:
                try:
                    publish_date = str(item['publicationDate'])

                    # get document ID from the result first
                    doi = item['doi']

                    # call the API again with the DOI to get the attributes
                    url2 = 'https://api.elsevier.com/content/article/doi/' + doi + '?apiKey=' + sd2_api
                    response1 = requests.get(url2, headers=headers, timeout=30)
                    soup1 = BeautifulSoup(response1.content, 'lxml')
                    if "prism:Issn" and "prism:issn" "prism:eIssn" and "prism:eissn" not in soup1.find_all('coredata'):
                        issn = str(['No information found'])

                    # Find required attributes in the response object
                    for item in soup1.find_all('coredata'):
                        resp_obj = {"entities": {"Search Engine": "Science Direct Search Engine",
                                                 "Attributes found": "DOI, Title, URLs, Authors, Publication Name, "
                                                                     "ISSN, Type, Published date, Abstract",
                                                 "items": [
                                                     {"DOI": item.find_all('prism:doi')[0].get_text(),
                                                      "Title": item.find_all('dc:title')[0].get_text().strip(),
                                                      "URLs": item.find_all('prism:url')[0].get_text(),
                                                      "Authors": item.find_all('dc:creator')[0].get_text(),
                                                      "Publication Name": item.find_all('prism:publicationname')[
                                                          0].get_text(),
                                                      "ISSN": issn,
                                                      # "ISSN": item.find_all('prism:issn')[0].get_text(),
                                                      "Cited count": str(['No information found']),
                                                      "Affiliation": str(['No information found ']),
                                                      "Type": item.find_all('document-type'),
                                                      "Published date": publish_date,
                                                      "Abstract": str(item.find_all('dc:description')[
                                                                          0].get_text().strip()).replace('\n',
                                                                                                         '').replace(
                                                          '  ', '')
                                                      }
                                                 ]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                except Exception as e:
                    _, _, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename, line_number)

        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data
    if not _from_yr:
        if _keyword or _abstract:
            for j in tqdm(range(1)):
                print('Searching in Science Direct...')
                # set the counter for records count
                count = 0
                for i in range(_pages):
                    url = (f'https://api.elsevier.com/content/search/sciencedirect'
                           f'?query={query}&apiKey={sd1_api}&start={i}&count=10')

                    # response object
                    response = requests.get(url, headers={'User-agent': 'your bot 0.1'})
                    soup = BeautifulSoup(response.content, 'lxml')
                    obj = json.loads(soup.text)

                    if 'entry' in obj['search-results']:
                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                publish_date = str(item['load-date']).split('T')[0]

                                # get document ID from the result first
                                doi = item['prism:doi']

                                # call the API again with the DOI to get the attributes
                                url2 = 'https://api.elsevier.com/content/article/doi/' + doi + '?apiKey=' + sd2_api
                                response1 = requests.get(url2, headers=headers)
                                soup1 = BeautifulSoup(response1.content, 'lxml')
                                if "prism:Issn" and "prism:issn" "prism:eIssn" and "prism:eissn" not in soup1.find_all(
                                        'coredata'):
                                    issn = str(['No information found'])

                                # Find required attributes in the response object
                                for item in soup1.find_all('coredata'):
                                    resp_obj = {"entities": {"Search Engine": "Science Direct Search Engine",
                                                             "Attributes found": "DOI, Title, URLs, Authors, "
                                                                                 "Publication Name, ISSN, Type, "
                                                                                 "Published date, Abstract",
                                                             "items": [
                                                                 {"DOI": item.find_all('prism:doi')[0].get_text(),
                                                                  "Title": item.find_all('dc:title')[
                                                                      0].get_text().strip(),
                                                                  "URLs": item.find_all('prism:url')[0].get_text(),
                                                                  "Authors": item.find_all('dc:creator')[0].get_text(),
                                                                  "Publication Name":
                                                                      item.find_all('prism:publicationname')[
                                                                          0].get_text(),
                                                                  "ISSN": issn,
                                                                  # "ISSN": item.find_all('prism:issn')[0].get_text(),
                                                                  "Cited count": str(['No information found']),
                                                                  "Affiliation": str(['No information found ']),
                                                                  "Type": item.find_all('document-type'),
                                                                  "Published date": publish_date,
                                                                  "Abstract": str(item.find_all('dc:description')[
                                                                                      0].get_text().strip()).replace(
                                                                      '\n',
                                                                      '').replace(
                                                                      '  ', '')
                                                                  }
                                                             ]}}
                                    count += 1
                                    # append dict object data
                                    data.append(resp_obj)
                            except Exception as e:
                                _, _, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        if _keyword or _abstract:
            for _ in tqdm(range(1)):
                print('Searching in Science Direct...')

                # set the counter for records count
                count = 0
                for i in range(_pages):
                    url = (f'https://api.elsevier.com/content/search/sciencedirect'
                           f'?query={query}&date={_from_yr}-{_to_yr_}&apiKey={sd1_api}'
                           f'&start={i}&count=10')

                    # response object
                    response = requests.get(url, headers={'User-agent': 'your bot 0.1'})
                    soup = BeautifulSoup(response.content, 'lxml')
                    obj = json.loads(soup.text)
                    if 'entry' in obj['search-results']:
                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                publish_date = str(item['load-date']).split('T')[0]

                                # get document ID from the result first
                                doi = item['prism:doi']

                                # call the API again with the DOI to get the attributes
                                url2 = 'https://api.elsevier.com/content/article/doi/' + doi + '?apiKey=' + sd2_api
                                response1 = requests.get(url2, headers=headers)
                                soup1 = BeautifulSoup(response1.content, 'lxml')
                                if "prism:Issn" and "prism:issn" "prism:eIssn" and "prism:eissn" not in soup1.find_all(
                                        'coredata'):
                                    issn = str(['No information found'])
                                # Find required attributes in the response object
                                for item in soup1.find_all('coredata'):
                                    resp_obj = {"entities": {"Search Engine": "Science Direct Search Engine",
                                                             "Attributes found": "DOI, Title, URLs, Authors, "
                                                                                 "Publication Name, ISSN, Type, "
                                                                                 "Published date, Abstract",
                                                             "items": [
                                                                 {"DOI": item.find_all('prism:doi')[0].get_text(),
                                                                  "Title": item.find_all('dc:title')[
                                                                      0].get_text().strip(),
                                                                  "URLs": item.find_all('prism:url')[0].get_text(),
                                                                  "Authors": item.find_all('dc:creator')[0].get_text(),
                                                                  "Publication Name":
                                                                      item.find_all('prism:publicationname')[
                                                                          0].get_text(),
                                                                  "ISSN": issn,
                                                                  # "ISSN": item.find_all('prism:issn')[0].get_text(),
                                                                  "Cited count": str(['No information found']),
                                                                  "Affiliation": str(['No information found ']),
                                                                  "Type": item.find_all('document-type'),
                                                                  "Published date": publish_date,
                                                                  "Abstract": str(item.find_all('dc:description')[
                                                                                      0].get_text().strip()).replace(
                                                                      '\n',
                                                                      '').replace(
                                                                      '  ', '')
                                                                  }
                                                             ]}}
                                    count += 1
                                    # append dict object data
                                    data.append(resp_obj)
                            except Exception as e:
                                _, _, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
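
Note: the ISSN fallback above can be exercised on its own. A minimal sketch with fabricated inline XML; real <coredata> payloads come from the Elsevier article API:

from bs4 import BeautifulSoup

sample = '<coredata><prism:issn>1234-5678</prism:issn></coredata>'
soup1 = BeautifulSoup(sample, 'lxml')

# Prefer the print ISSN, then the electronic one, then a placeholder.
issn_tag = soup1.find('prism:issn') or soup1.find('prism:eissn')
issn = issn_tag.get_text() if issn_tag else str(['No information found'])
print(issn)  # 1234-5678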
Example #4
def search_academia(query, headers, _pages, records, _title, _keyword,
                    _abstract, _search_yr, logging_flag, data):
    if _title:
        print('Searching in Academia...')
        url = 'https://www.academia.edu/search?q=' + query

        # response object
        response = requests.get(url, headers=headers, timeout=30)

        count = 0
        if response.status_code == 200:  # check for ok response
            soup = BeautifulSoup(response.content, 'html.parser')

            for i in tqdm(range(1)):
                # Find required attributes in the response object
                for item in soup.find_all('div', class_='a-fadeInDown'):
                    abstract = ''
                    try:

                        # a few records don't have a summary attribute, so check first
                        summary = item.find_all('div',
                                                class_='work-card--abstract')
                        if summary:
                            abstract = summary[0].get_text()
                        else:
                            abstract = ['No information found']

                        resp_obj = {
                            "entities": {
                                "Search Engine":
                                "Academia Search Engine",
                                "Attributes found":
                                "Title, URLs, Authors, Abstract",
                                "items": [{
                                    "DOI": ['No information found'],
                                    "Title":
                                    item.find_all('div',
                                                  class_='work-card--title')
                                    [0].get_text(),
                                    "URLs":
                                    item.select('a')[0]['href'],
                                    "Authors":
                                    item.find_all(
                                        'div', class_='work-card--author-name')
                                    [0].get_text(),
                                    "Publication Name":
                                    str(
                                        item.find_all(
                                            'div',
                                            class_='work-card--publish-wrapper'
                                        )[0].get_text()).split(',', 1)[1],
                                    "ISSN": ['No information found'],
                                    "Cited count": ['No information found'],
                                    "Affiliation": ['No information found '],
                                    "Type": ['No information found'],
                                    "Published date":
                                    str(
                                        item.find_all(
                                            'div',
                                            class_='work-card--publish-wrapper'
                                        )[0].get_text()).split(',', 1)[0],
                                    "Abstract":
                                    abs
                                }]
                            }
                        }
                        count += 1
                        data.append(resp_obj)
                        # append dict object data

                    except Exception as e:
                        logger.writeError("Logging Error:" + str(e), None,
                                          _engine, logging_flag)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data

    if _keyword or _abstract:

        print('Searching in Academia...')
        if _search_yr:
            print(
                'Date parameter search either not supported or not available in this search engine!'
            )
        else:
            count = 0
            for _ in tqdm(range(1)):

                for i in range(_pages):
                    # url = 'https://www.academia.edu/search?q=' + query
                    q = query.title().replace(' ', '_')
                    url = f'https://www.academia.edu/Documents/in/{q}?page={i}'

                    # response object
                    response = requests.get(url, headers=headers, timeout=30)

                    if response.status_code == 200:  # check for ok response
                        soup = BeautifulSoup(response.content, 'html.parser')

                        # Find required attributes in the response object
                        for item in soup.find_all('div',
                                                  class_='u-borderBottom1'):
                            abstract = ''
                            try:
                                # a few records don't have a summary attribute, so check first
                                if item.select('.summarized'):
                                    abstract = item.select(
                                        '.summarized')[0].get_text()
                                elif item.select('.summary'):
                                    abstract = item.select('.summary')[0].get_text()
                                else:
                                    abstract = ['No information found']

                                resp_obj = {
                                    "entities": {
                                        "Search Engine":
                                        "Academia Search Engine",
                                        "Attributes found":
                                        "Title, URLs, Authors, Abstract",
                                        "items": [{
                                            "DOI": ['No information found'],
                                            "Title":
                                            item.select('a')[0].get_text(),
                                            "URLs":
                                            item.select('a')[0]['href'],
                                            "Authors":
                                            item.select(
                                                '.u-fw700')[0].get_text(),
                                            "Publication Name":
                                            ['No information found'],
                                            "ISSN": ['No information found'],
                                            "Cited count":
                                            ['No information found'],
                                            "Affiliation":
                                            ['No information found '],
                                            "Type": ['No information found'],
                                            "Published date":
                                            ['No information found'],
                                            "Abstract":
                                            abs
                                        }]
                                    }
                                }
                                count += 1
                                data.append(resp_obj)
                                # append dict object data

                            except Exception as e:
                                logger.writeError("Logging Error:" + str(e),
                                                  None, _engine, logging_flag)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
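
Note: both branches above guard every optional element before reading it, because Academia result cards do not always carry a summary. A standalone sketch of that guard, with fabricated inline HTML:

from bs4 import BeautifulSoup

card = BeautifulSoup(
    '<div class="u-borderBottom1"><a href="/x">Title</a></div>',
    'html.parser')

# select() returns an empty list when nothing matches, so truthiness is the guard.
node = card.select('.summarized') or card.select('.summary')
abstract = node[0].get_text() if node else 'No information found'
print(abstract)  # No information found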
Example #5
def search_googleScholar(query, headers, _gs_pages, records, _title, _keyword,
                         _abstract, scrpr_api, _from_yr, _to_yr_, logging_flag,
                         data):
    rec = 0
    if _title:
        # request url
        url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=%22' + query + '%22&btnG='

        # response object
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        print('Searching in Google Scholar...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object by checking tag [data-lid]
            for item in soup.select('[data-lid]'):
                try:
                    if bool(item.select('.gs_or_ggsm')):
                        cc = str(
                            re.findall(
                                r'\d+',
                                str(item.select('.gs_fl')
                                    [1].get_text()))).split(',', 1)[0].replace(
                                        '[', '')
                    else:
                        cc = str(
                            re.findall(
                                r'\d+',
                                str(item.select('.gs_fl')
                                    [0].get_text()))).split(',', 1)[0].replace(
                                        '[', '')

                    if item.select('.gs_ct1'):
                        pub_type = str(item.select('.gs_ct1')[0].get_text())
                    else:
                        pub_type = str(['Research Article'])

                    resp_obj = {
                        "entities": {
                            "Search Engine":
                            "Google Scholar",
                            "Attributes found":
                            "Title, URLs, Authors, Cited count, Type, Published "
                            "date, Abstract",
                            "items": [{
                                "DOI":
                                str(['No information found']),
                                "Title":
                                item.select('h3')[0].get_text(),
                                "URLs":
                                item.select('a')[0]['href'],
                                "Authors":
                                re.sub(
                                    "[^A-Za-z]", " ",
                                    str(item.select('.gs_a')
                                        [0].get_text()).split('-', 1)[0]),
                                "Publication Name":
                                str(['No information found']),
                                "ISSN":
                                str(['No information found']),
                                "Cited count":
                                cc,
                                "Affiliation":
                                str(['No information found']),
                                "Type":
                                type,
                                "Published date":
                                str(
                                    re.findall(
                                        r'\d+',
                                        str(
                                            item.select('.gs_a')
                                            [0].get_text()))).strip(),
                                "Abstract":
                                item.select('.gs_rs')[0].get_text()
                            }]
                        }
                    }
                    # append dict object data
                    count += 1
                    data.append(resp_obj)
                except Exception as e:
                    _, _, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename,
                                      line_number)
        time.sleep(1)

        print(f'Finished with total {count} records returned.')
        logger.writeRecords(query, None, _engine, "1", count, logging_flag)
        return data

    if _keyword or _abstract:
        if _gs_pages != 0:
            pages = pagination(_gs_pages)
        else:
            pages = 1

        # search for dates
        if _from_yr:

            # use of scraper api to avoid IP block issue by Google scholar
            client = ScraperAPIClient(scrpr_api)
            count = 0

            for _ in tqdm(range(1)):
                print("Searching the Google Scholar engine now, please wait...")
                url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG='

                response = client.get(url,
                                      headers={'User-agent': 'your bot 0.1'})

                if response.status_code != 200:
                    print("Request failed with status", response.status_code)
                    logger.writeError(
                        "Logging Error:" + str(response.status_code), None,
                        _engine, logging_flag)

                else:
                    soup = BeautifulSoup(response.content, 'lxml')

                    # count no of records returned by google scholar
                    for item in soup.find_all('div', class_='gs_ab_st'):
                        raw = item.find_all('div', id='gs_ab_md')[0].get_text()
                        # e.g. "About 1,230 results (0.05 sec)" -> "1230"
                        rec = raw.split(' ', 1)[1].replace(',', '').split(' ', 1)[0]

                        if _gs_pages != 0:
                            pages = pagination(_gs_pages)
                        else:
                            pages = pagination(rec)

                    # check if records are greater than 1000 or not
                    if int(pages) > 100:
                        print(
                            "NOTE: Google Scholar returns at most 1,000 records "
                            "irrespective of the total. Total records found:", rec,
                            "\n Fetching record details now...")

                        pages = 100
                        for i in range(pages):

                            url = (f'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5'
                                   f'&q={query}&as_ylo={_from_yr}&as_yhi={_to_yr_}'
                                   f'&btnG=&start={i * 10}')

                            # response = requests.get(url, proxies={"http": proxy, "https": proxy}, headers=headers)
                            response = client.get(
                                url, headers={'User-agent': 'your bot 0.1'})
                            soup = BeautifulSoup(response.content, 'lxml')
                            # Find required attributes in the response object by checking tag [data-lid]
                            for item in soup.select('[data-lid]'):
                                try:
                                    try:
                                        if bool(
                                                item.select('.gs_rs')
                                            [0].get_text()):
                                            abstract = item.select(
                                                '.gs_rs')[0].get_text()
                                        else:
                                            abstract = str(
                                                ['No information found'])
                                    except:
                                        abstract = str(
                                            ['No information found'])
                                        pass
                                    try:
                                        if bool(item.select('.gs_or_ggsm')):
                                            cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(
                                                    ',',
                                                    1)[
                                                    0].replace('[', '')
                                        else:
                                            cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(
                                                    ',',
                                                    1)[
                                                    0].replace('[', '')
                                    except:
                                        cc = str(['No information found'])
                                        pass
                                    try:
                                        if item.select('.gs_ct1'):
                                            pub_type = str(
                                                item.select('.gs_ct1')
                                                [0].get_text())
                                        else:
                                            pub_type = str(['Research Article'])
                                    except:
                                        pub_type = str(['No information found'])

                                    # response object
                                    resp_obj = {
                                        "entities": {
                                            "Search Engine":
                                            "Google Scholar",
                                            "Attributes found":
                                            "Title, URLs, Authors, Cited count, "
                                            "Type, Published date, Abstract",
                                            "items": [{
                                                "DOI":
                                                str(['No information found']),
                                                "Title":
                                                item.select('h3')
                                                [0].get_text(),
                                                "URLs":
                                                item.select('a')[0]['href'],
                                                "Authors":
                                                re.sub(
                                                    "[^A-Za-z]", " ",
                                                    str(
                                                        item.select('.gs_a')
                                                        [0].get_text()).split(
                                                            '-', 1)[0]),
                                                "Publication Name":
                                                str(['No information found']),
                                                "ISSN":
                                                str(['No information found']),
                                                "Cited count":
                                                cc,
                                                "Affiliation":
                                                str(['No information found']),
                                                "Type":
                                                type,
                                                "Published date":
                                                str(
                                                    re.findall(
                                                        r'\d+',
                                                        str(
                                                            item.select(
                                                                '.gs_a')
                                                            [0].get_text()))
                                                ).strip(),
                                                "Abstract":
                                                abstract
                                            }]
                                        }
                                    }
                                    # append dict object data
                                    count += 1
                                    data.append(resp_obj)
                                except Exception as e:
                                    _, _, exception_traceback = sys.exc_info()
                                    filename = exception_traceback.tb_frame.f_code.co_filename
                                    line_number = exception_traceback.tb_lineno
                                    logger.writeError(e, None, _engine,
                                                      logging_flag, filename,
                                                      line_number)

                    else:
                        for i in range(pages):

                            url = (f'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5'
                                   f'&q={query}&as_ylo={_from_yr}&as_yhi={_to_yr_}'
                                   f'&btnG=&start={i * 10}')

                            response = client.get(
                                url, headers={'User-agent': 'your bot 0.1'})
                            if response.status_code != 200:
                                print("Request failed with status",
                                      response.status_code)
                                logger.writeError(
                                    "Logging Error:" +
                                    str(response.status_code), None, _engine,
                                    logging_flag)

                            else:
                                soup = BeautifulSoup(response.content, 'lxml')

                                # Find required attributes in the response object by checking tag [data-lid]
                                for item in soup.select('[data-lid]'):
                                    try:
                                        try:
                                            if bool(
                                                    item.select('.gs_rs')
                                                [0].get_text()):
                                                abstract = item.select(
                                                    '.gs_rs')[0].get_text()
                                            else:
                                                abstract = str(
                                                    ['No information found'])
                                        except:
                                            abstract = str(
                                                ['No information found'])
                                            pass
                                        try:
                                            if bool(item.select(
                                                    '.gs_or_ggsm')):
                                                cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(
                                                    ',', 1)[
                                                    0].replace('[', '')
                                            else:
                                                cc = \
                                                str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(
                                                    ',', 1)[
                                                    0].replace('[', '')
                                        except:
                                            cc = str(['No information found'])
                                            pass
                                        try:
                                            if item.select('.gs_ct1'):
                                                pub_type = str(
                                                    item.select('.gs_ct1')
                                                    [0].get_text())
                                            else:
                                                pub_type = str(
                                                    ['Research Article'])
                                        except:
                                            pub_type = str(
                                                ['No information found'])

                                        resp_obj = {
                                            "entities": {
                                                "Search Engine":
                                                "Google Scholar",
                                                "Attributes found":
                                                "Title, URLs, Authors, Cited "
                                                "count, Type, Published date, "
                                                "Abstract",
                                                "items": [{
                                                    "DOI":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Title":
                                                    item.select(
                                                        'h3')[0].get_text(),
                                                    "URLs":
                                                    item.select('a')[0]
                                                    ['href'],
                                                    "Authors":
                                                    re.sub(
                                                        "[^A-Za-z]", " ",
                                                        str(
                                                            item.select(
                                                                '.gs_a')[0].
                                                            get_text()).split(
                                                                '-', 1)[0]),
                                                    "Publication Name":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "ISSN":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Cited count":
                                                    cc,
                                                    "Affiliation":
                                                    str([
                                                        'No information found'
                                                    ]),
                                                    "Type":
                                                    type,
                                                    "Published date":
                                                    str(
                                                        re.findall(
                                                            r'\d+',
                                                            str(
                                                                item.select(
                                                                    '.gs_a')
                                                                [0].get_text())
                                                        )).strip(),
                                                    "Abstract":
                                                    abstract
                                                }]
                                            }
                                        }
                                        # append dict object data
                                        count += 1
                                        data.append(resp_obj)
                                    except Exception as e:
                                        _, _, exception_traceback = sys.exc_info()
                                        filename = exception_traceback.tb_frame.f_code.co_filename
                                        line_number = exception_traceback.tb_lineno
                                        logger.writeError(
                                            e, None, _engine, logging_flag,
                                            filename, line_number)
                    time.sleep(1)

                    print(f'Finished with total {count} records returned.')
                    logger.writeRecords(query, None, _engine, rec, count,
                                        logging_flag)
                    return data

        # search without dates
        else:
            print("Searching Google Scholar Engine now please wait...")
            client = ScraperAPIClient(scrpr_api)
            count = 0
            for _ in tqdm(range(1)):
                for i in range(pages):
                    # request url
                    url = (f'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5'
                           f'&q={query}&btnG=&start={i * 10}')
                    # response object
                    response = client.get(
                        url, headers={'User-agent': 'your bot 0.1'})

                    if response.status_code != 200:
                        print("Request failed with status",
                              response.status_code)
                        logger.writeError(
                            "Logging Error:" + str(response.status_code), None,
                            _engine, logging_flag)

                    soup = BeautifulSoup(response.content, 'lxml')

                    # Find required attributes in the response object by checking tag [data-lid]
                    for item in soup.select('[data-lid]'):
                        try:

                            try:
                                if bool(item.select('.gs_rs')[0].get_text()):
                                    abstract = item.select(
                                        '.gs_rs')[0].get_text()
                                else:
                                    abstract = str(['No information found'])
                            except:
                                abstract = str(['No information found'])
                                pass
                            try:
                                if bool(item.select('.gs_or_ggsm')):
                                    cc = \
                                        str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(',', 1)[
                                            0].replace('[', '')
                                else:
                                    cc = \
                                        str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(',', 1)[
                                            0].replace('[', '')
                            except:
                                cc = str(['No information found'])
                                pass
                            try:
                                if item.select('.gs_ct1'):
                                    pub_type = str(
                                        item.select('.gs_ct1')[0].get_text())
                                else:
                                    pub_type = str(['Research Article'])
                            except:
                                pub_type = str(['No information found'])

                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "Google Scholar",
                                    "Attributes found":
                                    "Title, URLs, Authors, Cited count, Type, "
                                    "Published date, Abstract",
                                    "items": [{
                                        "DOI":
                                        str(['No information found']),
                                        "Title":
                                        item.select('h3')[0].get_text(),
                                        "URLs":
                                        item.select('a')[0]['href'],
                                        "Authors":
                                        re.sub(
                                            "[^A-Za-z]", " ",
                                            str(
                                                item.select('.gs_a')
                                                [0].get_text()).split('-',
                                                                      1)[0]),
                                        "Publication Name":
                                        str(['No information found']),
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        cc,
                                        "Affiliation":
                                        str(['No information found']),
                                        "Type":
                                        type,
                                        "Published date":
                                        str(
                                            re.findall(
                                                r'\d+',
                                                str(
                                                    item.select('.gs_a')
                                                    [0].get_text()))).strip(),
                                        "Abstract":
                                        abstract
                                    }]
                                }
                            }
                            # append dict object data
                            count += 1
                            data.append(resp_obj)
                        except Exception as e:
                            _, _, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)
            time.sleep(1)

            print(f'Finished with total {count} records returned.')
            logger.writeRecords(query, None, _engine, rec, count, logging_flag)
            return data
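
Note: pagination() is a project helper that is not shown in these examples. A plausible sketch, assuming Google Scholar lists ten results per page; the caller above separately caps the page count at 100 (Scholar's 1,000-record limit):

import math

def pagination(records):
    # Hypothetical reconstruction: ten results per page.
    return math.ceil(int(records) / 10)

print(pagination(25))    # 3
print(pagination(4700))  # 470 (the caller caps this at 100)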
Example #6
def search_core(query, headers, _pages, records, _title, _keyword, _abstract, core_api, _search_yr, logging_flag, data):
    if _title:
        print('Searching in CORE...')
        url = 'https://core.ac.uk:443/api-v2/articles/search/%22' + query + '%22?page=1&pageSize=10&apiKey=' + core_api
        # response object
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        # convert soup object into json
        obj = json.loads(soup.text)

        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            if obj['data'] is not None:

                # Find required attributes in the response object
                for item in obj['data']:
                    try:
                        if 'publisher' not in item:
                            publisher = ['No Information']
                        else:
                            publisher = item['publisher']

                        resp_obj = {"entities": {"Search Engine": "CORE Search Engine",
                                                 "Attributes found": "DOI, Title, URLs, Authors, Publication Name,"
                                                                     "Type, Published Date",
                                                 "items": [{"DOI": item['oai'],
                                                            "Title": item['title'],
                                                            "URLs": item['downloadUrl'],
                                                            "Authors": item['authors'],
                                                            "Publication Name": publisher,
                                                            "ISSN": ['No Information'],
                                                            "Cited count": ['No Information'],
                                                            "Affiliation": ['No Information'],
                                                            "Type": ['Article'],
                                                            # "Keywords": item['topics'],
                                                            "Published Date": item['datePublished'],
                                                            "Abstract": ['No Information']
                                                            }]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:  # raise e
                        pass
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            else:
                pass
                # print('error core:', e)
        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if not _search_yr:
        if _keyword or _abstract:
            print('Searching in CORE...')
            count = 0
            for i in tqdm(range(1)):
                for i in range(1, _pages + 1):
                    url = 'https://core.ac.uk:443/api-v2/search/' + query + '?page=' + str(
                        i) + '&pageSize=20&apiKey=' + core_api

                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')

                    # convert soup object into json
                    obj = json.loads(soup.text)

                    # set the counter for records count

                    if obj.get('data') is not None:

                        # Find required attributes in the response object
                        for item in obj['data']:
                            try:
                                resp_obj = {"entities": {"Search Engine": "CORE Search Engine",
                                                         "Attributes found": "DOI, Title, URLs, Authors,Publication "
                                                                             "Name, IISN, Cited Count,Type, "
                                                                             "Published Date, Abstract",
                                                         "items": [{"DOI": item['_source']['doi'],
                                                                    "Title": item['_source']['title'],
                                                                    "URLs": item['_source']['urls'],
                                                                    "Authors": item['_source']['authors'],
                                                                    "Publication Name": item['_source']['publisher'],
                                                                    "ISSN": item['_source']['issn'],
                                                                    "Cited count": item['_source']['citationCount'],
                                                                    "Affiliation": ['No Information'],
                                                                    "Type": item['_type'],
                                                                    # "Keywords": item['topics'],
                                                                    "Published Date":
                                                                        str(item['_source']['datePublished']).split('T',
                                                                                                                    1)[
                                                                            0],
                                                                    "Abstract": str(
                                                                        item['_source']['description']).replace('\n',
                                                                                                                '')
                                                                    }]}}
                                count += 1
                                # append dict object data
                                data.append(resp_obj)
                            except Exception as e:  # raise e
                                pass
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                    else:
                        pass

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        print('Searching in CORE...')
        print('Date parameter is not supported by this CORE API!')
        return data
Example #7
0
def search_pubMed(query, headers, _pages, _title, _keyword, _abstract,
                  _from_yr, _to_yr_, logging_flag, data):
    if _title:
        print('Searching in PubMed...')
        count = 0
        for i in tqdm(range(1)):

            for i in range(_pages):

                url = 'https://pubmed.ncbi.nlm.nih.gov/?term=%22' + query + '%22' + '&size=10&page=' + str(i + 1)
                # response object
                response = requests.get(url, headers=headers, timeout=30)
                soup = BeautifulSoup(response.content, 'lxml')

                for item in soup.find_all('div', class_='article-page'):
                    try:
                        try:
                            # a few records don't have an abstract, so guard the lookup
                            if bool(
                                    item.find_all(
                                        'div',
                                        class_='abstract')[0].get_text()):
                                abstract = str(
                                    item.find_all('div', class_='abstract')
                                    [0].get_text()).strip().replace('\n', '')
                        except Exception as e:  # raise e
                            abstract = ['No information found']

                        if bool(item.select('.secondary-date')):
                            pub_date = str(
                                item.find_all('span', class_='secondary-date')
                                [0].get_text()).split(';', -1)[0]
                        else:
                            pub_date = ['No information found']

                        resp_obj = {
                            "entities": {
                                "Search Engine":
                                "PubMed Engine",
                                "Attributes found":
                                "DOI,Title, URLs, Authors,Type, Published Date, Abstract",
                                "items": [{
                                    "DOI":
                                    str(
                                        item.find_all('span',
                                                      class_='citation-doi')
                                        [0].get_text()).replace('\n', ''),
                                    "Title":
                                    str(
                                        item.find_all('h1',
                                                      class_='heading-title')
                                        [0].get_text()).strip(),
                                    "URLs":
                                    'https://pubmed.ncbi.nlm.nih.gov' +
                                    item.find_all('a',
                                                  class_='id-link')[0]['href'],
                                    "Authors":
                                    str(
                                        item.find_all(
                                            'span', class_='authors-list-item')
                                        [0].get_text()).strip().replace(
                                            '\n', ''),
                                    "Publication Name":
                                    str(['No information found']),
                                    "ISSN":
                                    str(['No information found']),
                                    "Cited count":
                                    str(['No information found']),
                                    "Affiliation":
                                    str(['No information found']),
                                    "Type":
                                    str(['article']),
                                    "Published date":
                                    pub_date,
                                    "Abstract":
                                    abstract
                                }]
                            }
                        }
                        count += 1
                        data.append(resp_obj)

                    except Exception as e:  # raise e
                        pass
                        exception_type, exception_object, exception_traceback = sys.exc_info(
                        )
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag,
                                          filename, line_number)

        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        # Enable if you want to get MESH terms of articles
        # print(f'Now fetching Mesh Terms for {count} records returned.')
        # getMeshTerms.getMeshIDs(data,_email)
        # print(f'MeshTerms File saved for {count} records in text format.')
        return data

    if _keyword or _abstract:
        print('Searching in PubMed...')
        count = 0
        authr_list = []

        if _from_yr:
            for i in tqdm(range(1)):
                for i in range(_pages):
                    i += 1
                    url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + query + '&filter=years.' + _from_yr + '-' + _to_yr_ + '&format=abstract&size=10&page=' + str(
                        i)
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')

                    for item in soup.select('div.search-results-chunks'):
                        try:

                            try:
                                doi = str(
                                    item.find_all('span',
                                                  class_='citation-doi')
                                    [0].get_text()).split('doi', 1)[1].replace(
                                        '\n', '')
                            except Exception as e:  # raise e

                                doi = ['No information found']

                            try:

                                if bool(item.find_all('span', class_='cit')):
                                    pub_date = str(
                                        item.find_all(
                                            'span',
                                            class_='cit')[0].get_text()).split(
                                                ';', 1)[0].replace('\n', '')
                                else:
                                    pub_date = \
                                    str(item.find_all('span', class_='secondary-date')[0].get_text()).split('b', 1)[
                                        1].replace('\n', '')

                            except Exception as e:  # raise e

                                pub_date = ['No information found']

                            if bool(item.select('.copyright')):
                                pub_name = str(
                                    item.find_all('p', class_='copyright')
                                    [0].get_text()).strip()
                            else:
                                pub_name = ['No information found']

                            authr_list = []  # reset per record so author names don't accumulate
                            if bool(item.select('.authors-list')):
                                for i in range(
                                        len(
                                            item.find('div',
                                                      class_='authors-list').
                                            find_all('a'))):
                                    authr_list.append(
                                        str(
                                            item.find('div',
                                                      class_='authors-list'
                                                      ).find_all('a')
                                            [i].get_text()).strip())
                            else:
                                authr_list = ['No information found']

                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "PubMed Engine",
                                    "Attributes found":
                                    "DOI,Title, URLs, Authors,Type, Published Date,Publication Name,Affiliation, Abstract",
                                    "items": [{
                                        "DOI":
                                        doi,
                                        "Title":
                                        str(
                                            item.find_all(
                                                'h1', class_='heading-title')
                                            [0].get_text()).strip(),
                                        "URLs":
                                        'https://pubmed.ncbi.nlm.nih.gov' +
                                        item.find('h1', class_="heading-title"
                                                  ).find_all("a")[0]['href'],
                                        "Authors":
                                        authr_list,
                                        "Publication Name":
                                        pub_name,
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        str(['No information found']),
                                        "Affiliation":
                                        str(
                                            item.select(
                                                'li[data-affiliation-id]')
                                            [0].get_text()),
                                        "Type":
                                        str(['article']),
                                        "Published date":
                                        pub_date,
                                        "Abstract":
                                        str(
                                            item.find_all(
                                                'div',
                                                class_='abstract-content')
                                            [0].get_text()).strip()
                                    }]
                                }
                            }

                            if (len(data) != 0):
                                if not (checkItem(
                                        data, resp_obj['entities']['items'][0]
                                    ['Title'])):
                                    count += 1
                                    data.append(resp_obj)

                            else:
                                count += 1
                                data.append(resp_obj)

                        except Exception as e:  # raise e
                            pass
                            exception_type, exception_object, exception_traceback = sys.exc_info(
                            )
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            # Enable if you want to get MESH terms of articles
            # print(f'Now fetching Mesh Terms for {count} records returned.')
            # getMeshTerms.getMeshIDs(data,_email)
            # print(f'MeshTerms File saved for {count} records in text format.')
            return data
        else:
            for i in tqdm(range(1)):
                for i in range(_pages):
                    i += 1
                    url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + query + '&format=abstract&size=10&page=' + str(
                        i)
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')

                    for item in soup.select('div.search-results'):
                        try:

                            if bool(item.select('.secondary-date')):
                                pub_date = \
                                    str(item.find_all('span', class_='secondary-date')[0].get_text()).split('b', 1)[
                                        1].replace('\n', '')
                            else:
                                pub_date = ['No information found']

                            if bool(item.select('.copyright')):
                                pub_name = str(
                                    item.find_all('p', class_='copyright')
                                    [0].get_text()).strip()
                            else:
                                pub_name = ['No information found']

                            authr_list = []  # reset per record so author names don't accumulate
                            if bool(item.select('.authors-list')):
                                for i in range(
                                        len(
                                            item.find('div',
                                                      class_='authors-list').
                                            find_all('a'))):
                                    authr_list.append(
                                        str(
                                            item.find('div',
                                                      class_='authors-list'
                                                      ).find_all('a')
                                            [i].get_text()).strip())
                            else:
                                authr_list = ['No information found']

                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "PubMed Engine",
                                    "Attributes found":
                                    "DOI,Title, URLs, Authors,Type, Published Date,Publication Name,Affiliation, Abstract",
                                    "items": [{
                                        "DOI":
                                        str(
                                            item.find_all(
                                                'span', class_='citation-doi')
                                            [0].get_text()).split(
                                                'doi', 1)[1].replace('\n', ''),
                                        "Title":
                                        str(
                                            item.find_all(
                                                'h1', class_='heading-title')
                                            [0].get_text()).strip(),
                                        "URLs":
                                        'https://pubmed.ncbi.nlm.nih.gov' +
                                        item.find('h1', class_="heading-title"
                                                  ).find_all("a")[0]['href'],
                                        "Authors":
                                        authr_list,
                                        "Publication Name":
                                        pub_name,
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        str(['No information found']),
                                        "Affiliation":
                                        str(
                                            item.select(
                                                'li[data-affiliation-id]')
                                            [0].get_text()),
                                        "Type":
                                        str(['article']),
                                        "Published date":
                                        pub_date,
                                        "Abstract":
                                        str(
                                            item.find_all(
                                                'div',
                                                class_='abstract-content')
                                            [0].get_text()).strip()
                                    }]
                                }
                            }
                            count += 1

                            data.append(resp_obj)
                            # print(data.items())

                        except Exception as e:  # raise e
                            pass
                            exception_type, exception_object, exception_traceback = sys.exc_info(
                            )
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')

            # Enable if you want to get MESH terms of articles
            # print(f'Now fetching Mesh Terms for {count} records returned.')
            # getMeshTerms.getMeshIDs(data,_email)
            # print(f'MeshTerms File saved for {count} records in text format.')
            return data
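
The year-filtered branch above deduplicates through a checkItem helper that is not part of these examples. A plausible sketch, assuming it simply reports whether a title is already present in the collected data list:

def checkItem(data, title):
    # hypothetical reconstruction: scan previously appended records
    # for an item whose Title matches the candidate record's title
    for record in data:
        for item in record['entities']['items']:
            if item.get('Title') == title:
                return True
    return False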
Example #8
0
def search_acmlibrary(query, headers, _acm_pages, records, _title, _keyword,
                      _abstract, _from_yr, _to_yr_, logging_flag, data):
    query = processInputQuery(query)
    if _title:

        url = 'https://dl.acm.org/action/doSearch?AllField=%22' + query + '%22'

        # response object
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        # obj = json.loads(soup.text)

        print('Searching in ACM Library...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object
            for item in soup.select('li.search__item.issue-item-container'):
                try:
                    resp_obj = {
                        "entities": {
                            "Search Engine":
                            "ACM Library Search Engine",
                            "Attributes found":
                            "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract",
                            "items": [{
                                "DOI":
                                item.find("span",
                                          class_='hlFld-Title').find_all(
                                              'a')[0]['href'],
                                "Title":
                                item.find_all("h5", class_='issue-item__title')
                                [0].get_text().strip(),
                                "URLs":
                                item.find_all(
                                    "a", class_='issue-item__doi')[0]['href'],
                                "Authors":
                                item.find_all("ul", class_='truncate-list')
                                [0].get_text().strip().replace('\n', ''),
                                "Publication Name":
                                str(['No information found']),
                                "ISSN":
                                str(['No information found']),
                                "Cited count":
                                item.find("span", class_='citation').find_all(
                                    'span')[0].get_text(),
                                "Affiliation":
                                str(['No information found']),
                                "Type":
                                item.find_all(
                                    "div",
                                    class_='issue-heading')[0].get_text(),
                                "Published date":
                                item.find("span",
                                          class_='dot-separator').find_all(
                                              'span')[0].get_text(),
                                "Abstract":
                                str(
                                    item.find_all(
                                        "div", class_='issue-item__abstract')
                                    [0].get_text()).strip().replace(
                                        '\n', '').replace('  ', '')
                            }]
                        }
                    }
                    count += 1
                    # append dict object data
                    data.append(resp_obj)
                except Exception as e:  # raise e
                    pass
                    exception_type, exception_object, exception_traceback = sys.exc_info(
                    )
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename,
                                      line_number)
        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if _keyword or _abstract:

        print('Searching in ACM Library...')
        if (_acm_pages != 0):
            pages = pagination(_acm_pages)
        else:
            pages = 1

        if len(_from_yr) != 0:
            count = 0
            for i in tqdm(range(1)):

                url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&AfterYear=' + _from_yr + '&BeforeYear=' + _to_yr_ + '&pageSize=20&startPage=' + str(
                    i)

                # response object
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')

                # count no of records returned by engine
                for item in soup.find_all('span', class_='hitsLength'):
                    rec = str(
                        soup.find_all(
                            'span',
                            class_='hitsLength')[0].get_text()).replace(
                                ',', '').replace(" ", "")

                    pages = 1
                    if (_acm_pages != 0):
                        pages = pagination(_acm_pages)
                    else:
                        pages = pagination(int(rec))

                if int(pages) > 100:
                    print(
                        'NOTE: The ACM Library returns at most 2,000 records regardless of the total hit count. Total records found:',
                        rec, '\nFetching record details now...')

                    pages = 100  # 20 records per page; 100 pages covers the 2,000-record cap
                    for i in range(pages):
                        url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&AfterYear=' + _from_yr + '&BeforeYear=' + _to_yr_ + '&pageSize=20&startPage=' + str(
                            i)

                        response = requests.get(url, headers=headers)
                        soup = BeautifulSoup(response.content, 'lxml')
                        # Find required attributes in the response object
                        for item in soup.select(
                                'li.search__item.issue-item-container'):
                            try:
                                resp_obj = {
                                    "entities": {
                                        "Search Engine":
                                        "ACM Library Search Engine",
                                        "Attributes found":
                                        "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract",
                                        "items": [{
                                            "DOI":
                                            item.find(
                                                "span",
                                                class_='hlFld-Title').find_all(
                                                    'a')[0]['href'],
                                            "Title":
                                            item.find_all(
                                                "h5",
                                                class_='issue-item__title')
                                            [0].get_text().strip(),
                                            "URLs":
                                            item.find_all(
                                                "a", class_='issue-item__doi')
                                            [0]['href'],
                                            "Authors":
                                            item.find_all(
                                                "ul", class_='truncate-list')
                                            [0].get_text().strip().replace(
                                                '\n', ''),
                                            "Publication Name":
                                            str(['No information found']),
                                            "ISSN":
                                            str(['No information found']),
                                            "Cited count":
                                            item.find(
                                                "span",
                                                class_='citation').find_all(
                                                    'span')[0].get_text(),
                                            "Affiliation":
                                            str(['No information found']),
                                            "Type":
                                            item.find_all(
                                                "div", class_='issue-heading')
                                            [0].get_text(),
                                            "Published date":
                                            item.find("span",
                                                      class_='dot-separator').
                                            find_all('span')[0].get_text(),
                                            "Abstract":
                                            str(
                                                item.find_all(
                                                    "div",
                                                    class_=
                                                    'issue-item__abstract')[0].
                                                get_text()).strip().replace(
                                                    '\n',
                                                    '').replace('  ', '')
                                        }]
                                    }
                                }
                                count += 1
                                # append dict object data
                                data.append(resp_obj)
                            except Exception as e:  # raise e
                                pass
                                exception_type, exception_object, exception_traceback = sys.exc_info(
                                )
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine,
                                                  logging_flag, filename,
                                                  line_number)
            else:
                for i in range(pages):
                    url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&AfterYear=' + _from_yr + '&BeforeYear=' + _to_yr_ + '&pageSize=20&startPage=' + str(
                        i)

                    # response object for this results page
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'lxml')

                    # Find required attributes in the response object
                    for item in soup.select('li.search__item.issue-item-container'):
                        try:
                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "ACM Library Search Engine",
                                    "Attributes found":
                                    "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract",
                                    "items": [{
                                        "DOI":
                                        item.find(
                                            "span",
                                            class_='hlFld-Title').find_all(
                                                'a')[0]['href'],
                                        "Title":
                                        item.find_all(
                                            "h5", class_='issue-item__title')
                                        [0].get_text().strip(),
                                        "URLs":
                                        item.find_all("a",
                                                      class_='issue-item__doi')
                                        [0]['href'],
                                        "Authors":
                                        item.find_all("ul",
                                                      class_='truncate-list')
                                        [0].get_text().strip().replace(
                                            '\n', ''),
                                        "Publication Name":
                                        str(['No information found']),
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        item.find("span",
                                                  class_='citation').find_all(
                                                      'span')[0].get_text(),
                                        "Affiliation":
                                        str(['No information found']),
                                        "Type":
                                        item.find_all("div",
                                                      class_='issue-heading')
                                        [0].get_text(),
                                        "Published date":
                                        item.find(
                                            "span",
                                            class_='dot-separator').find_all(
                                                'span')[0].get_text(),
                                        "Abstract":
                                        str(
                                            item.find_all(
                                                "div",
                                                class_='issue-item__abstract')
                                            [0].get_text()).strip().replace(
                                                '\n', '').replace('  ', '')
                                    }]
                                }
                            }
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:  # raise e
                            pass
                            exception_type, exception_object, exception_traceback = sys.exc_info(
                            )
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data

        else:
            count = 0
            for i in tqdm(range(1)):

                for i in range(pages):

                    url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&pageSize=20&startPage=' + str(
                        i)
                    # response object
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'lxml')
                    # obj = json.loads(soup.text)

                    # set the counter for records count

                    # Find required attributes in the response object
                    for item in soup.select(
                            'li.search__item.issue-item-container'):
                        try:
                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "ACM Library Search Engine",
                                    "Attributes found":
                                    "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract",
                                    "items": [{
                                        "DOI":
                                        item.find(
                                            "span",
                                            class_='hlFld-Title').find_all(
                                                'a')[0]['href'],
                                        "Title":
                                        item.find_all(
                                            "h5", class_='issue-item__title')
                                        [0].get_text().strip(),
                                        "URLs":
                                        item.find_all("a",
                                                      class_='issue-item__doi')
                                        [0]['href'],
                                        "Authors":
                                        item.find_all("ul",
                                                      class_='truncate-list')
                                        [0].get_text().strip().replace(
                                            '\n', ''),
                                        "Publication Name":
                                        str(['No information found']),
                                        "ISSN":
                                        str(['No information found']),
                                        "Cited count":
                                        item.find("span",
                                                  class_='citation').find_all(
                                                      'span')[0].get_text(),
                                        "Affiliation":
                                        str(['No information found']),
                                        "Type":
                                        item.find_all("div",
                                                      class_='issue-heading')
                                        [0].get_text(),
                                        "Published date":
                                        item.find(
                                            "span",
                                            class_='dot-separator').find_all(
                                                'span')[0].get_text(),
                                        "Abstract":
                                        str(
                                            item.find_all(
                                                "div",
                                                class_='issue-item__abstract')
                                            [0].get_text()).strip().replace(
                                                '\n', '').replace('  ', '')
                                    }]
                                }
                            }
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:  # raise e
                            pass
                            exception_type, exception_object, exception_traceback = sys.exc_info(
                            )
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
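
search_acmlibrary depends on a pagination helper that is not shown here. Given the pageSize=20 used in every ACM URL above, a minimal sketch under that assumption might be:

import math

def pagination(records, page_size=20):
    # hypothetical helper: how many 20-record result pages are needed
    # to cover the requested number of records (at least one page)
    return max(1, math.ceil(int(records) / page_size))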
Example #9
0
def search_engines(x, query, headers, _pages, _gs_pages, _acm_pages,
                   _els_pages, records, _title, _keyword, _abstract,
                   _search_yr, _from_yr, _to_yr_, logging_flag, data):
    # Search all engines
    try:
        if len(x) != 0:

            # Engines for Title, Keyword and Abstract #
            if 1 in x:
                search_googleScholar(query, headers, _gs_pages, records,
                                     _title, _keyword, _abstract, scrpr_api,
                                     _from_yr, _to_yr_, logging_flag, data)

            if 2 in x:
                search_msAcademic(query, headers, _pages, records, _title,
                                  _keyword, _abstract, ms_api, _from_yr,
                                  _to_yr_, logging_flag, data)

            if 3 in x:
                search_core(query, headers, _pages, records, _title, _keyword,
                            _abstract, core_api, _search_yr, logging_flag,
                            data)

            if 4 in x:
                search_pubMed(query, headers, _pages, _title, _keyword,
                              _abstract, _from_yr, _to_yr_, logging_flag, data)

            if 5 in x:
                search_acmlibrary(query, headers, _acm_pages, records, _title,
                                  _keyword, _abstract, _from_yr, _to_yr_,
                                  logging_flag, data)

            # Engines only for Keyword and Abstract #

            if 6 in x:
                search_PlosOne(query, headers, _pages, records, _title,
                               _keyword, _abstract, _from_yr, _to_yr_,
                               logging_flag, data)

            if 7 in x:
                search_academia(query, headers, _pages, records, _title,
                                _keyword, _abstract, _search_yr, logging_flag,
                                data)

            if 8 in x:
                search_scopus(query, headers, _els_pages, records, _title,
                              _keyword, _abstract, scp_api, _from_yr, _to_yr_,
                              logging_flag, data)

            if 9 in x:
                search_springer(query, headers, _pages, records, _title,
                                _keyword, _abstract, spr_api, _search_yr,
                                logging_flag, data)

            if 10 in x:
                search_sciDirect(query, headers, _pages, records, _title,
                                 _keyword, _abstract, sd1_api, sd2_api,
                                 _from_yr, _to_yr_, logging_flag, data)

        else:
            print('Select a search engine!')
            sys.exit()

    except Exception as e:  # raise e
        # pass
        exception_type, exception_object, exception_traceback = sys.exc_info()
        filename = exception_traceback.tb_frame.f_code.co_filename
        line_number = exception_traceback.tb_lineno
        logger.writeError(e, None, "MS Academic", logging_flag, filename,
                          line_number)

    return data
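
A hedged usage sketch for this dispatcher; the engine code follows the branches above, while the query, headers, page counts, and flags are placeholder values rather than the module's real configuration:

data = []
headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder header
# 4 = PubMed per the branch above; other codes select other engines
results = search_engines([4], 'machine learning', headers, 2, 2, 0, 0, 10,
                         False, True, False, '', '2018', '2021', True, data)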
Example #10
0
def search_msAcademic(query, headers, _pages, records, _title, _keyword,
                      _abstract, ms_api, _from_yr, _to_yr_, logging_flag,
                      data):
    q = str(re.sub('["!,*)@#%(&$_?.^]', '', query.lower()))

    # title search
    if _title:
        url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=Ti=%27' + q + '%27&model=latest' \
                                                                                                   '&count=10&offset' \
                                                                                                   '=0&attributes' \
                                                                                                   '=DOI,Ti,Y,BT,D,W,' \
                                                                                                   'PB,CC,AA.AuN,' \
                                                                                                   'AA.AuId,AA.DAfN,' \
                                                                                                   'AA.AfN,S,' \
                                                                                                   'AW&subscription' \
                                                                                                   '-key=' + ms_api
        # response object
        response = requests.get(url1, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        obj = json.loads(soup.text)

        print('Searching in Microsoft Academic...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object
            for item in obj['entities']:
                try:
                    # extract abstract keywords from the response as it doesn't have a specific abstract attribute
                    if 'AW' in item:
                        abs_str = str(item['AW'])
                        abs_new = abs_str.replace(',', '').replace("'", '')
                    else:
                        abs_new = str(['No information found'])

                    if 'S' in item:
                        urls = item['S'][0]['U']
                    else:
                        urls = str(['No information found'])

                    if 'BT' in item:
                        if item['BT'] == 'a':
                            type = 'Journal/Article'
                        elif item['BT'] == 'b':
                            type = 'Book'
                        elif item['BT'] == 'p':
                            type = 'Conference Paper'
                        else:
                            type = str(['No information found'])
                    else:
                        type = str(['No information found'])
                    if 'DOI' not in item:
                        doi = str(['No information found'])
                    else:
                        doi = item['DOI']
                    if 'PB' not in item:
                        pb = str(['No information found'])
                    else:
                        pb = item['PB']

                    resp_obj = {
                        "entities": {
                            "Search Engine":
                            "Microsoft Academy",
                            "Attributes found":
                            "DOI, Title, URLs, Authors, Publication Name, Cited "
                            "count, Affiliation name, Type, Published date, "
                            "Abstract",
                            "items": [{
                                "DOI": doi,
                                "Title": item['Ti'],
                                "URLs": urls,
                                "Authors": item['AA'][0]['AuN'],
                                "Publication Name": pb,
                                "ISSN": str(['No Information found']),
                                "Cited count": item['CC'],
                                "Affiliation": item['AA'][0]['DAfN'],
                                "Type": type,
                                "Published date": item['D'],
                                "Abstract": abs_new
                            }]
                        }
                    }
                    count += 1
                    # append dict object data
                    data.append(resp_obj)
                except Exception as e:  # raise e
                    pass
                    exception_type, exception_object, exception_traceback = sys.exc_info(
                    )
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename,
                                      line_number)

        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if (not _from_yr):
        # keyword search
        if _keyword or _abstract:
            print('Searching in Microsoft Academic...')
            count = 0
            for i in tqdm(range(1)):
                url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=Composite(F.FN=%27' + q + '%27)&model=latest&count=' + str(
                    records) + '&offset=0&attributes=DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,AA.AuId,AA.DAfN,AA.AfN,S,' \
                               'AW&subscription-key=' + ms_api

                # response object
                response = requests.get(url1, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)
                # set the counter for records count

                # Find required attributes in the response object
                for item in obj['entities']:
                    try:
                        # extract abstract keywords from the response as it doesn't have a specific abstract attribute
                        if 'AW' in item:
                            abs_str = str(item['AW'])
                            abs_new = abs_str.replace(',', '').replace("'", '')
                        else:
                            abs_new = str(['No information found'])

                        if 'S' in item:
                            urls = item['S'][0]['U']
                        else:
                            urls = str(['No information found'])

                        if 'BT' in item:
                            if item['BT'] == 'a':
                                type = 'Journal/Article'
                            elif item['BT'] == 'b':
                                type = 'Book'
                            elif item['BT'] == 'p':
                                type = 'Conference Paper'
                            else:
                                type = str(['No information found'])
                        else:
                            type = str(['No information found'])
                        if 'DOI' not in item:
                            doi = str(['No information found'])
                        else:
                            doi = item['DOI']
                        if 'PB' not in item:
                            pb = str(['No information found'])
                        else:
                            pb = item['PB']

                        resp_obj = {
                            "entities": {
                                "Search Engine":
                                "Microsoft Academy",
                                "Attributes found":
                                "DOI, Title, URLs, Authors, Publication Name, Cited count, Affiliation name, Type, Published date, Abstract",
                                "items": [{
                                    "DOI": doi,
                                    "Title": item['Ti'],
                                    "URLs": urls,
                                    "Authors": item['AA'][0]['AuN'],
                                    "Publication Name": pb,
                                    "ISSN": str(['No Information found']),
                                    "Cited count": item['CC'],
                                    "Affiliation": item['AA'][0]['DAfN'],
                                    "Type": type,
                                    "Published date": item['D'],
                                    "Abstract": abs_new
                                }]
                            }
                        }
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag,
                                          filename, line_number)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        if _keyword or _abstract:

            print('Searching in Microsoft Academic...')
            count = 0
            for i in tqdm(range(1)):
                url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=And(Y=' + '[' + _from_yr + ',' + _to_yr_ + ']' + ',Composite(F.FN==%27' + q + '%27))' + '&model=latest&count=' + str(
                    records) + '&offset=0&attributes=DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,AA.AuId,AA.DAfN,AA.AfN,S,' \
                               'AW&subscription-key=' + ms_api
                # response object
                response = requests.get(url1, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)

                # Find required attributes in the response object
                for item in obj['entities']:
                    try:
                        # extract abstract keywords from the response, as it doesn't have a specific abstract attribute
                        if 'AW' in item:
                            abs_str = str(item['AW'])
                            abs_new = abs_str.replace(',', '').replace("'", '')
                        else:
                            abs_new = str(['No information found'])

                        if 'S' in item:
                            urls = item['S'][0]['U']
                        else:
                            urls = str(['No information found'])

                        if 'BT' in item:
                            if item['BT'] == 'a':
                                pub_type = 'Journal/Article'
                            elif item['BT'] == 'b':
                                pub_type = 'Book'
                            elif item['BT'] == 'p':
                                pub_type = 'Conference Paper'
                            else:
                                pub_type = str(['No information found'])
                        else:
                            pub_type = str(['No information found'])

                        # DOI and PB are attributes of the item, not of the top-level response
                        if 'DOI' not in item:
                            doi = str(['No information found'])
                        else:
                            doi = item['DOI']
                        if 'PB' not in item:
                            pb = str(['No information found'])
                        else:
                            pb = item['PB']

                        resp_obj = {
                            "entities": {
                                "Search Engine":
                                "Microsoft Academy",
                                "Attributes found":
                                "DOI, Title, URLs, Authors, Publication Name, "
                                "Cited count, Affiliation name, Type, "
                                "Published date, Abstract",
                                "items": [{
                                    "DOI": doi,
                                    "Title": item['Ti'],
                                    "URLs": urls,
                                    "Authors": item['AA'][0]['AuN'],
                                    "Publication Name": pb,
                                    "ISSN": str(['No Information found']),
                                    "Cited count": item['CC'],
                                    "Affiliation": item['AA'][0]['DAfN'],
                                    "Type": type,
                                    "Published date": item['D'],
                                    "Abstract": abs_new
                                }]
                            }
                        }
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag,
                                          filename, line_number)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
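
A note on the two URL builds above: hand-concatenating the evaluate query makes the %27 quoting and the comma-separated attribute list easy to get wrong. Below is a minimal sketch of the same request assembled with urllib.parse.urlencode; field_of_study stands in for q, and the endpoint and key are the ones already used above. (Microsoft Academic itself was retired at the end of 2021, so this is illustrative only.)

from urllib.parse import urlencode

def build_mag_evaluate_url(field_of_study, records, ms_api):
    # urlencode percent-escapes the quotes, brackets and commas for us
    params = {
        'expr': "Composite(F.FN=='" + field_of_study + "')",
        'model': 'latest',
        'count': records,
        'offset': 0,
        'attributes': 'DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,AA.AuId,AA.DAfN,AA.AfN,S,AW',
        'subscription-key': ms_api,
    }
    return ('https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?'
            + urlencode(params))
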
Example #11
0
def search_springer(query, headers, _pages, records, _title, _keyword, _abstract, spr_api, _search_yr, logging_flag,
                    data):
    print('Searching in Springer...')

    if not _search_yr:
        count = 0
        for i in tqdm(range(1)):

            for page in range(_pages):

                url = 'http://api.springernature.com/meta/v2/json?q=' + query + '&s=' + str(
                    page) + '&p=10&api_key=' + spr_api

                # response object
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)

                # Find required attributes in the response object
                for item in obj['records']:

                    # prefer the ISSN and fall back to the ISBN; both are fields of the item
                    if 'issn' in item:
                        issn = item['issn']
                    elif 'isbn' in item:
                        issn = item['isbn']
                    else:
                        issn = str(['No Information found'])

                    try:
                        resp_obj = {"entities": {"Search Engine": "Springer Search Engine",
                                                 "Attributes found": "DOI, Title, URLs, Authors, Publication "
                                                                     "Name, ISSN, Type, Published date, Abstract",
                                                 "items": [
                                                     {"DOI": item['identifier'],
                                                      "Title": item['title'],
                                                      "URLs": item['url'][0]['value'],
                                                      "Authors": item['creators'][0]['creator'],
                                                      "Publication Name": item['publicationName'],
                                                      "ISSN": issn,
                                                      "Cited count": str(['No Information found']),
                                                      "Affiliation": str(['No information found']),
                                                      "Type": item['contentType'],
                                                      "Published date": item['onlineDate'],
                                                      "Abstract": item['abstract']
                                                      }
                                                 ]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)

        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data
    else:
        print("Date parameter either not supported or not available in Springer API!")
        return
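
One detail worth checking in the loop above: by the Springer meta API's documentation, s is the 1-based index of the first record to return and p is the page size, so passing s=0,1,2,... fetches overlapping windows rather than consecutive pages. A small sketch of the offset arithmetic under that assumption:

def springer_offsets(pages, page_size=10):
    # s values for consecutive pages of size 10: 1, 11, 21, ...
    return [page * page_size + 1 for page in range(pages)]

assert springer_offsets(3) == [1, 11, 21]
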
Example #12
0
def search_scopus(query, headers, _els_pages, records, _title, _keyword,
                  _abstract, scp_api, _from_yr, _to_yr_, logging_flag, data):
    query = processInputQuery(query)
    if _title:
        url = 'https://api.elsevier.com/content/search/scopus?query=%22' + query + '%22&apiKey=' + scp_api

        # response object
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        # convert response into JSON
        obj = json.loads(soup.text)

        print('Searching in Elsevier Scopus...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):

            # Find required attributes in the response object
            for item in obj['search-results']['entry']:
                try:
                    if "prism:Issn" and "prism:issn" not in obj:
                        issn = item['prism:eIssn']
                    else:
                        issn = item['prism:issn']

                    resp_obj = {
                        "entities": {
                            "Search Engine":
                            "Elsevier SCOPUS Search Engine",
                            "Attributes found":
                            "DOI, Title, URLs, Authors, Publication Name, ISSN, "
                            "Cited count, Affiliation name, Type, "
                            "Published date, Abstract",
                            "items": [{
                                "DOI":
                                item['prism:doi'],
                                "Title":
                                item['dc:title'],
                                "URLs":
                                item['prism:url'],
                                "Authors":
                                item['dc:creator'],
                                "Publication Name":
                                item['prism:publicationName'],
                                "ISSN":
                                issn,
                                "Cited count":
                                item['citedby-count'],
                                "Affiliation":
                                item['affiliation'][0]['affilname'],
                                "Type":
                                item['subtypeDescription'],
                                "Published date":
                                item['prism:coverDate'],
                                "Abstract":
                                item['prism:publicationName']
                            }]
                        }
                    }
                    count += 1
                    # append dict object data
                    data.append(resp_obj)
                except Exception as e:
                    exception_type, exception_object, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename,
                                      line_number)
        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data
    if not _from_yr:
        if _keyword or _abstract:
            rec = 0
            if _els_pages != 0:
                pages = pagination(_els_pages)
            else:
                pages = 1
            print('Searching in Elsevier Scopus...')
            count = 0
            for i in tqdm(range(1)):

                for page in range(pages):

                    url = 'https://api.elsevier.com/content/search/scopus?query=' + query + '&apiKey=' + scp_api + '&start=' + str(
                        page) + '&count=10'

                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')

                    # convert response into json
                    obj = json.loads(soup.text)

                    # Find required attributes in the response object
                    for item in obj['search-results']['entry']:
                        try:
                            if "prism:eIssn" in item:
                                issn = item['prism:eIssn']
                            elif "prism:Issn" or "prism:issn" in item:
                                issn = item['prism:issn']
                            else:
                                issn = str(['No information found'])

                            resp_obj = {
                                "entities": {
                                    "Search Engine":
                                    "Elsevier SCOPUS Search Engine",
                                    "Attributes found":
                                    "DOI, Title, URLs, Authors, Publication "
                                    "Name, ISSN, Cited count, Affiliation name, "
                                    "Type, Published date, Abstract",
                                    "items": [{
                                        "DOI":
                                        item['prism:doi'],
                                        "Title":
                                        item['dc:title'],
                                        "URLs":
                                        item['prism:url'],
                                        "Authors":
                                        item['dc:creator'],
                                        "Publication Name":
                                        item['prism:publicationName'],
                                        "ISSN":
                                        issn,
                                        "Cited count":
                                        item['citedby-count'],
                                        "Affiliation":
                                        item['affiliation'][0]['affilname'],
                                        "Type":
                                        item['subtypeDescription'],
                                        "Published date":
                                        item['prism:coverDate'],
                                        "Abstract":
                                        item['prism:publicationName']
                                    }]
                                }
                            }
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag,
                                              filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count,
                                logging_flag)
            print(f'Finished with total {count} records returned.')
            return data

    else:
        if _keyword or _abstract:
            print('Searching in Elsevier Scopus...')
            count = 0
            for i in tqdm(range(1)):

                url = 'https://api.elsevier.com/content/search/scopus?query=' + query + '&apiKey=' + scp_api + '&date=' + _from_yr + '-' + _to_yr_ + '&start=' + str(
                    i) + '&count=10'
                # response object
                response = requests.get(url, headers=headers, timeout=30)
                soup = BeautifulSoup(response.content, 'lxml')

                # convert response into json
                obj = json.loads(soup.text)
                rec = obj['search-results']['opensearch:totalResults']
                if _els_pages != 0:
                    pages = pagination(_els_pages)
                else:
                    pages = pagination(rec)

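                # assumption: this cap guards against very large result sets,
                # since the Scopus search API limits how deep pagination can go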
                if int(pages) > 1000:
                    pages = 100
                    for page in range(pages):

                        url = 'https://api.elsevier.com/content/search/scopus?query=' + query + '&apiKey=' + scp_api + '&date=' + _from_yr + '-' + _to_yr_ + '&start=' + str(
                            page) + '&count=10'

                        # response object
                        response = requests.get(url,
                                                headers=headers,
                                                timeout=30)
                        soup = BeautifulSoup(response.content, 'lxml')

                        # convert response into json
                        obj = json.loads(soup.text)

                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                if "prism:eIssn" in item:
                                    issn = item['prism:eIssn']
                                elif "prism:Issn" or "prism:issn" in item:
                                    issn = item['prism:issn']
                                else:
                                    issn = str(['No information found'])

                                resp_obj = {
                                    "entities": {
                                        "Search Engine":
                                        "Elsevier SCOPUS Search Engine",
                                        "Attributes found":
                                        "DOI, Title, URLs, Authors, Publication "
                                        "Name, ISSN, Cited count, Affiliation "
                                        "name, Type, Published date, Abstract",
                                        "items": [{
                                            "DOI":
                                            item['prism:doi'],
                                            "Title":
                                            item['dc:title'],
                                            "URLs":
                                            item['prism:url'],
                                            "Authors":
                                            item['dc:creator'],
                                            "Publication Name":
                                            item['prism:publicationName'],
                                            "ISSN":
                                            issn,
                                            "Cited count":
                                            item['citedby-count'],
                                            "Affiliation":
                                            item['affiliation'][0]
                                            ['affilname'],
                                            "Type":
                                            item['subtypeDescription'],
                                            "Published date":
                                            item['prism:coverDate'],
                                            "Abstract":
                                            item['prism:publicationName']
                                        }]
                                    }
                                }
                                count += 1
                                # append dict object data
                                data.append(resp_obj)
                            except Exception as e:
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine,
                                                  logging_flag, filename,
                                                  line_number)
                else:

                    for page in range(pages):

                        url = 'https://api.elsevier.com/content/search/scopus?query=' + query + '&apiKey=' + scp_api + '&date=' + _from_yr + '-' + _to_yr_ + '&start=' + str(
                            page) + '&count=10'

                        # response object
                        response = requests.get(url,
                                                headers=headers,
                                                timeout=30)
                        soup = BeautifulSoup(response.content, 'lxml')

                        # convert response into json
                        obj = json.loads(soup.text)

                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                if "prism:eIssn" in item:
                                    issn = item['prism:eIssn']
                                elif "prism:Issn" or "prism:issn" in item:
                                    issn = item['prism:issn']
                                else:
                                    issn = str(['No information found'])

                                resp_obj = {
                                    "entities": {
                                        "Search Engine":
                                        "Elsevier SCOPUS Search Engine",
                                        "Attributes found":
                                        "DOI, Title, URLs, Authors, Publication Name, ISSN, Cited count, Affiliation name, Type, Published date, Abstract",
                                        "items": [{
                                            "DOI":
                                            item['prism:doi'],
                                            "Title":
                                            item['dc:title'],
                                            "URLs":
                                            item['prism:url'],
                                            "Authors":
                                            item['dc:creator'],
                                            "Publication Name":
                                            item['prism:publicationName'],
                                            "ISSN":
                                            issn,
                                            "Cited count":
                                            item['citedby-count'],
                                            "Affiliation":
                                            item['affiliation'][0]
                                            ['affilname'],
                                            "Type":
                                            item['subtypeDescription'],
                                            "Published date":
                                            item['prism:coverDate'],
                                            "Abstract":
                                            item['prism:publicationName']
                                        }]
                                    }
                                }
                                count += 1
                                # append dict object data
                                data.append(resp_obj)
                            except Exception as e:
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine,
                                                  logging_flag, filename,
                                                  line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, rec, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
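
A pitfall worth flagging for membership checks like the ISSN fallbacks above: an expression such as "prism:Issn" or "prism:issn" in item parses as "prism:Issn" or ("prism:issn" in item), and a non-empty string literal is always truthy, so the first branch always wins. A self-contained illustration:

item = {'prism:eIssn': '1234-5678'}

# parsed as: "prism:Issn" or ("prism:issn" in item)  ->  always truthy
broken = bool("prism:Issn" or "prism:issn" in item)

# each membership test has to be written out explicitly
fixed = 'prism:Issn' in item or 'prism:issn' in item

print(broken, fixed)  # True False
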
Example #13
0
def search_PlosOne(query, headers, _pages, records, _title, _keyword, _abstract, _from_yr, _to_yr_, logging_flag, data):
    if _title:
        print('Searching in PLOS ONE...')
        count = 0

        url = 'http://api.plos.org/search?q=title:' + query + '&start=1&rows=' + str(records)
        # response object
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        obj = json.loads(soup.text)

        try:
            for i in tqdm(range(1)):

                # Find required attributes in the response object
                for item in obj['response']['docs']:
                    try:

                        resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                 "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, "
                                                                     "Published date, Abstract",
                                                 "items": [
                                                     {"DOI": item['id'],
                                                      "Title": item['title_display'],
                                                      "URLs": 'https://doi.org/' + item['id'],
                                                      "Authors": item['author_display'],
                                                      "Publication Name": str(['No information found']),
                                                      # "Publication Name": item['publisher'],
                                                      "ISSN": item['eissn'],
                                                      "Cited count": str(['No information found']),
                                                      "Affiliation": str(['No information found ']),
                                                      "Type": item['article_type'],
                                                      "Published date": str(item['publication_date']).split('T', -1)[0],
                                                      "Abstract": str(item['abstract']).strip().replace('\n',
                                                                                                        '').replace(
                                                          '  ', '')
                                                      }
                                                 ]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)

            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
        except Exception as e:
            time.sleep(1)
            print('Some error happened in the PLOS engine!')

    if not _from_yr:
        if _keyword or _abstract:

            print('Searching in PLOS ONE...')
            _rec = round(float(records))
            count = 0
            try:
                for i in tqdm(range(1)):

                    url = 'http://api.plos.org/search?q=' + query + '&start=1&rows=' + str(_rec)
                    # response object
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    obj = json.loads(soup.text)

                    for item in obj['response']['docs']:
                        try:

                            resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                     "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, Published date, Abstract",
                                                     "items": [
                                                         {"DOI": item['id'],
                                                          "Title": item['title_display'],
                                                          "URLs": 'https://doi.org/' + item['id'],
                                                          "Authors": item['author_display'],
                                                          "Publication Name": str(['No information found']),
                                                          # "Publication Name": item['publisher'],
                                                          "ISSN": item['eissn'],
                                                          "Cited count": str(['No information found']),
                                                          "Affiliation": str(['No information found ']),
                                                          "Type": item['article_type'],
                                                          "Published date":
                                                              str(item['publication_date']).split('T', -1)[0],
                                                          "Abstract": str(item['abstract']).strip().replace('\n',
                                                                                                            '').replace(
                                                              '  ', '')
                                                          }
                                                     ]}}
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)

                time.sleep(1)
                logger.writeRecords(query, None, _engine, count, count, logging_flag)
                print(f'Finished with total {count} records returned.')
                return data

            except Exception as e:
                exception_type, exception_object, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
    else:
        if _keyword or _abstract:
            print('Searching in PLOS ONE...')
            _rec = round(float(records))
            count = 0
            try:
                for i in tqdm(range(1)):

                    url = 'http://api.plos.org/search?q=' + query + ' AND publication_date:[' + _from_yr + '-01-01T00:00:00Z TO ' + _to_yr_ + '-12-31T23:59:59Z]' + '&start=1&rows=' + str(
                        _rec)
                    # response object
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    obj = json.loads(soup.text)
                    for item in obj['response']['docs']:
                        try:

                            resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                     "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, Published date, Abstract",
                                                     "items": [
                                                         {"DOI": item['id'],
                                                          "Title": item['title_display'],
                                                          "URLs": 'https://doi.org/' + item['id'],
                                                          "Authors": item['author_display'],
                                                          "Publication Name": str(['No information found']),
                                                          # "Publication Name": item['publisher'],
                                                          "ISSN": item['eissn'],
                                                          "Cited count": str(['No information found']),
                                                          "Affiliation": str(['No information found ']),
                                                          "Type": item['article_type'],
                                                          "Published date":
                                                              str(item['publication_date']).split('T', -1)[0],
                                                          "Abstract": str(item['abstract']).strip().replace('\n',
                                                                                                            '').replace(
                                                              '  ', '')
                                                          }
                                                     ]}}
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)

                time.sleep(1)
                logger.writeRecords(query, None, _engine, count, count, logging_flag)
                print(f'Finished with total {count} records returned.')
                return data

            except Exception as e:
                exception_type, exception_object, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
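
Since api.plos.org is a Solr endpoint, the date window can also be passed through requests params instead of being spliced into the URL, and json.loads(BeautifulSoup(...).text) collapses to response.json(). A minimal sketch, assuming the standard Solr fq (filter query) and wt parameters are accepted here; the function name and rows default are illustrative:

import requests

def plos_search(query, from_yr, to_yr, rows=10):
    params = {
        'q': query,
        'fq': ('publication_date:[' + from_yr + '-01-01T00:00:00Z TO '
               + to_yr + '-12-31T23:59:59Z]'),
        'start': 1,
        'rows': rows,
        'wt': 'json',  # ask Solr for JSON directly
    }
    resp = requests.get('http://api.plos.org/search', params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()['response']['docs']
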