Пример #1
0
def click_more_details(driver):
    """Click the "More details on this Institution" link, then pause.

    The link is first looked up by its ``institutionDetailsLink`` id. If that
    lookup or click fails, the element is located by its visible text instead.

    Args:
        driver: Selenium-style driver exposing ``find_element_by_xpath``.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: whatever the text-based fallback raises when both lookup
            strategies fail.
    """
    try:
        driver.find_element_by_xpath("//a[@id='institutionDetailsLink']").click()
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must abort the run instead of silently triggering the fallback.
        driver.find_element_by_xpath("//*[contains(text(), 'More details on this Institution')]").click()

    # NOTE(review): the outer ``runi`` call passes one argument while the inner
    # one passes two; if ``runi`` is a uniform sampler the intended delay may
    # have been ``4 + runi(-2, 1)`` -- confirm against its definition.
    t = runi(4 + runi(-2, 1))
    logger.debug('opened more details link, waiting for {} s'.format(t))
    time.sleep(t)

    return driver
Пример #2
0
def remove_all_universities(driver):
    """Clear the institution list by clicking "Remove all entities" twice.

    This is best-effort: any failure while locating or clicking the control is
    logged as a warning and the function still returns normally.

    Args:
        driver: Selenium-style driver exposing ``find_element_by_xpath``.

    Returns:
        The same ``driver`` object that was passed in.
    """
    logger.debug('removing the universities from the list')
    time.sleep(2 + runi(-0.5, 0.5))

    remove_xpath = "//*[contains(text(), 'Remove all entities from this section')]"

    try:
        # The element with this text is clicked twice, with a pause after each
        # click; presumably the second click confirms the removal -- TODO
        # confirm against the page.
        for _ in range(2):
            driver.find_element_by_xpath(remove_xpath).click()
            time.sleep(5 + runi(-1, 1))
        logger.debug('all institutions removed succesfully')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed by this best-effort handler.
        logger.warning('error has occured during removing all institutions from the list')
        time.sleep(3 + runi(-0.5, 0.5))

    return driver
Пример #3
0
def open_link(driver, link):
    """Load ``link`` in ``driver`` and then sleep for a randomised delay.

    Args:
        driver: object exposing ``get(url)``.
        link: URL to open; also interpolated into the log messages.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: any error raised while loading or logging the page is
            logged as a warning and re-raised unchanged.
    """
    try:
        driver.get(link)
        logger.debug('link opened successfully, {}'.format(link))
    except Exception:
        logger.warning('error during opening the link, {}'.format(link))
        raise

    # Delay is drawn after a successful load only; failures never reach here.
    jitter = runi(-3, 1)
    pause = runi(10 + jitter)
    logger.debug('opened main link, waiting for {} s'.format(pause))
    time.sleep(pause)
    return driver
Пример #4
0
def open_scival_log_in(driver):
    """Open the SciVal full-login page and then sleep for a randomised delay.

    Args:
        driver: object exposing ``get(url)``.

    Returns:
        The same ``driver`` object that was passed in.

    Raises:
        Exception: any error raised while loading the page is logged as a
            warning and re-raised unchanged.
    """
    login_url = 'https://scival.com/customer/authenticate/loginfull'

    try:
        driver.get(login_url)
        logger.debug('driver opened succesfully')
    except Exception:
        logger.warning('error has occured during opening browser')
        raise

    # Only reached when the page load did not raise.
    jitter = runi(-3, 1)
    pause = runi(10 + jitter)
    logger.debug('opened main link, waiting for {} s'.format(pause))
    time.sleep(pause)
    return driver
Пример #5
0
def put_scival_credentials(driver):
    """Fill in the login form and submit it with RETURN.

    The user name is the literal below; the password comes from
    ``read_credentials('password')``. Randomised pauses are inserted after
    locating each field and after submitting.

    Args:
        driver: Selenium-style driver exposing ``find_element_by_xpath``.

    Returns:
        The same ``driver`` object that was passed in.
    """
    # NOTE(review): the user name looks like a redacted placeholder -- confirm
    # the real value before relying on this function.
    login_name = '******'
    secret = read_credentials('password')

    # Both fields are located (with a pause after each) before anything is typed.
    name_input = driver.find_element_by_xpath("//input[@id='username']")
    time.sleep(runi(5 + runi(-3, 1)))

    secret_input = driver.find_element_by_xpath("//input[@id='password-input-password']")
    time.sleep(runi(4 + runi(-3, 1)))

    name_input.send_keys(login_name)
    # RETURN is sent together with the password, which submits the form.
    secret_input.send_keys(secret, Keys.RETURN)

    time.sleep(runi(10 + runi(-3, 1)))

    return driver
Пример #6
0
def open_scopus_link(driver):
    """Walk ``driver`` through the Scopus pages that lead to advanced search.

    Three pages are opened in order (sources, basic search, advanced search).
    After each load the function sleeps for a randomised delay and calls
    ``close_pop_up_window(driver)``.

    Args:
        driver: Selenium-style driver exposing ``implicitly_wait`` and ``get``.

    Returns:
        The same ``driver`` object that was passed in.
    """
    driver.implicitly_wait(2)  # seconds

    # To get to advanced search we go through several links, in this order.
    # Each entry pairs the URL with the debug message logged after opening it.
    steps = (
        ('https://www.scopus.com/sources?zone=&origin=NO%20ORIGIN%20DEFINED',
         'opened main link, waiting for {} s'),
        ('https://www.scopus.com/search/form.uri?zone=TopNavBar&origin=sbrowse&display=basic',
         'opened search link, waiting for {} s'),
        ('https://www.scopus.com/search/form.uri?display=advanced&clear=t&origin=searchbasic&txGid=fc476bc6f6c3a112a577edd9f6f26e14',
         'opened advanced search link, waiting for {} s'),
    )

    for url, message in steps:
        driver.get(url)
        delay = runi(10 + runi(-3, 1))
        logger.debug(message.format(delay))
        time.sleep(delay)
        close_pop_up_window(driver)

    return driver
Пример #7
0
    reg_com_year = re.compile(reg_exp_year)


    xpath_est = '//tbody//tr//th[text()="Established"]'
    xpath_founded = '//tbody//tr//th[text()="Founded"]'

    xpath_est_date = '//tbody//tr//th[text()="Established"]/following-sibling::td'
    xpath_est_date_short = './following-sibling::td'

    # driver.find_element_by_xpath('//tbody//tr//th[@value="Established"]')


    # for index, row in df.iloc[:10,:].iterrows():
    for index, row in df.iterrows():

        time.sleep(2+runi(0.5, 1.5))

        if row[cname_date] != '':
            continue

        aff = row[cname_name]
        logger.debug("extracting data for {}".format(aff))

        aff = row[cname_name]

        aff_name_in_link = aff.replace(' ', '_')

        aff_link = link + aff_name_in_link


        try:
Пример #8
0
def main(n, year, metricType, ack_params, metrics_params):
    """Fetch patent counts through Scopus advanced search and store them.

    For every record returned by ``coll_ack.find_valid_parent_ids`` a query is
    built with ``create_query``, run through ``get_patent_count``, and the
    outcome is written to the metrics and acknowledgement collections.

    Args:
        n: forwarded to ``find_valid_parent_ids``; presumably the maximum
            number of records to process -- TODO confirm.
        year: year used in the query and stored with each result.
        metricType: metric identifier used for record selection and stored
            with each result.
        ack_params: mapping with keys ``'db_name'``, ``'coll_name'`` and
            ``'query_type'``.
        metrics_params: mapping with keys ``'db_name'`` and ``'coll_name'``.

    Returns:
        The Firefox driver. It has already been quit by the ``finally``
        clause when this function returns.
    """
    adv_search_link = 'https://www.scopus.com/search/form.uri?display=advanced&clear=t&origin=searchbasic&txGid=fc476bc6f6c3a112a577edd9f6f26e14'

    # NOTE(review): this message is emitted before any work has been done;
    # kept as in the original.
    logger.debug('downloaded results table')

    db_name_ack = ack_params['db_name']
    coll_name_ack = ack_params['coll_name']

    db_name_metrics = metrics_params['db_name']
    coll_name_metrics = metrics_params['coll_name']

    parent_field = 'scival_id'
    child_id_field = 'child_id'

    coll_ack = mongo_metric_ack(db_name=db_name_ack, coll_name=coll_name_ack)
    coll_metrics = mongo_scopus_metrics(db_name=db_name_metrics,
                                        coll_name=coll_name_metrics)

    valid_dicts = coll_ack.find_valid_parent_ids(metricType, str(year), n)

    logger.debug('opening browser with scopus advanced search link')

    timeout = 60

    fp = webdriver.FirefoxProfile()
    fp.set_preference("http.response.timeout", timeout)
    fp.set_preference("dom.max_script_run_time", timeout)

    binary = FirefoxBinary('/usr/lib/firefox/firefox')
    driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=fp)

    try:
        driver.get(adv_search_link)
        time.sleep(5)

        logger.debug('doing search')
    except Exception as e:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit
        # still propagate; the cause is logged instead of being dropped.
        logger.warning('error has occured during opening browser')
        logger.warning(e)

    else:

        for valid_dict in valid_dicts:

            logger.debug('opening advanced search link')
            driver.get(adv_search_link)
            time.sleep(5 + runi(-1, 1))

            close_pop_up_window(driver)

            parent_id = valid_dict[parent_field]
            aff_id = valid_dict[child_id_field]
            aff_name = valid_dict['name']

            logger.debug('creating query for search')
            # the query shape is selected by ack_params['query_type']
            query = create_query(aff_id, aff_name, ack_params['query_type'],
                                 year)
            logger.debug('query is {}'.format(query))

            # Common key fields shared by the metric and the ack documents.
            q = {
                parent_field: parent_id,
                'metricType': metricType,
                'year': year
            }

            metric_response = q.copy()
            ack_response = q.copy()

            try:
                logger.debug('getting patent_count for {}'.format(aff_id))
                patent_count = get_patent_count(driver, query)

            except TimeoutException as e:
                # A timeout stops the whole run; no ack is written for this
                # record.
                logger.warning('timeout error')
                print(e)
                break

            except Exception as e:
                logger.warning('error has occured')
                logger.warning(e)

                # Any other failure is recorded as ack = -1 and stops the run.
                ack_response['ack'] = -1
                coll_ack.update_item_by_year(parent_field, **ack_response)

                break

            else:

                # saving response and ack
                metric_response['value'] = patent_count
                ack_response['ack'] = 1

                logger.debug(
                    'number of patents has been retrieved succesfully')
                logger.debug('number of patents for {} is {}'.format(
                    parent_id, patent_count))

                logger.debug("updating metrics and ack dbs")
                coll_metrics.update_item_by_year(parent_field,
                                                 **metric_response)
                coll_ack.update_item_by_year(parent_field, **ack_response)
                logger.debug("updating metrics and ack dbs finished")

            print(ack_response)

    finally:
        # Always release the browser, whatever happened above.
        driver.quit()

    return driver
Пример #9
0
def _read_patent_value(driver):
    """Poll the results page and return the raw patent-count value.

    Returns:
        ``0`` as soon as ``#hubLinksContainer`` has the class ``hidden``;
        otherwise the text of ``#patentLink`` from the last poll in which it
        was displayed.

    Raises:
        RuntimeError: ``#patentLink`` was never displayed during the polls.
        Exception: anything raised by the element lookups.
    """
    patent_value = None

    # Five polls, one second apart, re-reading the link text on each pass.
    for _ in range(5):
        logger.debug('retrieving patent_count')
        container = driver.find_element_by_id('hubLinksContainer')
        if container.get_attribute('class') == 'hidden':
            logger.debug('no patent elements were found')
            return 0

        patent_element = driver.find_element_by_id('patentLink')
        if patent_element.is_displayed():
            logger.debug('getting #patentLink once more')
            patent_value = patent_element.text

        # Sleep on every pass so a present-but-not-displayed link cannot turn
        # the loop into an unbounded busy wait.
        time.sleep(1)

    if patent_value is None:
        raise RuntimeError('patentLink was never displayed on the results page')

    return patent_value


def get_patent_count(driver, query):
    """Run ``query`` in the search form and return the patent count shown.

    Args:
        driver: Selenium-style driver already showing the search form.
        query: query string typed into the ``searchfield`` element.

    Returns:
        ``0`` when the "alert-danger" banner is present after the search;
        otherwise the result of ``extract_integer_`` applied to the value read
        from the page.

    Raises:
        Exception: any failure while reading the patent value is re-raised
            after an attempt to click "Edit search query". Errors from the
            search itself, including the explicit 60 s wait, propagate
            unchanged.
    """
    # click to activate the text field
    # sometimes we need to click 'contentEditLabel', sometimes 'searchfield'
    logger.debug('clicking on search input field')
    try:
        logger.debug('clicking on contentEditLabel')
        driver.find_element_by_id('contentEditLabel').click()
    except Exception:
        logger.debug('clicking on searchfield')
        driver.find_element_by_id('searchfield').click()

    # fill the text field and send the request
    element = driver.find_element_by_id('searchfield')

    logger.debug('clearing search field')
    element.clear()
    logger.debug('entering query into search field')
    element.send_keys(query, Keys.RETURN)

    time.sleep(4 + runi(0, 1))

    logger.debug('waiting for page to be downloaded')
    wait(driver, 60).until(
        EC.presence_of_element_located((By.ID, "searchResFormId")))

    # Reload the results page by its own URL before inspecting it.
    driver.get(driver.current_url)

    time.sleep(10 + runi(0, 1))

    # no documents found
    if driver_has_element_by_xpath(
            driver, '//div[@class="alert alert-danger"]/a[@class="close"]'):
        logger.debug('no document found for here')
        return 0

    # documents found
    try:
        patent_value = _read_patent_value(driver)
    except Exception:
        # Try to leave the page on the query editor, then report the failure.
        # A silent ``None`` here would be recorded by the caller as a
        # successful result.
        driver.find_element_by_xpath(
            "//button[@title='Edit search query']").click()
        raise

    logger.debug('patent_value is {}'.format(patent_value))

    a = extract_integer_(patent_value)

    logger.debug('extracted integer is {}'.format(a))

    time.sleep(2 + runi(0, 1))

    try:
        logger.debug('clicking on editAuthSearch')
        driver.find_element_by_id('editAuthSearch').click()
    except Exception:
        logger.debug('error during clicking on editAuthSearch')
        logger.debug('instead find "Edit search query" field by force')
        driver.find_element_by_xpath(
            "//button[@title='Edit search query']").click()

    time.sleep(2 + runi(0, 1))

    return a
Пример #10
0
            try:
                table, extracted_name, child_count = extract_table(driver)
                logger.debug('table extracted succesfully')
            except:
                logger.warning('table could not be extracted')

            else:
                logger.debug('saving the child_ids of {}'.format(valid_ids[i]))
                table.to_excel('data/child_id/{}.xlsx'.format(valid_ids[i]), index=False)


                logger.debug('saving acknoledge to the table')
                df.loc[df.id == valid_ids[i], 'scopus_id_downloaded'] = 1
                df.to_csv(fname, index=False)

                time.sleep(5 + runi(-1, 1))

                parent_aff = {'name': valid_names[i],
                              'scival_id': valid_ids[i]
                }

                append_scopus_ids_to_parent(db_ids, table, parent_aff)


        logger.debug('opening link to remove all universities')
        driver = open_link(driver, a)
        driver = remove_all_universities(driver)

        time.sleep(3 + runi(-1, 1))

    except: