-
Notifications
You must be signed in to change notification settings - Fork 0
/
Scraper.py
95 lines (76 loc) · 3.15 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Purpose of the following code is to aggregate all links
# on the Cochrane database that have triple (YES/NO/Maybe) answers
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import time
import cPickle as p
import pymongo as mongo
def getLink(el):
    """Return the ``href`` of the result link found inside ``el``.

    ``el`` (presumably a Selenium element for one search-result article) is
    searched for its "results-block__description-heading" child; the
    "results-block__link" element inside that heading supplies the URL.
    """
    return (
        el.find_element_by_class_name("results-block__description-heading")
        .find_element_by_class_name("results-block__link")
        .get_attribute("href")
    )
def getLinksSinglePage(collection, driver, topic=None):
    """Store the link of every article on the currently loaded results page.

    Waits up to one second for an element with class
    "results-block__article" to be present, then inserts one document of the
    form ``{"Topic": topic, "link": href}`` into ``collection`` per article.
    An article whose link cannot be read or stored is reported on stdout and
    skipped; the remaining articles are still processed.

    Args:
        collection: object exposing ``insert_one`` (a pymongo collection in
            this script).
        driver: Selenium driver already positioned on a results page.
        topic: topic name recorded with every link. When omitted, the
            module-level ``topic`` variable is used so that existing
            two-argument callers keep working.

    Raises:
        ValueError: if no topic is passed and no module-level ``topic``
            exists.
        Whatever ``WebDriverWait.until`` raises when no article appears in
        time (Selenium's ``TimeoutException``).
    """
    if topic is None:
        # Legacy behaviour: the function used to read the global loop
        # variable ``topic`` implicitly.
        topic = globals().get("topic")
        if topic is None:
            raise ValueError("topic was not supplied and no global 'topic' is defined")

    # Make sure the result list has been rendered before reading it.
    WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.CLASS_NAME, "results-block__article"))
    )
    articles = driver.find_elements_by_class_name("results-block__article")
    for index, article in enumerate(articles):
        # A fresh dict per article: insert_one adds an ``_id`` key to it.
        info = {"Topic": topic}
        try:
            info["link"] = getLink(article)
            collection.insert_one(info)
        except Exception:
            # Best effort: report the failing position and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C stop the run.
            print("Failed to save at %s" % (index,))


def getLinks(topic, collection, driver):
    """Collect the article links of every results page for one topic.

    Opens the Cochrane topic list, follows the link whose text equals
    ``topic``, switches the result list to 100 entries per page and then
    walks the pages, handing each one to ``getLinksSinglePage``.

    Args:
        topic: exact link text of the topic on the topic-list page.
        collection: destination passed through to ``getLinksSinglePage``.
        driver: Selenium driver used for all navigation.

    Selenium lookup/timeout errors are not handled here and propagate to the
    caller.
    """
    startPage = "http://www.cochranelibrary.com/home/topic-and-review-group-list.html?page=topic"
    driver.get(startPage)
    # Open the result list of the requested topic.
    driver.find_element_by_link_text(topic).click()

    # The page-size selector doubles as the "page is loaded" signal.
    numberSelector = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.ID, "results_per_page_top"))
    )
    Select(numberSelector).select_by_visible_text("100")
    # Fixed pause, presumably to let the list reload with the new page size.
    time.sleep(3)

    # The number of pagination entries (minus one) decides how many pages
    # are visited.
    # NOTE(review): if a topic has no pagination list at all, the lookup
    # below raises and the whole topic is skipped by the caller - confirm
    # against the live page whether single-page topics are affected.
    pagination = driver.find_element_by_class_name("results-block__pagination-list")
    pageCount = len(pagination.find_elements_by_tag_name("li")) - 1

    for _ in range(pageCount):
        # Pass the topic explicitly instead of relying on a global.
        getLinksSinglePage(collection, driver, topic)
        # Re-locate the pagination list on every page before clicking.
        pagination = driver.find_element_by_class_name("results-block__pagination-list")
        # The last entry is the "next" button.
        pagination.find_elements_by_tag_name("li")[-1].click()
        time.sleep(10)
# Script body: scrape the links of every topic into MongoDB
# (database "Cochrane", collection "Links" on localhost:27017).
c = mongo.MongoClient("localhost", 27017)
dbName = "Cochrane"
db = c[dbName]
driver = webdriver.Firefox()
colName = "Links"
col = db[colName]
# Topics to be downloaded from the Cochrane database; each entry must match
# the link text shown on the topic-list page.
topics = [
    "Allergy & intolerance", "Blood disorders", "Cancer",
    "Child health", "Complementary & alternative medicine",
    "Consumer & communication strategies", "Dentistry & oral health",
    "Developmental, psychosocial & learning problems",
    "Diagnosis", "Ear, nose & throat", "Effective practice & health systems",
    "Endocrine & metabolic", "Eyes & vision", "Gastroenterology", "Genetic disorders",
    "Gynaecology", "Health & safety at work", "Heart & circulation",
    "Infectious disease", "Kidney disease", "Lungs & airways",
    "Mental health", "Methodology", "Neonatal care", "Neurology",
    "Orthopaedics & trauma", "Pain & anaesthesia", "Pregnancy & childbirth",
    "Public health", "Rheumatology", "Skin disorders", "Tobacco, drugs & alcohol",
    "Urology", "Wounds",
]
try:
    # The loop variable must stay named ``topic``: getLinksSinglePage can
    # read it as a module-level global.
    for topic in topics:
        try:
            getLinks(topic, col, driver)
        except Exception:
            # Best effort: one failing topic must not stop the others.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            print("Failed at topic %s" % (topic,))
finally:
    # Always release the browser and the database connection, even when the
    # run is interrupted.
    driver.quit()
    c.close()