import scrapy
from scrapy_splash import SplashRequest

# Project-local MongoDB helper; adjust this import path to wherever
# SynDevAdmin actually lives in this repo (the original imports were stripped).
import SynDevAdmin


class RSCPaperSpider(scrapy.Spider):
    name = "RSC_Paper"
    # HTTP basic-auth credentials (redacted in the source),
    # picked up by Scrapy's HttpAuthMiddleware.
    http_user = '******'
    http_pass = '******'

    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection('RSC')

    def start_requests(self):
        # Request only the articles whose HTML has not been fetched yet.
        for doc in self.col.find({'HTML_Crawled': False}):
            request = SplashRequest(doc['Article_HTML_Link'], self.parse,
                                    args={'wait': 2})
            request.meta['DOI'] = doc['DOI']
            yield request

    def parse(self, response):
        try:
            # The article body on RSC pages sits under div#wrapper.
            html = response.css('div#wrapper').extract_first()
            if html:
                self.col.update_one(
                    {"DOI": response.meta['DOI']},
                    {'$set': {'HTML_Crawled': True,
                              'Paper_Content_HTML': html}})
            else:
                self.col.update_one(
                    {"DOI": response.meta['DOI']},
                    {'$set': {'HTML_Crawled': False,
                              'Error_Msg': "HTML string is None"}})
        except Exception as e:
            self.col.update_one(
                {"DOI": response.meta['DOI']},
                {'$set': {'HTML_Crawled': False,
                          'Error_Msg': str(e)}})
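# Note: SplashRequest only works if the scrapy-splash plugin is wired into the
# project settings. A minimal sketch, assuming a local Splash instance on its
# default port (values follow the scrapy-splash documentation):
#
#   SPLASH_URL = 'http://localhost:8050'
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy_splash.SplashCookiesMiddleware': 723,
#       'scrapy_splash.SplashMiddleware': 725,
#       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
#   }
#   SPIDER_MIDDLEWARES = {
#       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
#   }
#   DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
#
# The spider can then be run from the project root with:
#
#   scrapy crawl RSC_Paper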
import argparse

import requests
from bs4 import BeautifulSoup

# The original imported the urlparse module but called urljoin unqualified;
# import the function directly so the call below works on Python 2 and 3.
try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

# Project-local MongoDB helper; adjust this import path to match the repo layout.
import SynDevAdmin

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '*****@*****.**'


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-j", type=str,
                        help="ECS journal name, one of EEL, JES, JSS, SSL")
    args = parser.parse_args()

    db = SynDevAdmin.db_access()
    db.connect()
    issue_col = db.collection('{}_issue'.format(args.j))

    for doc in issue_col.find({"Scraped": False}):
        res = requests.get(doc["URL"])
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'lxml')
            abstract_links = []
            # Each entry in the issue's table of contents is a .toc-cit block.
            for article in soup.select('.toc-cit'):
                abstract_link = article.find_all('a', {'rel': 'abstract'})
                if abstract_link:
                    # Abstract hrefs are relative; resolve against the issue URL.
                    abstract_links.append(
                        urljoin(doc["URL"], abstract_link[0].get('href')))
            # The original update call is truncated at this point; presumably it
            # marks the issue as scraped and stores the collected links. The
            # field name 'Abstract_Links' below is an assumption, not from the
            # source.
            issue_col.update_one(
                {"_id": doc["_id"]},
                {'$set': {'Scraped': True,
                          'Abstract_Links': abstract_links}})
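# Example invocation, assuming this file is saved as ecs_issue_scraper.py
# (the filename is illustrative, not from the source):
#
#   python ecs_issue_scraper.py -j JES
#
# This walks every unscraped issue recorded in the JES_issue collection and
# collects the abstract links from each issue's table of contents.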