import sys, os

up_level_N = 1
# Make the project root importable, regardless of the working directory.
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import requests, random, re
from bs4 import BeautifulSoup
from selenium import webdriver
from crawl_tools.ua_pool import get_one_random_ua
from crawl_tools.request_with_proxy import request_with_proxy
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass, except_return_none

ERN_METHOD = lambda func: except_return_none(func, 'IEEE_PARSER')
EP_METHOD = lambda func: except_pass(func, 'IEEE_ARTICLE')

'''
@except_or_none
def get_pdf_link(pdf_page_url):
    with requests.Session() as s:
        soup = BeautifulSoup(
            s.get(
                url=pdf_page_url,
                timeout=30,
                headers={'User-Agent': get_one_random_ua()}
            ).text, 'lxml')
    # The PDF URL sits in the second <frame> of the viewer page.
    return soup.find_all('frame')[1].get('src')
'''
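# Hedged sketch (not part of the original file): how the ERN_METHOD alias
# above is meant to be used. Assuming except_return_none wraps a function so
# that exceptions are logged under the given tag and None is returned, a
# field extractor decorated with it degrades gracefully when markup changes:

@ERN_METHOD
def _demo_title(soup):
    # Hypothetical helper, for illustration only: yields None instead of
    # raising AttributeError when the page has no <h1>.
    return soup.find('h1').text.strip()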
""" import sys, os up_level_N = 1 SCRIPT_DIR = os.path.dirname( os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) root_dir = SCRIPT_DIR for i in range(up_level_N): root_dir = os.path.normpath(os.path.join(root_dir, '..')) sys.path.append(root_dir) import re, requests from bs4 import BeautifulSoup from journal_parser.JournalArticle import JournalArticle from crawl_tools.decorators import except_pass, except_return_none ERN_METHOD = lambda func: except_return_none(func, 'TaylorFrancisParser') EP_METHOD = lambda func: except_pass(func, 'TaylorFrancisArticle') class TF_DetailPageParser: ''' http://www.tandfonline.com/doi/abs/10.1080/08912968809386468 ''' def __init__(self, url): self.soup = BeautifulSoup( requests.get( 'http://www.tandfonline.com/doi/abs/10.1080/08912968809386468' ).text, 'lxml') class TaylorFrancisParser: '''
@description: nope
"""
import sys, os

up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass

EP_METHOD = lambda func: except_pass(func, 'WileyArticle')


class WileyAllItemsPageParser:
    '''
    sample_url:
        http://onlinelibrary.wiley.com/doi/10.1002/(SICI)1096-987X(199812)19:16%3C%3E1.0.CO;2-O/issuetoc
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('Wiley.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
import sys, os

up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

from crawl_tools.Timer import get_beijing_time
from crawl_tools.request_with_proxy import request_with_random_ua, request_with_proxy
from db_config import REMOTE_CONNS_POOL
import psycopg2, time, random
from crawl_tools.decorators import except_pass, except_return_none
from multiprocessing.dummy import Pool as ThreadPool

EP_METHOD = lambda func: except_pass(func, 'JournalSpider')
ERN_METHOD = lambda func: except_return_none(func, 'JournalSpider')


class JournalSpider:
    def __init__(self, JournalObj):
        self.JournalObj = JournalObj
        self.volume_links = []

    def generate_volume_links(self):
        # Overridden per publisher: fills self.volume_links.
        pass

    @EP_METHOD
    def _run(self, AllItemsPageParser, JournalArticle,
Parser for Emerald Publisher
"""
import sys, os

up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass

EP_METHOD = lambda func: except_pass(func, ModelName='EmeraldArticle')


class EmeraldParser:
    '''
    sample_url: http://www.emeraldinsight.com/toc/f/32/9%2F10
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('./emerald.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.find_all(class_='articleEntry')
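# Hedged usage sketch (not part of the original file): with a cached copy of
# a TOC page saved as ./emerald.html, the parser can be smoke-tested offline:
if __name__ == '__main__':
    parser = EmeraldParser(from_web=False)  # reads ./emerald.html
    print(len(parser.sections))             # one entry per 'articleEntry' node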
@description: Parser for Acs Publisher
"""
import sys, os

up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass

EP_METHOD = lambda func: except_pass(func, ModelName='AcsArticle')


class AcsParser:
    '''
    sample_url: http://pubs.acs.org/toc/mpohbp/0/0
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('./pages/Acs.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.select('.articleBox')
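# Hedged usage sketch (not part of the original file): assuming a cached TOC
# page exists at ./pages/Acs.html, each '.articleBox' element corresponds to
# one article entry:
if __name__ == '__main__':
    parser = AcsParser(from_web=False)
    for box in parser.sections:
        print(box.get_text(' ', strip=True)[:80])  # first 80 chars per entry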
@description: --
"""
import sys, os

up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass

EP_METHOD = lambda func: except_pass(func, 'SageArticle')


class SageParser:
    '''
    sample_url: http://tcn.sagepub.com/content/12/2.toc#content-block
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('Sage.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.select('.toc-cit')
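# Hedged usage sketch (not part of the original file): reading a cached copy
# of a Sage TOC page from ./Sage.html, one '.toc-cit' block per citation:
if __name__ == '__main__':
    parser = SageParser(from_web=False)
    print('%d citation blocks found' % len(parser.sections))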
""" import sys, os up_level_N = 1 SCRIPT_DIR = os.path.dirname( os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) root_dir = SCRIPT_DIR for i in range(up_level_N): root_dir = os.path.normpath(os.path.join(root_dir, '..')) sys.path.append(root_dir) import re from bs4 import BeautifulSoup from journal_parser.JournalArticle import JournalArticle from crawl_tools.decorators import except_pass EP_METHOD = lambda func: except_pass(func, 'BioMedArticle') class BioMedParser: def __init__(self, html_source=None, from_web=True): if not from_web: with open('BioMed.html', 'rb') as f: html_source = f.read() self.soup = BeautifulSoup(html_source, 'lxml') @property def pages_amount(self): return int(self.soup.find(text=re.compile('Page 1 of')).split(' ')[-1]) @property def sections(self):