import requests
from requests_html import HTMLSession


def crawlGradeFromGwgl():
    # authenticate by replaying cookies captured from a logged-in browser session
    session = HTMLSession()
    jar = requests.cookies.RequestsCookieJar()
    jar.set('JSESSIONID', '28191DA0466EDA27D69CB81417772905.node1')
    jar.set('C2RT', 'a33e03cddc0fb11b90f118ae407641dc')
    jar.set('bocms_visite_user_session', 'C816B689B1A91CC278FD5FCD7CD1CD61')
    jar.set('SERVERNAME', 'xk2')
    jar.set('GSESSIONID', '28191DA0466EDA27D69CB81417772905.node1')
    session.cookies = jar

    courseIdToFind = [
        '12160007.08',
        '22163171.01',
        '22163219.01',
        '22163280.01',
    ]
    # request_toJWGL = session.get('http://jwglnew.hunnu.edu.cn/eams/teach/grade/course/person!search.action?semesterId=82&projectType=&_=1578409087439')
    result = session.get(
        'http://jwglnew.hunnu.edu.cn/eams/teach/grade/course/person!historyCourseGrade.action'
    )
    # keep only the grade-table rows that mention one of the course ids
    gradeRows = result.html.find('div.grid>table.gridtable>tbody tr',
                                 containing=courseIdToFind)
    if len(gradeRows) > 0:
        textBuffer = ''
        for item in gradeRows:
            courseId = item.find('td:nth-child(3)')[0].text
            courseName = item.find('td:nth-child(4)')[0].text
            courseScore = item.find('td:nth-last-child(3)')[0].text
            textBuffer += courseId + ' ' + courseName + ': ' + courseScore + '\n'
        print(textBuffer)
        # push the result to a local HTTP bot API as a private message
        requests.get(
            'http://127.0.0.1:5700/send_private_msg?user_id=806361380&message='
            + textBuffer)
    else:
        print('[' + ','.join(courseIdToFind) + '] Not Found')
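# A minimal sketch (not part of the original script): instead of hard-coding
# each jar.set() call above, the raw "Cookie:" header copied from the
# browser's developer tools can be parsed into the same RequestsCookieJar.
# The header string below just reuses two values from the function above.
import requests


def jar_from_cookie_header(raw_header):
    # split "k1=v1; k2=v2" pairs and load them into a RequestsCookieJar
    jar = requests.cookies.RequestsCookieJar()
    for pair in raw_header.split('; '):
        name, _, value = pair.partition('=')
        jar.set(name, value)
    return jar


jar = jar_from_cookie_header(
    'JSESSIONID=28191DA0466EDA27D69CB81417772905.node1; SERVERNAME=xk2')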
import json
from http import cookiejar

from requests_html import HTMLSession


def get_data(self):
    # HotspotSource is this app's Django model, imported elsewhere in the module
    source = HotspotSource.objects.get(code=1)
    uri = 'https://www.zhihu.com/hot'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/54.0.2840.98 Safari/537.3",
        "Referer": "https://www.zhihu.com/"
    }
    session = HTMLSession()
    session.cookies = cookiejar.LWPCookieJar(filename='./hotspot/utils/cookies.txt')
    session.cookies.load(ignore_discard=True)
    # TODO: Zhihu requires login and there is no way around it; add automatic
    # captcha recognition later. For now the cookie is hard-coded on disk.
    r = session.get(url=uri, headers=headers).html
    sections = r.find('#TopstoryContent > div > div > div.HotList-list',
                      first=True).find('section')
    tmp = []
    for section in sections:
        # pull the rank (order) and excerpt (desc) out of the section's text lines
        order, _, desc, _, *t = section.text.split('\n')
        hot_uri = section.find('a', first=True).attrs['href']
        title = section.find('a', first=True).attrs['title']
        # strip the trailing "分享" ("share") label from the heat metric
        count = section.find('.HotItem-metrics', first=True).text.replace('分享', '')
        data = {
            'title': title,
            'uri': hot_uri,
            'extra': json.dumps({
                'count': count,
                'order': order,
                'desc': desc,
            }),
            'hotspot_source': source.id,
        }
        tmp.append(data)
    return tmp
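# A minimal sketch of how the cookies.txt read above could be produced: run an
# authenticated request once in the same kind of HTMLSession, then persist the
# jar with LWPCookieJar.save(). The login URL and form fields here are
# placeholders, not Zhihu's actual login API.
from http import cookiejar
from requests_html import HTMLSession


def save_login_cookies(username, password):
    session = HTMLSession()
    session.cookies = cookiejar.LWPCookieJar(filename='./hotspot/utils/cookies.txt')
    # hypothetical login request; substitute a real authenticated flow
    session.post('https://www.zhihu.com/login',
                 data={'username': username, 'password': password})
    # ignore_discard=True also saves session cookies that carry no expiry
    session.cookies.save(ignore_discard=True)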
import logging
import os

from requests_html import HTMLSession


def __init__(self, **kwargs):
    '''
    Base class for common scraping tasks

    Args:
        delay (int): seconds between requests, defaults to 2
        expire_hours (int): cache expiry, defaults to 168 (one week)
        cookies: cookie jar to attach to the session
        headers (dict): HTTP headers, defaults to a Chrome User-Agent
        proxies (dict): proxies for the session
        cache_name (str): cache file name; bare names are placed under /tmp
    '''
    logging.getLogger(__name__).addHandler(logging.NullHandler())
    self.urls = []

    # use requests HTML to aid parsing
    # has all same methods as requests.Session
    _s = HTMLSession()

    # delay/expire
    if kwargs.get('delay'):
        self.delay = kwargs['delay']
    else:
        self.delay = 2
    if kwargs.get('expire_hours'):
        self.expire_hours = kwargs['expire_hours']
    else:
        self.expire_hours = 168

    # add cookies (cookielib is the python 2 name; fall back to http.cookiejar)
    if kwargs.get('cookies'):
        _s.cookies = kwargs['cookies']
    else:
        try:
            import cookielib
            _s.cookies = cookielib.MozillaCookieJar()
        except (NameError, ImportError):
            import http.cookiejar
            _s.cookies = http.cookiejar.MozillaCookieJar()

    # add headers
    if kwargs.get('headers'):
        _s.headers = kwargs['headers']
    else:
        ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
        _s.headers = {'User-Agent': ua}

    # add proxies
    if kwargs.get('proxies'):
        _s.proxies = kwargs['proxies']

    # add cache; bare names go under /tmp, with a generic fallback name
    cache_name = kwargs.get('cache_name', 'scraper-cache')
    if '/' not in cache_name:
        self.cache_name = os.path.join('/tmp', cache_name)
    else:
        self.cache_name = cache_name
    try:
        from cachecontrol import CacheControlAdapter
        from cachecontrol.heuristics import ExpiresAfter
        from cachecontrol.caches import FileCache
        _s.mount('http://',
                 CacheControlAdapter(cache=FileCache(self.cache_name),
                                     cache_etags=False,
                                     heuristic=ExpiresAfter(hours=self.expire_hours)))
    except ImportError:
        try:
            import requests_cache
            requests_cache.install_cache(self.cache_name)
        except Exception:
            logging.exception('could not install cache')

    self.s = _s
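# Usage sketch, assuming the __init__ above belongs to a class named Scraper
# (the class name is not shown in the snippet and is a placeholder):
scraper = Scraper(cache_name='scraper-demo', delay=1, expire_hours=24)
resp = scraper.s.get('https://example.com/api/data')  # served from cache on repeat hits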
import logging
import os
import random

from requests_html import HTMLSession

# USER_AGENTS (a list of User-Agent strings) and random_string() are helpers
# defined elsewhere in this module; see the sketch after this snippet.


def __init__(self, **kwargs):
    """Base class for common scraping tasks."""
    logging.getLogger(__name__).addHandler(logging.NullHandler())
    self.urls = []

    # use requests HTML to aid parsing
    # has all same methods as requests.Session
    _s = HTMLSession()
    self.delay = kwargs.get("delay", 2)
    self.expire_hours = kwargs.get("expire_hours", 168)

    # add cookies
    if kwargs.get("cookies"):
        _s.cookies = kwargs["cookies"]
    else:
        import http.cookiejar
        _s.cookies = http.cookiejar.MozillaCookieJar()

    # add headers
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "accept": "application/json, text/plain, */*",
    }
    _s.headers.update(default_headers)
    if kwargs.get("headers"):
        _s.headers.update(kwargs["headers"])

    # add proxies
    if kwargs.get("proxies"):
        _s.proxies = kwargs["proxies"]

    # add cache: random name by default, bare names go under /tmp
    if not kwargs.get("cache_name"):
        self.cache_name = os.path.join("/tmp", random_string(32))
    elif "/" not in kwargs.get("cache_name", ""):
        self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
    else:
        self.cache_name = kwargs.get("cache_name")
    try:
        from cachecontrol import CacheControlAdapter
        from cachecontrol.heuristics import ExpiresAfter
        from cachecontrol.caches import FileCache
        _s.mount(
            "http://",
            CacheControlAdapter(
                cache=FileCache(self.cache_name),
                cache_etags=False,
                heuristic=ExpiresAfter(hours=self.expire_hours),
            ),
        )
    except ImportError:
        try:
            import requests_cache
            requests_cache.install_cache(self.cache_name)
        except BaseException:
            logging.exception("could not install cache")
    self.session = _s
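# USER_AGENTS and random_string() are assumed module-level helpers; a minimal
# sketch of what they might look like:
import random
import string

USER_AGENTS = [
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
]


def random_string(length):
    # random lowercase-alphanumeric name, used for the default cache file
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))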
import http.cookiejar
from itertools import chain, combinations

from requests_html import HTMLSession
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

########## Download data

# build a session that persists Freddie Mac login cookies to a local file
session = HTMLSession()
session.cookies = http.cookiejar.LWPCookieJar('cookie')
headers = {
    'Host': 'freddiemac.embs.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Referer': 'https://freddiemac.embs.com/FLoan/secure/login.php?pagename=download',
}


def get_cookie():
    try:
        session.cookies.load(ignore_discard=True)
    except IOError:
        print('Cannot load cookie!')


def login(un, pwd):
    """Enter username and password."""
    auth_url = 'https://freddiemac.embs.com/FLoan/secure/auth.php'
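# The login() body is truncated above; a hedged sketch of how it might
# continue. The form field names 'username' and 'password' are assumptions
# about the Freddie Mac login form, not confirmed against the site.
def login_sketch(un, pwd):
    auth_url = 'https://freddiemac.embs.com/FLoan/secure/auth.php'
    r = session.post(auth_url, data={'username': un, 'password': pwd},
                     headers=headers)
    # persist the authenticated cookies so get_cookie() can reload them later
    session.cookies.save(ignore_discard=True)
    return r.status_code == 200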
import http.cookiejar
from math import *

import requests
from scipy import integrate

s = requests.Session()

from requests_html import HTMLSession

sqrt(1)

session = HTMLSession()
session.cookies = http.cookiejar.MozillaCookieJar("anything.txt")

# module-level registers written by the int_ wrappers
mn, mx = 0, 0


def int_(x):
    # returns a stub integrand that records its arguments into the globals
    # mn/mx (b defaults to the captured x) and always evaluates to 0
    def wrapper(a, b=x):
        global mn, mx
        mn = b
        mx = a
        return 0
    return wrapper


def frac(x):
    # returns the integrand a -> x / a, with x frozen via the default argument
    def wrapper(a, b=x):
        return b / a
    return wrapper
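# Usage sketch: frac(x) builds an integrand a -> x / a that can be passed
# straight to scipy.integrate.quad, while int_(x) builds a stub that only
# records its arguments into the globals mn/mx and contributes nothing.
area, err = integrate.quad(frac(2), 1, 2)  # integral of 2/a over [1, 2] = 2*ln(2)
int_(5)(3)                                 # side effect: mn = 5, mx = 3
print(area, mn, mx)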