def analyze(txt):
    """Run every positional analyzer over *txt* and collect the results.

    :param txt: raw document text to analyze
    :return: dict with keys ``"contacts"``, ``"vlpfs"``, ``"headers"``,
        ``"segments"`` and ``"newlines"``.  Any analyzer other than
        ``contactInfo`` that raises falls back to an empty list.
    """
    # NOTE: an earlier revision split the top 20% (arbitrary) of phrases
    # into a separate "highVlpfs" bucket; that split is retired.
    def _best_effort(analyzer):
        # BUG FIX: the original wrapped the *assignment* of a precomputed
        # value in try/except (which cannot fail) while the fallible
        # analyzer call ran unguarded; guard the call itself instead.
        # Also: bare `except:` narrowed to `except Exception`.
        try:
            return analyzer(txt)
        except Exception:
            return []

    positions = {}
    positions["contacts"] = contactInfo(txt)  # intentionally unguarded
    positions["vlpfs"] = _best_effort(vlpf)
    positions["headers"] = _best_effort(headers)
    positions["segments"] = _best_effort(segments)
    positions["newlines"] = _best_effort(preserveNewLines)
    return positions
def analyze(txt):
    """Run every positional analyzer over *txt* and collect the results.

    :param txt: raw document text to analyze
    :return: dict with keys ``"contacts"``, ``"vlpfs"``, ``"headers"``,
        ``"segments"`` and ``"newlines"``.  Any analyzer other than
        ``contactInfo`` that raises falls back to an empty list.
    """
    # NOTE: an earlier revision split the top 20% (arbitrary) of phrases
    # into a separate "highVlpfs" bucket; that split is retired.
    def _best_effort(analyzer):
        # BUG FIX: the original wrapped the *assignment* of a precomputed
        # value in try/except (which cannot fail) while the fallible
        # analyzer call ran unguarded; guard the call itself instead.
        # Also: bare `except:` narrowed to `except Exception`.
        try:
            return analyzer(txt)
        except Exception:
            return []

    positions = {}
    positions["contacts"] = contactInfo(txt)  # intentionally unguarded
    positions["vlpfs"] = _best_effort(vlpf)
    positions["headers"] = _best_effort(headers)
    positions["segments"] = _best_effort(segments)
    positions["newlines"] = _best_effort(preserveNewLines)
    return positions
def getVideoUrl(self):
    """Resolve the video address behind this instance's URL.

    Builds request headers for ``self.url``, runs the private resolver,
    and reports the outcome.

    :return: dict with the parse status ``success`` and the resolved
        ``videoUrl``.
    """
    self.headers = headers(self.url).buildHeader()
    self.__getvideoUrl()
    result = {'success': self.success, 'videoUrl': self.videoUrl}
    return result
def __init__(self, wp):
    """Prepare a riven.market search request for weapon *wp*.

    :param wp: weapon identifier inserted into the query string
    """
    self.wp = wp
    self.url = 'http://riven.market/_modules/riven/showrivens.php?'
    self.header = {
        # BUG FIX: content-type was misspelled 'application/jaon'.
        'content-type': 'application/json',
        'user-agent': headers(),
    }
    # NOTE(review): elsewhere in this project `now` is a zero-arg lambda
    # wrapping time.time(); if that holds here, `1000 * now` raises
    # TypeError and should be `1000 * now()` — confirm before changing.
    self.params = 'baseurl=aHR0cHM6Ly9yaXZlbi5tYXJrZXQv&platform=PC&limit=200&recency=-1&veiled=false&onlinefirst=true&polarity=all&rank=all&mastery=16&weapon=%s&stats=Any&neg=all&price=20000&rerolls=-1&sort=price&direction=ASC&page=1&time=%d' % (
        str(self.wp), int(1000 * now))
def get_html_text(url):
    """Fetch *url* and return the decoded page body.

    :param url: address to download
    :return: page text, or a single-space string on any request failure
    """
    try:
        agent = {'user-agent': headers.headers()}
        response = requests.get(url, headers=agent, timeout=30)
        response.raise_for_status()
        # Let requests guess the real charset (typically utf-8).
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        return ' '
def test_headers():
    """headers.headers must parse the sample email's header block."""
    expected = {
        'date': 'Thu, 2 Jan 2014 21:37:44 +0000',
        'from': 'Thomas Levine <*****@*****.**>',
        'user-agent': 'Mutt/1.5.22 (2013-10-16)',
        'content-type': 'multipart/mixed; boundary="AqsLC8rIMeq19msA"',
    }
    n.assert_dict_equal(headers.headers(email), expected)
def read_email(email: bytes) -> dict:
    'Compute all the statistics.'
    message = message_from_bytes(email)
    # Start with the header-derived features...
    features = {}
    features.update(headers(message))
    # ...then add one entry per public function in the stats module.
    for stat_name in dir(stats):
        if stat_name.startswith('_'):
            continue
        features[stat_name] = getattr(stats, stat_name)(message)
    return features
def main():
    """Export the match groups for every configured match GUID to CSV."""
    session = requests.session()
    columns = ['sourceGuid'] + FIELD_NAMES + RELATIONSHIP_FIELD_NAMES
    with open(OUTPUT_FILENAME, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for match_guid in MATCH_GUIDS:
            url = URL.format(user_guid=USER_GUID, match_guid=match_guid)
            response = session.get(url, headers=headers())
            payload = json.loads(response.text)
            if payload.get('matchGroups'):
                write_match_groups(writer, payload['matchGroups'], match_guid)
def get_url_list():
    """Query the Taobao model-search endpoint and return the model list.

    :return: list of model records from ``data.searchDOList``
    """
    form = {  # POST parameters
        'viewFlag': 'A',
        'sortType': 'default',
        'searchStyle': '',
        'searchRegion': 'city:',
        'searchFansNum': '',
        'currentPage': '1',
        'pageSize': '100',
    }
    header = {
        # BUG FIX: content-type was misspelled 'application/jaon'.
        'content-type': 'application/json',
        'user-agent': headers(),
    }
    url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
    r = requests.post(url, data=form, headers=header)
    # Renamed from `json` so the local no longer shadows the json module.
    payload = loads(r.text)  # JSON text -> dict
    return payload['data']['searchDOList']
def info(keywd):
    """Proxy a Baidu search for *keywd* and rebrand the result page.

    :param keywd: search keyword to forward to Baidu
    :return: the result page HTML with Baidu branding stripped/rewritten
    """
    from requests import get
    from headers import headers

    response = get(f'https://www.baidu.com/s?ie=UTF-8&wd={keywd}',
                   headers=headers())
    response.encoding = 'utf-8'
    page = response.text
    # (old, new) pairs applied in order to strip or rewrite page chrome.
    rewrites = (
        ('<img class="index-logo-src" src="" alt="到百度首页" title="到百度首页">',
         ''),
        ('<a href="https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F" name="tj_login" class="lb" onclick="return false;">登录</a>',
         '<a href="localhost">'),
        ('百度', '邢栋的搜索引擎'),
        ('<span class="bg s_ipt_wr quickdelete-wrap"><span class="soutu-btn"></span><input id="kw" name="wd" class="s_ipt" value="hi" maxlength="255" autocomplete="off"><a href="javascript:;" id="quickdelete" title="清空" class="quickdelete" style="top: 0px; right: 0px; display: block;"></a><span class="soutu-hover-tip" style="display: none;">按图片搜索</span></span>',
         ''),
        ('邢栋的搜索引擎热榜', ''),
        ('换一换', ''),
        ('//www.baidu.com/img/flexible/logo/pc/result.png', ''),
    )
    for old, new in rewrites:
        page = page.replace(old, new)
    return page
def main():
    """Page through the user's match list, writing match groups to CSV.

    Retries transient API errors up to MAX_RETRIES_PER_URL per page and
    MAX_TOTAL_RETRIES overall; stops cleanly when matchCount reaches 0.
    """
    session = requests.session()
    with open(OUTPUT_FILENAME, 'w', newline='') as csvfile:
        writer = csv.DictWriter(
            csvfile, fieldnames=FIELD_NAMES + RELATIONSHIP_FIELD_NAMES)
        writer.writeheader()
        page_number = 0
        retries = 0
        retries_per_url = 0
        while True:
            print("page {}...".format(page_number + 1), end="")
            url = MATCHES_URL.format(
                user_guid=USER_GUID, page_number=page_number)
            response = session.get(url, headers=headers())
            payload = json.loads(response.text)
            try:
                # Missing 'matchCount' means an API error -> KeyError path.
                count = payload['matchCount']
                if count == 0:
                    print("finished!")
                    break
                print("writing {} matches...".format(count), end="")
                write_match_groups(writer, payload['matchGroups'])
                page_number += 1
                retries_per_url = 0
                print("done")
            except KeyError:
                print("{}...".format(payload.get('error')), end="")
                if (retries >= MAX_TOTAL_RETRIES
                        or retries_per_url >= MAX_RETRIES_PER_URL):
                    print('too many errors, terminating')
                    break
                retries += 1
                retries_per_url += 1
                print('retrying')
def __init__(self):
    """Create the scraper's HTTP session, request headers, and pipeline."""
    # Private (name-mangled) collaborators, built once per instance.
    self.__session = requests.session()
    self.__header = headers()
    self.__pipeline = Ctrip_pipe()
def __init__(self):
    """Create the scraper's private HTTP session and default headers."""
    self.__session = requests.session()
    self.__header = headers()
import requests
import re
import os
from headers import headers
from multiprocessing import Pool, cpu_count
import time
import asyncio

now = lambda: time.time()

# BUG FIX: content-type was misspelled 'application/jaon'.
header = {'content-type': 'application/json', 'user-agent': headers()}


def main(page):
    """Download every wallpaper on one listing page of wall.alphacoders.com.

    :param page: page number of the "Abstract Wallpapers" category
    """
    url = 'https://wall.alphacoders.com/by_category.php?'
    params = {'id': 1, 'name': 'Abstract Wallpapers', 'page': page}
    data = {'view': 'paged', 'min_resolution': 0x0,
            'resolution_equals': '>=', 'sort': 'rating'}
    response = requests.post(url, params=params, data=data, headers=header)
    html = response.text
    image_url_list = re.findall(r'download-button" data-href="(.*?)">', html)
    # BUG FIX: raw strings — the original 'D:\desktop\wallpaper\%d' only
    # worked because '\d' and '\w' happen to be invalid escapes that Python
    # leaves untouched (and warns about).
    page_dir = r'D:\desktop\wallpaper' + '\\%d' % page
    for image_url in image_url_list:
        image = requests.get(image_url, headers=header)
        file_name = re.findall(r'wallpaper/(.*?)/images', image_url)[0]
        if not os.path.exists(page_dir):  # create the page folder on demand
            os.mkdir(page_dir)
        with open(page_dir + '\\%s.jpg' % file_name, 'wb') as file:
            file.write(image.content)
        print('图片:%s 下载完毕' % file_name)
    print('第%d页下载完毕' % page)


if __name__ == "__main__":
    p = Pool(cpu_count())
    start = now()
    p.apply_async(main, (1,))
    # BUG FIX: without close()/join() the interpreter could exit before
    # the worker process ran, silently downloading nothing.
    p.close()
    p.join()