Exemplo n.º 1
0
def analyze(txt):
    """Scan *txt* and return a dict of detected feature positions.

    Keys: "contacts", "vlpfs", "headers", "segments", "newlines".
    Each best-effort extractor that fails degrades to an empty list so
    one bad detector cannot abort the whole analysis.

    :param txt: raw text to analyze
    :return: dict mapping feature name -> extractor result (list)
    """
    positions = {}

    positions["contacts"] = contactInfo(txt)
    # The original wrapped this plain dict assignment in try/except, but an
    # assignment cannot raise; vlpf() itself was (and remains) unprotected.
    positions["vlpfs"] = vlpf(txt)

    # NOTE: an earlier revision split vlpfs into a "top 20%" bucket
    # ("highVlpfs"); restore from history if that split is needed again.

    # Best-effort extraction: each helper may fail independently, so only
    # its own key falls back to [].  `except Exception` (not a bare
    # `except:`) lets KeyboardInterrupt/SystemExit propagate.
    for key, extractor in (("headers", headers),
                           ("segments", segments),
                           ("newlines", preserveNewLines)):
        try:
            positions[key] = extractor(txt)
        except Exception:
            positions[key] = []

    return positions
Exemplo n.º 2
0
def analyze(txt):
    """Scan *txt* and return the positions of each detected feature.

    Keys: "contacts", "vlpfs", "headers", "segments", "newlines".
    Failing extractors degrade to an empty list instead of aborting
    the whole analysis.

    :param txt: raw text to analyze
    :return: dict mapping feature name -> extractor result (list)
    """
    positions = {}

    positions["contacts"] = contactInfo(txt)
    # A dict assignment cannot raise, so the original try/except around it
    # was dead weight and has been removed; vlpf() stays unprotected.
    positions["vlpfs"] = vlpf(txt)

    # NOTE: an earlier revision split vlpfs into a "top 20%" bucket
    # ("highVlpfs"); restore from history if that split is needed again.

    # `except Exception` (not bare `except:`) keeps KeyboardInterrupt and
    # SystemExit propagating while shielding against extractor bugs.
    try:
        positions["headers"] = headers(txt)
    except Exception:
        positions["headers"] = []

    try:
        positions["segments"] = segments(txt)
    except Exception:
        positions["segments"] = []

    try:
        positions["newlines"] = preserveNewLines(txt)
    except Exception:
        positions["newlines"] = []

    return positions
Exemplo n.º 3
0
 def getVideoUrl(self):
     """Fetch the video URL for ``self.url``.

     Builds request headers from the target URL, delegates resolution to
     the private ``__getvideoUrl`` helper (which is expected to set
     ``self.success`` and ``self.videoUrl`` — confirm in the class body),
     then reports the outcome.

     :return: dict with ``success`` (parse status) and ``videoUrl``
     """
     self.headers = headers(self.url).buildHeader()
     self.__getvideoUrl()
     return {'success': self.success, 'videoUrl': self.videoUrl}
Exemplo n.º 4
0
 def __init__(self, wp):
     """Prepare the riven.market search-request state for weapon *wp*.

     :param wp: weapon identifier substituted into the query string
     """
     self.wp = wp
     self.url = 'http://riven.market/_modules/riven/showrivens.php?'
     self.header = {
         # Bug fix: content-type was misspelled 'application/jaon'.
         'content-type': 'application/json',
         'user-agent': headers()
     }
     # NOTE(review): `now` must be a numeric epoch timestamp here (it is
     # multiplied, not called) — confirm against the module that defines it.
     self.params = 'baseurl=aHR0cHM6Ly9yaXZlbi5tYXJrZXQv&platform=PC&limit=200&recency=-1&veiled=false&onlinefirst=true&polarity=all&rank=all&mastery=16&weapon=%s&stats=Any&neg=all&price=20000&rerolls=-1&sort=price&direction=ASC&page=1&time=%d' % (
         str(self.wp), int(1000 * now))
Exemplo n.º 5
0
def get_html_text(url):
    """Download *url* and return its decoded body, or ' ' on request failure."""
    ua = {'user-agent': headers.headers()}
    try:
        response = requests.get(url, headers=ua, timeout=30)
        response.raise_for_status()
    except RequestException:
        # Deliberate best-effort: callers get a blank page on any HTTP error.
        return ' '
    response.encoding = response.apparent_encoding  # utf-8
    return response.text
Exemplo n.º 6
0
def test_headers():
    """The parsed headers of the sample email must match the known values."""
    expected = {
        'date': 'Thu, 2 Jan 2014 21:37:44 +0000',
        'from': 'Thomas Levine <*****@*****.**>',
        'user-agent': 'Mutt/1.5.22 (2013-10-16)',
        'content-type': 'multipart/mixed; boundary="AqsLC8rIMeq19msA"'
    }
    actual = headers.headers(email)
    n.assert_dict_equal(actual, expected)
Exemplo n.º 7
0
def read_email(email: bytes) -> dict:
    'Compute all the statistics.'
    msg = message_from_bytes(email)

    # Header-derived features first.
    features = dict(headers(msg))

    # Then one feature per public callable in the stats module.
    for stat_name in dir(stats):
        if not stat_name.startswith('_'):
            features[stat_name] = getattr(stats, stat_name)(msg)

    return features
Exemplo n.º 8
0
def main():
    """Fetch each configured match group and append its rows to the CSV."""
    session = requests.session()

    with open(OUTPUT_FILENAME, 'w', newline='') as csvfile:
        fieldnames = ['sourceGuid'] + FIELD_NAMES + RELATIONSHIP_FIELD_NAMES
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for match_guid in MATCH_GUIDS:
            url = URL.format(user_guid=USER_GUID, match_guid=match_guid)
            response = session.get(url, headers=headers())
            payload = json.loads(response.text)
            if payload.get('matchGroups'):
                write_match_groups(writer, payload['matchGroups'], match_guid)
Exemplo n.º 9
0
def get_url_list():
    """POST the Taobao model search and return its ``searchDOList`` entries.

    :return: list of model records from the JSON response
    """
    data = {
        'viewFlag': 'A',
        'sortType': 'default',
        'searchStyle': '',
        'searchRegion': 'city:',
        'searchFansNum': '',
        'currentPage': '1',
        'pageSize': '100'
    }  # POST form parameters
    # Bug fix: content-type was misspelled 'application/jaon'.
    header = {'content-type': 'application/json', 'user-agent': headers()}
    url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
    r = requests.post(url, data=data, headers=header)
    # Renamed local from `json` to avoid shadowing the common module name.
    payload = loads(r.text)  # JSON text -> dict
    return payload['data']['searchDOList']
Exemplo n.º 10
0
def info(keywd: str) -> str:
    """Search Baidu for *keywd* and return a rebranded copy of the result HTML.

    Performs a GET against Baidu's search endpoint, then strips/replaces
    branding elements (logo, login link, hot list, search-by-image widget)
    via literal string substitution.  NOTE(review): these replacements
    match exact markup snapshots and silently do nothing if Baidu changes
    its HTML — confirm they still hit.

    :param keywd: search keyword (URL-interpolated, UTF-8)
    :return: modified HTML of the results page
    """
    from requests import get
    from headers import headers
    r = get(f'https://www.baidu.com/s?ie=UTF-8&wd={keywd}', headers=headers())
    r.encoding = 'utf-8'
    req = r.text
    # Remove the Baidu logo image tag.
    req = req.replace(
        '<img class="index-logo-src" src="" alt="到百度首页" title="到百度首页">', '')
    # Neutralize the login link.
    req = req.replace(
        '<a href="https://passport.baidu.com/v2/?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2F" name="tj_login" class="lb" onclick="return false;">登录</a>',
        '<a href="localhost">')
    # Rebrand every occurrence of the site name.
    req = req.replace('百度', '邢栋的搜索引擎')
    # Drop the search-by-image input widget.
    req = req.replace(
        '<span class="bg s_ipt_wr quickdelete-wrap"><span class="soutu-btn"></span><input id="kw" name="wd" class="s_ipt" value="hi" maxlength="255" autocomplete="off"><a href="javascript:;" id="quickdelete" title="清空" class="quickdelete" style="top: 0px; right: 0px; display: block;"></a><span class="soutu-hover-tip" style="display: none;">按图片搜索</span></span>',
        '')
    # Remove the (already rebranded) hot-list labels.
    req = req.replace('邢栋的搜索引擎热榜', '')
    req = req.replace('换一换', '')
    return req.replace('//www.baidu.com/img/flexible/logo/pc/result.png', '')
Exemplo n.º 11
0
def main():
    """Page through the user's match list and write every group to the CSV.

    Stops when the API reports zero matches, or when the retry budget
    (total or per-URL) is exhausted after repeated API errors.
    """
    session_requests = requests.session()

    with open(OUTPUT_FILENAME, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=FIELD_NAMES +
                                RELATIONSHIP_FIELD_NAMES)
        writer.writeheader()
        page_number = 0
        retries = 0
        retries_per_url = 0
        while True:
            print("page {}...".format(page_number + 1), end="")
            url = MATCHES_URL.format(user_guid=USER_GUID,
                                     page_number=page_number)
            result = session_requests.get(url, headers=headers())
            # Parse once per response; the original called json.loads on the
            # same text up to three times per iteration.  Invalid JSON still
            # propagates, exactly as before (only KeyError is caught below).
            payload = json.loads(result.text)
            try:
                if payload['matchCount'] == 0:
                    print("finished!")
                    break
                print("writing {} matches...".format(payload['matchCount']),
                      end="")
                write_match_groups(writer, payload['matchGroups'])
                page_number += 1
                retries_per_url = 0  # success resets the per-URL budget
                print("done")
            except KeyError:
                # Error payloads lack 'matchCount'; retry the same page.
                print("{}...".format(payload.get('error')), end="")
                if (retries >= MAX_TOTAL_RETRIES) or (retries_per_url >=
                                                      MAX_RETRIES_PER_URL):
                    print('too many errors, terminating')
                    break
                retries += 1
                retries_per_url += 1
                print('retrying')
Exemplo n.º 12
0
 def __init__(self):
     """Set up the private HTTP session, request headers, and Ctrip pipeline."""
     self.__session = requests.session()
     # NOTE(review): headers() presumably returns a ready header dict — confirm.
     self.__header = headers()
     self.__pipeline = Ctrip_pipe()
Exemplo n.º 13
0
 def __init__(self):
     """Set up the private HTTP session and request headers."""
     self.__session = requests.session()
     # NOTE(review): headers() presumably returns a ready header dict — confirm.
     self.__header = headers()
Exemplo n.º 14
0
import requests
import re
import os
from headers import headers
from multiprocessing import Pool,cpu_count
import time
import asyncio

# Epoch-seconds timestamp callable; direct binding instead of a lambda
# wrapper (PEP 8 E731) — now() behaves exactly as before.
now = time.time
# Shared request headers.  Bug fix: content-type was misspelled
# 'application/jaon'.
header = {'content-type': 'application/json', 'user-agent': headers()}
def main(page):
    """Download every wallpaper listed on *page* of the Abstract category.

    Scrapes the download links from the listing page and writes each image
    to D:\\desktop\\wallpaper\\<page>\\<name>.jpg.

    :param page: page number of the category listing to fetch
    """
    url = 'https://wall.alphacoders.com/by_category.php?'
    params = {'id': 1, 'name': 'Abstract Wallpapers', 'page': page}
    data = {'view': 'paged', 'min_resolution': 0x0, 'resolution_equals': '>=', 'sort': 'rating'}
    response = requests.post(url, params=params, data=data, headers=header)
    image_url_list = re.findall(r'download-button" data-href="(.*?)">', response.text)
    # Create the per-page folder once, before the loop (the original
    # re-checked it for every image).  Raw strings keep the same byte
    # values while avoiding invalid-escape warnings for '\d'/'\w'.
    folder = r'D:\desktop\wallpaper\%d' % page
    if not os.path.exists(folder):
        os.mkdir(folder)
    for image_url in image_url_list:
        image = requests.get(image_url, headers=header)
        file_name = re.findall(r'wallpaper/(.*?)/images', image_url)[0]
        with open(r'D:\desktop\wallpaper\%d\%s.jpg' % (page, file_name), 'wb') as file:
            file.write(image.content)
        print('图片:%s 下载完毕' % file_name)  # image downloaded
    print('第%d页下载完毕' % page)  # page finished

if __name__ == "__main__":
    pool = Pool(cpu_count())
    start = now()
    pool.apply_async(main, (1,))
    # Bug fix: without close()+join() the main process can exit before the
    # async task has run, so nothing gets downloaded.
    pool.close()
    pool.join()