def scrape(forums, path):
    # check to see if the directories in the path string exist
    if not os.path.exists(path):
        #if not, create them
        os.makedirs(path)
    
    for forumID in forums:
    
        r = session.get(baseurl + 'mod/forum/view.php?id=' + forumID)
        
        if(r.status_code == 200):
            soup = BeautifulSoup(r.text, 'lxml')
        
            posts = soup.find_all('td', class_='topic starter')
        
            # loop through the posts on a forum page 
            # visit each post and save the thread as a .html file
            print "scraping posts from forumID " + forumID
            
            for post in posts:
                r = session.get(post.a.get('href'))
                if(r.status_code == 200):
                    soup = BeautifulSoup(r.text, 'lxml')
                    output_file = path + str(soup.find("input", attrs={"name": "d"}).get('value')) + ".html"
                    print "page saved at " + output_file
                    with open(output_file, 'w') as f:
                        f.write(str(soup))
    
        else:
            print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
            sys.exit()
Example #2
def makeRequest(host, user, password, verbose):
    """Función que realiza las peticiones al servidor especificado. Recibe host, user, password y verbose. Si la opcion verbose es True, se
    muestra lo que se está realizando en cada paso. Se pregunta dentro de la función por las opciones --tor y --agente
    En caso de que --tor sea True se realiza la petición a través de TOR mediante los proxies y session()
    En caso de que --agente sea True se cambia el agente de usuario de Python por el de HotJava con los headers"""
    try:
        header = {}  # Dictionary that selects the user agent; empty means the original (Python) agent
        if opts.agente:
            header['User-agent'] = 'HotJava/1.1.2 FCS'  # Value added to the dictionary to switch the agent
        if opts.tor:
            session.proxies = {}
            session.proxies['http'] = 'socks5h://localhost:9050'  # http and https proxies so the request is routed through TOR
            session.proxies['https'] = 'socks5h://localhost:9050'
            response = session.get(host, auth=(user, password), headers=header)
        else:
            response = session.get(host, auth=(user, password), headers=header)
        if verbose:
            ip = session.get('http://httpbin.org/ip', headers=header)  # IP the request originates from
            ip = str(ip.text).replace('\"origin\":', '').replace('{', '').replace('}', '').replace('\n', '')
            print 'The request is made from:%s' % (ip)
            agente = session.get('https://httpbin.org/user-agent', headers=header)  # User agent the request is made with
            agente = str(agente.text).replace('\"user-agent\":', '').replace('{', '').replace('}', '').replace('\n', '')
            print 'The user agent is:%s' % (agente)
            print 'Trying to log in with user \'%s\' and password \'%s\'' % (user, password)
            print 'The response obtained is: %s, therefore:' % (response)
        if response.status_code == 200:
            print 'CREDENTIALS FOUND!: %s\t%s' % (user, password)
            if opts.reporte is not None:
                reportResults('User: \'%s\'\t\tPassword: \'%s\'\n' % (user, password), opts.reporte)
        else:
            print 'DID NOT WORK :c '
    except ConnectionError:
        printError('Connection error; the server may be down.', True)
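A minimal standalone sketch of the TOR routing described above, assuming a local Tor SOCKS proxy on port 9050 and the requests[socks] extra installed (the httpbin URL mirrors the example):

import requests

session = requests.Session()
# socks5h:// (rather than socks5://) so DNS resolution also happens inside Tor
session.proxies = {
    'http': 'socks5h://localhost:9050',
    'https': 'socks5h://localhost:9050',
}
# httpbin echoes the originating IP, which should now be a Tor exit node
print(session.get('http://httpbin.org/ip').text)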
Example #3
File: erepublik.py  Project: SKYnv/pollirio
def scrape(resource, **args):
    session = login()

    if resource == 'uid':
        # we effectively only need the first user, so don't scrape all pages
        search = session.get(
            'http://www.erepublik.com/en/main/search/%s/' %
                args['query'].replace(' ', '_')
        )
        doc = fromstring(search.text)
        uid = doc.xpath('//div[@class="nameholder"]/a/@href')[0].split('/')[-1].strip()
        return uid
    elif resource == 'citizen.profile':
        profile = session.get(
            'http://www.erepublik.com/en/citizen/profile/%s' % args['citizenId']
        )
        doc = fromstring(profile.text)

        citizen_state = doc.xpath('//div[@class="citizen_state"]/div[@class="is"]/span/img/@src')
        is_dead = citizen_state and 'dead_citizen' in citizen_state[0]

        profile = {
            'general': {
                'avatar': doc.xpath('//img[@class="citizen_avatar"]/@style')[0].split('(')[1].split(')')[0],
                'level': doc.xpath('//*[@class="citizen_level"]')[0].text,
                'experience_points': doc.xpath('//*[@class="citizen_experience"]/div/p')[0].text.split(' / ')[0].replace(',', ''),
                'name': doc.xpath('//*[@class="citizen_profile_header auth"]/h2')[0].text_content().strip(),
                'is_alive': str(int(not is_dead)),
                'birthDay': doc.xpath('//div[@class="citizen_second"]/p')[1].text.strip(),
                'nationalRank': doc.xpath('//div[@class="citizen_second"]/small/strong')[0].text,

            },
            'location': {
                'citizenship_country_initials': doc.xpath('//div[contains(@class, "citizen_info")]/a/@href')[2].split('/')[-1],
                'residence_country_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[0],
                'residence_region_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[1],
            },
            'party': {
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/span/a')[0].text.strip(),
            },
            'militaryUnit': {
                'id': doc.xpath('//div[@class="citizen_activity"]/div/div/a/@href')[0].split('/')[-1],
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/a/span')[0].text.strip(),
            },
            'militaryAttributes': {
                'strength': doc.xpath('//div[@class="citizen_military"]/h4')[0].text.replace(',', '').strip(),
                'rank_points': doc.xpath('//div[@class="stat"]/small/strong')[1].text.split(' / ')[0].replace(',', ''),
            },
        }
        return profile
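A hypothetical call sequence for the two resources this function handles (resource names and keyword arguments are taken from the code above; the query string is a placeholder):

uid = scrape('uid', query='Citizen Name')
profile = scrape('citizen.profile', citizenId=uid)
print(profile['general']['name'], profile['location']['residence_country_name'])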
Example #4
def login(session):

    try:

        login = {
            'j_username': AGILE_CREDENTIALS['username'],
            'j_password': AGILE_CREDENTIALS['password']
        }

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Content-Length': '1781',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': AGILE_HOST_NAME,
            'Referer': AGILE_PROD_URL,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/57.0'
        }

        timeout = 5  # seconds

        session.post(AGILE_LOGIN_URL,
                     data=login,
                     headers=headers,
                     timeout=timeout)

        # open agile window
        session.get(AGILE_PROD_URL)

    except requests.exceptions.Timeout:
        return "no agile"

    except Exception:
        return "no agile"
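A hypothetical call site, assuming the AGILE_* constants are configured elsewhere:

import requests

session = requests.Session()
if login(session) == "no agile":
    print("Agile login failed or timed out")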
Example #5
def downloadResource(session, res, path):
    try:
        src = res.a['href']
    except TypeError:
        return
    r = session.get(src)
    if(r.status_code == 200):
        headers = r.headers.keys()
        if ('content-disposition' in headers):
            #got a direct file link
            name = r.headers['content-disposition'].decode('utf-8').split(';')[1].split('=')[1].strip('"')
        else:
            #got a preview page
            soup = BeautifulSoup(r.text, 'html.parser')
            if ('content-type' in headers) and ('content-script-type' in headers) and ('content-style-type' in headers):
                #it's most obviously a website, which displays a download link
                src = soup.find(class_='region-content').a['href']
            else:
                #it's obviously an ugly frameset site
                src = soup.find_all('frame')[1]['src']
            name = os.path.basename(src)
        name = urllib.url2pathname(name.encode('utf-8'))
        saveFile(session, src, path, name)
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #6
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + u'/'
    path = root + sem.replace('/', '-') + u'/' + name
    path = urllib.url2pathname(path.encode('utf-8')).replace(':', '-').replace('"', '')
    if not os.path.exists(path):
        os.makedirs(path)
    print '       +--' + colors.BOLD + name + colors.ENDC
    r = session.get(c['url'])
    if(r.status_code == 200):
        soup = BeautifulSoup(r.text, 'html.parser')
        if not os.path.exists(path + '.dump'):
            os.makedirs(path + '.dump')

        dst = path + '.dump/' + c['key'].replace('/', '-').encode('utf-8') + '-' + c['type'] + '-' + str(datetime.date.today()) + '-full.html'
        dst = dst.replace(':', '-').replace('"', '')
        
        with open(dst, 'wb') as f:
            f.write(soup.encode('utf-8'))
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #7
def get_captcha():
    # The captcha URL is keyed by a millisecond timestamp
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=%d&type=login&lang=cn' % (
        int(time.time() * 1000))
    response = session.get(captcha_url, headers=headers)
    # save the captcha image to the current directory
    with open('captcha.gif', 'wb') as f:
        f.write(response.content)

    # automatically open the captcha we just fetched
    from PIL import Image
    try:
        img = Image.open('captcha.gif')
        img.show()
        img.close()
    except:
        pass

    captcha = {
        'img_size': [200, 44],
        'input_points': [],
    }
    points = [[22.796875, 22], [42.796875, 22], [63.796875, 21],
              [84.796875, 20], [107.796875, 20], [129.796875, 22],
              [150.796875, 22]]
    print('The coordinates of the 7 characters are:', points, '\n')
    seq = input('Enter the positions of the upside-down characters\n>')
    for i in seq:
        captcha['input_points'].append(points[int(i) - 1])
    return json.dumps(captcha)
Example #8
def is_website_up(website, session):
    try:
        response = session.get(website, timeout=5)
        return (response.status_code == 200)
    except Timeout:
        print(bad + website + " is down")
        return False
Example #9
    def connectieopen(self, link):
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            'Accept-Encoding': 'utf-8'
        }

        self.total_req += 1
        slapen = randint(1, 2)

        time.sleep(slapen)

        print(self.total_req)
        try:
            session = requests.Session()

            req = session.get(link, headers=headers)
            content = req.content
            bsObj = BeautifulSoup(content.decode('utf-8', 'ignore'),
                                  "html.parser")
        except HTTPError as fout:
            print(fout, link)
        except URLError as fout:
            print("cant find url")
        except ConnectionError as e:
            print(e)
            print(link)
        else:
            return bsObj
Example #10
def exchange():
    URL = 'https://pro-api.coinmarketcap.com/v1/tools/price-conversion'
    API_KEY = 'a8cf5d78-8dac-4325-838b-2ebe27f2f644'

    parameters = {
        'amount': '1',
        'symbol': 'EUR',
        'convert': 'BTC'
    }

    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': API_KEY,
    }

    #response = requests.get(URL.format(request.values.get('from_quantity'), request.values.get('from_currency'), request.values.get('to_currency'), API_KEY))
    session = requests.Session()
    session.headers.update(headers)

    try:
        response = session.get(URL, params=parameters)
        data = json.loads(response.text)
        print(data)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
Example #11
def startTweet():
    increment()   
    session = requests.session()
    result = session.get('https://twitter.com/login')
    tree = html.fromstring(result.text)
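    # pull Twitter's CSRF token out of the login form's hidden input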
    authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
    print ('auth token is: ' + authenticity_token)
    payload = {
        'authenticity_token' : authenticity_token,
        'redirect_after_login' : '',
        'remember_me' : '1',
        'scribe_log': '',
        'session[password]': AutomatedPosting.pass_entry.get(),
        'session[username_or_email]' : AutomatedPosting.user_entry.get(),
        'ui_metrics':''
    }   
    params ={
        'authenticity_token' : authenticity_token,
        'batch_mode' : 'off',
        'is_permalink_page' :'false',
        'place_id' : '',
        'status' : AutomatedPosting.msg_entry.get() + " " + str(COUNT),
        'tagged_users':''
    }
    result = session.post('https://twitter.com/sessions', data=payload)
    cookies = result.cookies
    result = session.post('https://twitter.com/i/tweet/create', cookies=cookies, params=params, headers=dict(referer='https://www.twitter.com/'))
    print(result.text)
Example #12
def login_sso():
    print("starting sso log in ...")
    login_page = session.get(login_sso_url)
    print(login_page.text)
    login_page.encoding = 'utf8'
    root = etree.HTML(login_page.content)
    if "You have successfully logged in" in login_page.text or "退出登录" in login_page.text:
        print("sso already logged in")
        return True
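    # not logged in yet: collect the hidden CAS-style fields (lt, execution, _eventId) the server expects back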
    form = root.xpath('//div[@class="clearfix login_btncont"]')[0]
    lt = root.xpath('//input[@name="lt"]/@value')[0]
    execution = form.xpath('//input[@name="execution"]/@value')[0]
    _eventId = form.xpath('//input[@name="_eventId"]/@value')[0]
    login_result = session.post(login_sso_url,
                                data={
                                    'submit': '登录',
                                    'username': username,
                                    'password': password,
                                    'code': '',
                                    'lt': lt,
                                    'execution': execution,
                                    "_eventId": _eventId
                                })
    result = 'You have successfully logged in' in login_result.text
    print("sso log in done, result={0}".format(result))
    return result
Example #14
def get_auth_token():
    global auth_token
    print("getting auth token...")
    redirect_1 = \
        session.get(token_url,
                    allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_1))
    redirect_2 = session.get(url=redirect_1,
                             headers=default_headers,
                             allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_2))
    redirect_3 = session.get(url=redirect_2,
                             headers=default_headers,
                             allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_3))
    auth_token = redirect_3.split('=')[1]
    print("token: {0}".format(auth_token))
Example #15
    def getPlayerStatus(self, session, id):
        response = session.get(
            "http://live.nicovideo.jp/api/getplayerstatus?v=" + id)
        #print (response.text)
        status = ElementTree.fromstring(response.content)
        if status.attrib["status"] != "ok":
            print "getplayerstatus failed"
        else:
            return status
Example #16
def get_xsrf():
    response = session.get('https://www.zhihu.com', headers=headers)
    _xsrf_regex = '[\s\S]*<input type="hidden" name="_xsrf" value="(.*?)"/>'
    res_xsrf = re.match(_xsrf_regex, response.text)
    if res_xsrf:
        _xsrf = res_xsrf.group(1)
    else:
        _xsrf = ''
    return _xsrf
Example #17
    def _downloadTorrent_TYT(self, torrentURL, session, torrentDownloadDir):
        logging.debug(' Downloading torrent from TYT url:"%s".' % torrentURL)

        torrentFile = session.get(torrentURL)
        torrentId = re.sub('http://tenyardtracker.com/download\.php\?torrent=', '', torrentURL)
        with open(torrentDownloadDir + '/tenyardtracker_%s.torrent' % torrentId, 'wb') as f:
            for chunk in torrentFile.iter_content(1024):
                f.write(chunk)

Example #18
def drupal_version(website, session):
    response = session.get(website)
    try:
        version = response.headers["X-Generator"].replace(
            " (https://www.drupal.org)", "")
        print(good + "Drupal Version: " + version)
        return version
    except KeyError:
        print(bad + website + " does not appear to be running Drupal")
        return None
Example #19
def is_website_up(website, session):
    try:
        response = session.get(website, timeout=5)
        return (response.status_code == 200)
    except Timeout:
        return False
    except RequestException as error:
        print(bad + website + " RequestException: ")
        print(bad + str(error))
        return False
Example #20
def drupal_version(website, session):
    response = session.get(website)
    try:
        version = response.headers["X-Generator"].replace(
            " (https://www.drupal.org)", "")
        print(good + "Drupal Version: " + version)
        return version
    except KeyError:
        print(bad + "Drupal Version: Not Detected")
        return None
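A hypothetical driver tying the two helpers above together (target, good and bad are assumed to be defined elsewhere in the tool):

import requests

session = requests.Session()
target = 'https://example.com/'
if is_website_up(target, session):
    drupal_version(target, session)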
Example #21
def CanvasLogin(session):
    res = session.get(MainUrl)

    #print 'Return status:', res.status_code
    assert res.status_code == 200  # code = OK
    #print res.text

    PayloadLogin['authenticity_token'] = getAuthToken(res.text, 0)
    updateUserInfoForPayloadLogin()
    session.post(LoginPostUrl, data=PayloadLogin)
Example #22
    def gdown(self, url, path, enable=False):
        baseurl = 'https://docs.google.com/uc?export=download'
        fileid = url.split('id=')[1]
        params = {'id': fileid}
        session = requests.session()
        response = session.get(baseurl, params=params, stream=True)
        tokens = self.get_confirm_token(response)
        filename = self.get_gdrive_name(response)
        filesize = self.get_content_len(response)

        if tokens:
            params.update(dict(confirm=tokens))
        path = os.path.join(path, filename)
        if not os.path.exists(path) or enable:
            response = session.get(baseurl, params=params, stream=True)
            self.download(response, path)
            # if os.path.exists(path):
            #   print('success')
        return filesize
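The helper methods are not shown here; a plausible get_confirm_token, based on the usual Google Drive download-warning cookie handshake (an assumption, not necessarily this project's implementation):

def get_confirm_token(response):
    # Drive sets a download_warning_* cookie on files too large to virus-scan;
    # its value must be sent back as the 'confirm' query parameter
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None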
Example #23
def test_is_login():
    try:
        session.cookies.load()
    except Exception:
        print('failed to load cookies')
    response = session.get('https://www.zhihu.com/inbox',
                           headers=headers,
                           allow_redirects=False)
    with open('sixin.html', 'wb') as f:
        f.write(response.text.encode('utf-8'))
        print('ok')
Example #24
def check_if_file_created(target, file_name, session):
    url = target + file_name
    print(good + 'Checking... ' + url)
    file_response = session.get(url)
    if file_response.status_code != 404:
        print(good + 'File successfully created:')
        print(' ' * 4 + file_response.text.strip())
        cleanup_file(target, file_name, session)
        return True
    else:
        print(bad + target + ' File creation unsuccessful')
        return False
Example #25
    def get_tweets_json(self,
                        q,
                        since='',
                        until='',
                        f='tweets',
                        username='',
                        lang='',
                        cursor='',
                        session=None):
        url = prepare_data(q, since, until, f, username, lang, cursor)
        self.headers['Referer'] = url
        return session.get(url, headers=self.headers).json()
Example #26
def getXLSofRounds(session, roundsIds, baseApiUrl, cookies, resultsFolder):
    for roundsId in roundsIds:
        apiUrl = baseApiUrl + '/' + roundsId['type'] + '/rounds/' + str(
            roundsId['id']) + '/report/results/excel'
        print(apiUrl)
        resp = session.get(apiUrl, cookies=cookies, stream=True)
        resp.raw.decode_content = True

        if resp.status_code == 200:
            with open(resultsFolder + '/' + roundsId['name'] + '.xlsx',
                      'wb') as output:
                shutil.copyfileobj(resp.raw, output)
Example #27
def getCoursesForSem(session, s):
    r = session.get(conf['baseurl'] + 'index.php?role=0&cat=1&csem=1&sem=' + s)
    if(r.status_code == 200):
        soup = BeautifulSoup(r.text)
        courses = list()
        for o in soup.find_all('h3'):
            c = getInfo(o.contents[0])
            courses.append(c)
        return courses
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #28
def getCoursesForSem(session, s):
    r = session.get(baseurl + 'index.php?role=0&cat=1&csem=0&sem=' + s)
    if (r.status_code == 200):
        soup = BeautifulSoup(r.text)
        courses = list()
        for o in soup.find_all('h3'):
            c = getInfo(o.contents[0])
            courses.append(c)
        return courses
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #29
def get_login_cookies(session, baseUrl):
    """
        Get PHPSESSID cookie to use the API
    """

    print("start login")
    loginPage = session.get(baseUrl + '/login')
    soup = BeautifulSoup(loginPage.text, "html.parser")
    csrf = soup.find("input", type="hidden")
    print(csrf)
    # Get user and password from config.ini file
    credentials = config('loginCredentials')

    payload = {
        '_username': credentials['user'],
        '_password': credentials['password'],
        '_csrf_token': csrf['value'],
    }

    # Fake browser header
    headers = {
        'user-agent':
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36"
    }
    loginCheck = session.post(baseUrl + '/login_check',
                              data=payload,
                              headers=headers,
                              cookies=loginPage.cookies)
    # Response ok or not?
    if loginCheck.status_code == 200:
        securePage = session.get(
            'https://amsterdam.apptimizeplatform.nl/api/inspectionround/area/rounds/planning',
            headers=headers)
        # Check for name in html page which is only visible after login
        if securePage.status_code == 200:
            print("login succeeded!")
            return loginCheck.cookies
    if loginCheck.status_code == 401 or loginCheck.status_code == 403:
        print('login failed!')
Example #30
def saveFile(session, src, path, name):
    global files
    files.next()
    dst = path + name
    try:
        with open(dst):
            print u'|  |  +--{:<50s}'.format(name) + u'['+colors.OKBLUE+'skipped'+colors.ENDC+']'
            pass
    except IOError:
        with open(dst, 'wb') as handle:
            print u'|  |  +--{:<50s}'.format(name) + u'['+colors.OKGREEN+'downloading'+colors.ENDC+']'
            r = session.get(src, stream=True)
            for block in r.iter_content(1024):
                if not block:
                    break
                handle.write(block)
Example #31
def detour():
    randlist = [
        'aboleth', 'acolyte', 'adult-black-dragon', 'adult-blue-dragon',
        'adult-brass-dragon', 'adult-bronze-dragon', 'adult-copper-dragon',
        'adult-gold-dragon', 'adult-green-dragon', 'adult-red-dragon',
        'adult-silver-dragon', 'adult-white-dragon', 'air-elemental',
        'allosaurus', 'ancient-black-dragon', 'ancient-blue-dragon',
        'ancient-brass-dragon', 'ancient-bronze-dragon',
        'ancient-copper-dragon', 'ancient-gold-dragon', 'ancient-green-dragon',
        'ancient-red-dragon', 'ancient-silver-dragon', 'ancient-white-dragon',
        'androsphinx'
    ]
    randstring = randlist[random.randint(0, len(randlist) - 1)]
    url = request_url + randstring
    with requests.Session() as session:
        raw = session.get(url, headers=headers)
        sleep(random.randint(20, 50))
Example #32
def saveFile(session, src, path, name):
    global files
    files.next()
    dst = path + name.decode('utf-8')

    try:
        with open(dst):
            print '[' + colors.OKBLUE + 'skip' + colors.ENDC + '] |  |  +--%s' % name
            pass
    except IOError:
        with open(dst, 'wb') as handle:
            print '[' + colors.OKGREEN + 'save' + colors.ENDC + '] |  |  +--%s' % name
            r = session.get(src, stream=True)
            for block in r.iter_content(1024):
                if not block:
                    break
                handle.write(block)
Example #34
def downloadResource(session, res, path):
    src = res.a['href']
    r = session.get(src)
    if(r.status_code == 200):
        headers = r.headers.keys()
        if ('content-disposition' in headers):
            name = r.headers['content-disposition'].decode('utf-8').split(';')[1].split('=')[1].strip('"')
        else:
            soup = BeautifulSoup(r.text)
            if ('content-type' in headers) and ('content-script-type' in headers) and ('content-style-type' in headers):
                src = soup.find(class_='region-content').a['href']
            else:
                src = soup.find_all('frame')[1]['src']
            name = os.path.basename(src).replace('%20', ' ').replace('%28', '(').replace('%29', ')')
        saveFile(session, src, path, name)
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #35
File: analysis.py  Project: zonezoen/scrapy
    def get_area_from_net(self):
        session = requests.session()
        session.headers.update({
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
        })
        response = session.get(baseUrl)
        soup = BeautifulSoup(response.text, "html5lib")
        dl = soup.find_all("dl", attrs={"id": "rentid_D04_01"})  # the dl tag holding each district's URL
        my_as = dl[0].find_all("a")  # every a tag inside that dl
        for my_a in my_as:
            if my_a.text == "不限" or "周边" in my_a.text:  # skip the "no limit" and "surrounding area" entries
                continue
            # print(my_a["href"])
            # print(my_a.text)
            self.areaList.append(my_a.text)
Example #36
def getListofRounds(session, baseApiUrl, cookies):
    roundsIds = []
    apiUrls = ['area', 'object']

    for apiUrl in apiUrls:

        rounds = session.get(baseApiUrl + '/' + apiUrl + '/rounds/planning',
                             cookies=cookies)
        roundsJson = rounds.json()

        roundsItems = [{
            'id': r['id'],
            'name': r['name'].replace('/', '-'),
            'type': apiUrl
        } for r in roundsJson]
        roundsIds.extend(roundsItems)
    print(roundsIds)
    return roundsIds
Example #37
def get_orders_detail(account):
    payload = f"""SrcAccount={account['SrcAccount']}&PlaceId={account['SrcAccount_placeId']}&AccountKey={account['AccountKey']}"""

    url = f"""{url_trade}/GetOrdersDetail?{payload}"""

    result = []

    try:
        r = session.get(url, headers=headers)
    except Exception:
        return result

    soup = BeautifulSoup(r.content)

    data_table = soup.find_all(class_="data-table")
    if len(data_table) > 0:
        result = table_to_list(data_table[0])

    return result
Example #38
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + '/'
    path = root + sem.replace('/', '-') + '/' + name
    #TODO: secure pathnames
    if not os.path.exists(path):
        os.makedirs(path)
    print '+--' + name
    r = session.get(c['url'])
    if(r.status_code == 200):
        soup = BeautifulSoup(r.text)
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #39
def saveLink(session, url, path, name):
    global files
    files.next()
    fname = name.encode('utf-8').replace('/', '') + '.html'
    dst = path.encode('utf-8') + fname
    try:
        with open(dst):
            print '['+colors.OKBLUE+'skip'+colors.ENDC+'] |  |  +--%s' %name
            pass
    except IOError:
        with open(dst, 'wb') as handle:
            print '['+colors.OKGREEN+'save'+colors.ENDC+'] |  |  +--%s' %name
            r = session.get(url)
            soup = BeautifulSoup(r.text)
            link = soup.find(class_='region-content').a['href']
            try:
                handle.write(u'<a href="' + link.decode('utf-8') + u'">' + name.decode('utf-8') + u'</a>')
            except UnicodeEncodeError:
                os.remove(dst)
                print '['+colors.FAIL+'fail'+colors.ENDC+'] |  |  +--%s' %name
                pass
Example #40
def scrape(link_set, path):
    # check to see if the directories in the path string exist
    if not os.path.exists(path):
        #if not, create them
        os.makedirs(path)
    
    for link in link_set:
    
        r = session.get(link)
        
        if(r.status_code == 200):
            soup = BeautifulSoup(r.text, 'lxml')

            # extract the user id 
            # useful for naming the html and png files we will scrape
            user_id = re.search(r"(?<=user\/view\.php\?id=)\d*", link).group()

            # save the profile page as a .html file 
            #output_file = path + str(user_id) + ".html"
            #with open(output_file, 'w') as f:
            #    f.write(str(soup))
            #f.close()
            #print "page saved at " + output_file

            image_link = soup.find_all('img', class_='userpicture')[1].get('src')

            # save the profile image as a .png file
            output_file = path + str(user_id) + ".png"
            response = requests.get(image_link, stream=True)
            with open(output_file, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            del response
            print "image saved at " + output_file
    
        else:
            print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
            sys.exit()