Example #1
import re

from bs4 import BeautifulSoup

# downloader, Post, and Thread are project-specific and assumed to be
# importable from elsewhere in the codebase.

def getThread(thread, board=None):
  if thread is None:
    raise Exception("getThread must be passed a threadId or URL.")
  # Check whether the argument is already a URL
  result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
  if result is not None:
    url = thread
  # If not, generate the URL
  else:
    if board is not None:
      # Remove surrounding slashes, if any
      board = board.strip("/")
      url = "http://boards.4chan.org/%s/res/%s" % (board, thread)
    else:
      raise Exception("If getThread is given a threadId, "
                      "then a board name must also be given.")
  # Now we have a URL to download the thread
  pageData = downloader.downloadUrl(url)
  # Make some soup; an explicit parser avoids BeautifulSoup's warning
  soup = BeautifulSoup(pageData, "html.parser")
  posts = soup.find_all("div", "post")
  posts = [Post(p) for p in posts]  # Turn them into Post objects
  return Thread(posts)
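The downloader module used above isn't shown. A minimal sketch of its downloadUrl helper, assuming it simply fetches a URL and returns the raw response body, could be:

import urllib.request

def downloadUrl(url):
    # Hypothetical stand-in for the downloader module used above:
    # fetch the URL and return the raw response body as bytes.
    with urllib.request.urlopen(url) as response:
        return response.read()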
Example #2
    def __process_url(self):
        while True:
            url = ""

            # Pop the next URL from the shared queue under its lock.
            with self.__urlList_lock:
                if len(self.__urlList) > 0:
                    url = self.__urlList[0]
                    self.__urlList = self.__urlList[1:]
                else:
                    if self.__linksDone:
                        break

            if url == "":
                # Queue is empty but more links may still arrive.
                time.sleep(0.1)
                continue

            if self.__trace:
                with self.__general_lock:
                    print("Parsing", url)

            content_type, data = downloadUrl(url)

            if self.__trace:
                with self.__general_lock:
                    print("Content-Type = \"%s\", len = %d" % (content_type, len(data)))

            if content_type == "null":
                continue

            if content_type in FileDownloader.APP_EXT:
                with self.__urls_lock:
                    self.__urls.append(url)
                continue

            if "text" in content_type:
                try:
                    data = data.decode('utf-8')
                except UnicodeDecodeError:
                    continue

                url_regexp = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
                matches = re.findall(url_regexp, data)

                for link in matches:
                    for ext in FileDownloader.DATA_EXT:
                        if link.endswith(ext):
                            if self.__trace:
                                with self.__general_lock:
                                    print("Found \"%s\" file by extension \"%s\"" % (link, ext))
                            # Record the link and stop checking further
                            # extensions, whether or not tracing is on.
                            with self.__urls_lock:
                                self.__urls.append(link)
                            break

                if self.__trace:
                    with self.__general_lock:
                        print("Finished", url)
Example #3
import json
import re

def getThread(thread, board=None):
  if thread is None:
    raise Exception("getThread must be passed a threadId or URL.")
  # Check whether the argument is already a URL
  result = re.search(r"boards\.4chan\.org/(.+)/res/(.+)", thread)
  if result is not None:
    # Request the JSON representation of the thread
    if not thread.endswith(".json"):
      thread += ".json"
    url = thread
  # If not, generate the URL
  else:
    if board is not None:
      url = "https://boards.4chan.org/%s/res/%s.json" % (board, thread)
    else:
      raise Exception("If getThread is given a threadId, "
                      "then a board name must also be given.")
  # Now we have a URL to download the thread
  pageData = downloader.downloadUrl(url)
  return json.loads(pageData)
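A brief usage sketch (the board and thread ID here are made up; this assumes 4chan's thread JSON carries a top-level "posts" list):

thread = getThread("12345678", board="g")
for post in thread["posts"]:
    print(post["no"])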
Example #4
    def __fetch_page(self, page):
        # Use the first-page template for page 0, the pagination
        # template otherwise.
        searchPattern = GoogleSearch.SEARCH_URL
        if page > 0:
            searchPattern = GoogleSearch.NEXT_PAGE

        # Fill in the URL template with the query parameters.
        searchUrl = searchPattern % {
            'query': urllib.parse.quote_plus(self.query),
            'start': page * self.num,
            'tld': self.tld,
            'lang': self.lang,
        }

        content_type, data = downloadUrl(searchUrl)
        if "text" in content_type:
            data = data.decode('utf-8')
        return data
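GoogleSearch.SEARCH_URL and GoogleSearch.NEXT_PAGE aren't shown. Given the named placeholders filled in above, they presumably look something like the following (illustrative values only, not the project's actual constants):

class GoogleSearch:
    # Hypothetical URL templates; the real constants likely include
    # additional query parameters.
    SEARCH_URL = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s"
    NEXT_PAGE = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"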
Example #5
import io

def readRemoteFile(url):
    content_type, remoteFile = downloadUrl(url)

    res = None

    if 'text' in content_type:
        res = readTxtFile(io.StringIO(remoteFile.decode('utf-8')))
    elif 'application/pdf' in content_type:
        res = readPdfFile(io.BytesIO(remoteFile))
    if res is None:
        return {'status': 'error'}

    text = res['data']
    keywords = extractPlainKeywords(text)
    if keywords is not None:
        res['keywords'] = keywords
    keyword_expr = extractPlainKeywordsExpressions(text)
    if keyword_expr is not None:
        res['keywords_expressions'] = keyword_expr
    emails = extractEmail(text)
    if emails is not None:
        res['emails'] = emails
    return res
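The readTxtFile, readPdfFile, and extract* helpers are project-specific. As an illustration of the contract readRemoteFile expects (a list of matches, or None when nothing is found), a minimal extractEmail might be:

import re

def extractEmail(text):
    # Hypothetical helper: return the e-mail addresses found in the
    # text, or None when there are none.
    matches = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)
    return matches or None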