Пример #1
0
def downloadFromList(alist, directory=".", timeout=10):
    """Get files from a list of urls.
  return : list, contained the failure fetch"""
    failure = []
    for url in alist:
        print alist.index(url),
        stream = getStream(url, timeout=timeout)
        file_name = getFilenameFromURL(url)
        if not stream or not writeBinFile(stream, file_name, directory):
            failure.append(url)
    return failure
Пример #2
0
def downloadFromQueue(queue, failure, directory='.', timeout=10):
  """Get files from a list of urls.
  return : list, contained the failure fetch"""
  while not queue.empty():
    url = queue.get()
    stream = getStream(url, timeout=timeout)
    file_name = getFilenameFromURL(url)
    if stream and writeBinFile(stream, file_name, directory):
      queue.task_done()
      print "Fetching", url, 'done.'
      continue
    failure.append(url)
    queue.task_done()
  return failure
Пример #3
0
def main():
  # 开始准备
  prepare()
  while_n = 0 # 循环计数器
  imglist = []
  makedir(Config.directory)
  print 'Generate search url'
  URL = baseURL()
  # 下载 #############
  # 获取搜索结果数量并与_count比较取其较小值
  count = min(searchResult(URL), Config.count)
  # 没有搜索结果时退出
  if not count:
    print "No search result at current condition."
    sys.exit(1)
  # 获得指定数量的url, 存放于list  
  print 'Fetching page',
  while len(imglist) < count:
    print while_n,
    while_n += 1
    tmplist = getImageUrlList(URL)
    imglist = imglist + tmplist
    URL = nextPage(URL, len(tmplist))
  print '' # 换行
  count = len(imglist)
  print "There're %d files to download" % count
  # 将已有文件从imglist中去除
  imglist = [url for url in imglist
             if not getFilenameFromURL(url) in os.listdir(Config.directory)]
  print "There's %d files already downloaded." % (count - len(imglist))
  # 下载该list 
  print 'Fetching list of %d files' % len(imglist)
  queue = Queue()
  for url in imglist:
    queue.put(url)
  failure = []
  for i in range(Config.thread_count):
    start_new_thread(downloadFromQueue, (
                                         queue, failure, Config.directory, Config.timeout))
  queue.join()
  print "%d failed to fetch." % len(failure)
Пример #4
0
  # Use the smaller of the number of search results and the requested count
  count = min(searchResult(searchURL), count)
  # Exit when there are no search results
  if not count:
    print "No search result at current condition."
    sys.exit(1)
  # Collect the requested number of urls into a list, one page at a time
  print 'Fetching page',
  while len(imglist) < count:
    print while_n,
    # count the iterations of the while loop
    while_n += 1
    tmplist = getImageUrlList(searchURL)
    imglist = imglist + tmplist
    searchURL = nextPage(searchURL, len(tmplist))
  print '' # line break
  count = len(imglist)
  print "There're %d files to download" % count
  # Remove urls whose file already exists in the directory
  imglist = [url for url in imglist if not getFilenameFromURL(url) in os.listdir(directory)]
  print "There's %d files already downloaded." % (count - len(imglist))
  # Download the list with a timeout of 20 (10 seemed a bit too small)
  print 'Fetching list of %d files' % len(imglist)
  failure = threadDownloadFromList(imglist, directory=directory, timeout=20)
  print "%d failed to fetch." % len(failure)
  # Cleanup
  # 1. Add the file extension
  print 'Adding extension ...',
  for fname in os.listdir(directory):
    addExtension(directory + os.sep + fname, '.jpg')
  print 'done.'
Пример #5
0
 def run(self):
   """Fetch self.url and save it under self.directory.

   The url is appended to self.failure when getStream returns nothing or
   writeBinFile reports failure. self.finished is set to True once the
   attempt is over.
   """
   data = getStream(self.url, timeout=self.timeout)
   target_name = getFilenameFromURL(self.url)
   saved = data and writeBinFile(data, target_name, self.directory)
   if not saved:
     self.failure.append(self.url)
   self.finished = True