def run(self): """ Get doublicates of article of sameas webservice create datasources with resources updates resources (download content, save content to disk if it is new or was updated) """ self.done = 0 directoryBaseURL = self.config['directoryURL'] dbPediaURL = self.config['dbPediaURL'] directoryURL = "%s%s%s" % (directoryBaseURL, dbPediaURL, self.article) page = json.load(urllib2.urlopen(directoryURL)) duplicates = page[0]["duplicates"] self.total = len(duplicates) # create resources and append resources to datasources for url in duplicates: #DEBUG only list freebase and geonames if True or "freebase" in url or "geonames" in url: resource = Resource(url) if resource.domain not in self.datasources: datasource = Datasource(resource.domain, self.lastdate) self.datasources[resource.domain] = datasource datasource.resources.append(resource) # update datasources, dublicate detection, creation of json for domain, datasource in self.datasources.iteritems(): if not self._stop.is_set(): #do not proceed if stop is set datasource.update() self.done += 1 self.completed = 1 self.callback(self.datasources)
# NOTE(review): this is the tail of a larger `download(_type)` routine plus the
# script's top-level driver; the enclosing `def`, `try:` and message loop start
# before this view, so the indentation below is reconstructed — confirm it
# against the full file.
            # Unrecognized media payload: abort this message with an error
            # (message reads "type ... not recognized").
            raise Exception('类型' + str(type(entity.media)) + '无法识别')
        cur_file = Path(filename)
        if cur_file.exists():
            # Already downloaded in an earlier run — skip ("file already exists").
            log.info('文件已存在' + str(filename))
        else:
            # Log start ("start download ..., current progress index/total"),
            # fetch the media, then log the elapsed time ("download finished").
            log.info('开始下载[' + str(filename) + '], 当前进度' + str(index) + "/" + str(total))
            client.download_media(entity, filename)
            end_time = datetime.datetime.now()
            log.info('下载完成[' + str(filename) + '], 耗时' + str(end_time - start_time))
    except BaseException as e:
        # Any failure: try to remove the (possibly partial) file
        # ("download failed, trying to delete file ...").
        try:
            log.info('下载失败,尝试删除文件[' + filename + ']')
            cur_file.unlink()
        except IsADirectoryError as ie:
            # NOTE(review): `']失败' + ie` concatenates str with an exception
            # object and will itself raise TypeError — should be str(ie).
            log.error('删除文件[' + filename + ']失败' + ie)
        log.error('Exception:' + str(index) + ':' + str(e))
    log.info(_type + '类型文件下载结束')  # "downloads of this type finished"

# Top-level driver: download videos and/or photos according to the job
# config, then disconnect the Telegram client.
if config.job['type_video'] == 1:
    download(InputMessagesFilterVideo)
if config.job['type_photo'] == 1:
    download(InputMessagesFilterPhotos)
client.disconnect()
# Record the job's end time when run with -d (presumably a scheduled job id).
# NOTE(review): SQL built by string concatenation — acceptable only while
# exec_id is an internally generated int; parameterize if that ever changes.
if args.d is not None:
    exec_id = my_source.update('update job_exec set end_time = sysdate() where id = ' + str(exec_id))