def fetch_url(self, url): filename = url.split('/')[-1] utilities.download_file(url, self.processingDir) xmldoc = minidom.parse(os.path.join(self.processingDir, filename)) MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data epochPubDate = datetime.datetime.strptime(MainPubDate, "%a, %d %b %Y %H:%M:%S +0200").strftime('%s') print "main date " + MainPubDate # if (epochPubDate <= self.lastFetchDate): # return 0 itemlist = xmldoc.getElementsByTagName('item') for elt in itemlist : # TODO : Test object first title = elt.getElementsByTagName('title')[0].firstChild.data link = elt.getElementsByTagName('link')[0].firstChild.data pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data # print "link " + link if (epochPubDate <= self.lastFetchDate): break if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))): print "Downloading %s" % (link) utilities.download_file(link, self.processingDir)
def fetch_url(self, url):
    """Fetch an RSS feed, download every offer newer than the last fetch and
    hand each new file to self.processOffer.

    url -- feed address; its last path component names the local copy in
           self.processingDir.
    """
    filename = url.split('/')[-1]
    utilities.download_file(url, self.processingDir)
    xmldoc = minidom.parse(os.path.join(self.processingDir, filename))
    # Channel-level publication date of the whole feed.
    MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data
    # NOTE(review): strftime('%s') is platform-dependent and yields a string;
    # presumably self.lastFetchDate is of a comparable type -- confirm.
    epochPubDate = datetime.datetime.strptime(
        MainPubDate, "%a, %d %b %Y %H:%M:%S +0200").strftime('%s')
    print "main date " + MainPubDate
    # if (epochPubDate <= self.lastFetchDate):
    # return 0
    itemlist = xmldoc.getElementsByTagName('item')
    for elt in itemlist:
        # TODO : Test object first
        title = elt.getElementsByTagName('title')[0].firstChild.data
        link = elt.getElementsByTagName('link')[0].firstChild.data
        pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
        # Feed older than the last fetch: nothing new past this point.
        if (epochPubDate <= self.lastFetchDate):
            break
        # Only fetch offers we do not already have locally.
        if (not os.path.isfile(
                os.path.join(self.processingDir, link.split('/')[-1]))):
            print "Downloading %s" % (link)
            utilities.download_file(link, self.processingDir)
            # NOTE(review): indentation reconstructed from a collapsed paste;
            # processOffer is assumed to run only for newly downloaded files.
            self.processOffer(link)
def fetch_offer(self, url): if (not os.path.isfile(os.path.join(self.processingDir, url.split('/')[-1]))): print "Downloading %s" % (url) utilities.download_file(url, self.processingDir) else: print "Download failed. File already there." return os.path.join("", url.split('/')[-1])
def fetch_url(self, url):
    """Fetch the Progressive Recruitment RSS feed and store every offer
    newer than the last fetch in the database.

    url -- feed address; its last path component names the local copy in
           self.processingDir.
    Returns 0 when the feed is not newer than self.lastFetchDate.
    """
    filename = url.split('/')[-1]
    utilities.download_file(url, self.processingDir)
    xmlfile = os.path.join(self.processingDir, filename)
    # Read the file as UTF-8 text explicitly instead of letting minidom
    # guess the encoding from the raw bytes.
    fileObj = codecs.open( xmlfile, "r", "utf-8" )
    content = fileObj.read()
    xmldoc = minidom.parseString( content )
    fileObj.close()
    #xmldoc = minidom.parse(xmlfile)
    MainPubDate = xmldoc.getElementsByTagName('lastBuildDate')[0].firstChild.data
    # Strip the trailing timezone token so strptime can parse the rest.
    MainPubDate = MainPubDate[:MainPubDate.rindex(' ')]
    # NOTE(review): strftime('%s') is platform-dependent and yields a string;
    # presumably self.lastFetchDate is of a comparable type -- confirm.
    epochPubDate = datetime.datetime.strptime(MainPubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
    # Feed not updated since the last fetch: nothing to do.
    if (epochPubDate <= self.lastFetchDate):
        return 0
    itemlist = xmldoc.getElementsByTagName('item')
    for elt in itemlist :
        # TODO : Test object first
        title = elt.getElementsByTagName('title')[0].firstChild.data
        # Drop the query string and point at the item's index page.
        link = elt.getElementsByTagName('link')[0].firstChild.data.split("?")[0] + "index.html"
        pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
        # Strip the trailing timezone token, as for MainPubDate above.
        pubDate = pubDate[:pubDate.rindex(' ')]
        if (epochPubDate <= self.lastFetchDate):
            break
        # if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))):
        offer = ProgressiveOffer()
        # The offer reference is the second-to-last path component of the guid.
        guid = elt.getElementsByTagName('guid')[0].firstChild.data
        offer.ref = guid.split('/')[-2]
        print "Processing %s" % (offer.ref)
        offer.date_add = int(time.time())
        loc = Location()
        offer.lat = loc.lat
        offer.lon = loc.lon
        # NOTE(review): encoding to iso-8859-1 raises UnicodeEncodeError for
        # characters outside Latin-1 -- confirm the feed's character range.
        offer.title = title.encode( 'iso-8859-1' )
        offer.url = link
        offer.date_pub = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
        offer.content = elt.getElementsByTagName('description')[0].firstChild.data
        offer.content = offer.content.encode( 'iso-8859-1' )
        offer.company = 'Progressive Recruitment'
        # Fields absent from the feed are set to 'NA' and normalised by the
        # offer's own clean* helpers.
        offer.location = 'NA'
        offer.cleanLocation()
        offer.contract = 'NA'
        offer.cleanContract()
        offer.salary = 'NA'
        offer.cleanSalary()
        offer.experience = 'NA'
        offer.add_db()
def fetch_offer(self, url): if (not os.path.isfile( os.path.join(self.processingDir, url.split('/')[-1]))): print "Downloading %s" % (url) utilities.download_file(url, self.processingDir) else: print "Download failed. File already there." return os.path.join("", url.split('/')[-1])
def fetch_url(self, url): filename = url.split("/")[-1] utilities.download_file(url, self.processingDir) xmldoc = minidom.parse(os.path.join(self.processingDir, filename)) MainPubDate = xmldoc.getElementsByTagName("pubDate")[0].firstChild.data itemlist = xmldoc.getElementsByTagName("item") for elt in itemlist: # TODO : Test object first title = elt.getElementsByTagName("title")[0].firstChild.data link = elt.getElementsByTagName("link")[0].firstChild.data.split("?")[0] pubDate = elt.getElementsByTagName("pubDate")[0].firstChild.data if not os.path.isfile(os.path.join(self.processingDir, link.split("/")[-1])): print "Downloading %s" % (link) utilities.download_file(link, self.processingDir)
def download_item(module, item, session):
    """Download one Canvas module item into DOWNLOAD_DIR/<module>/<item>."""
    target = "{0}{1}/{2}".format(DOWNLOAD_DIR,
                                 clean_file_name(module.name),
                                 clean_file_name(item.name))
    print(" - Downloading ({0}-{1}) {2}...".format(item.itemType, item.itemId, item.name))
    if item.itemType == 'Attachment':
        source = "{0}/courses/{1}/files/{2}/download".format(
            CANVAS_URL, COURSE_ID, item.itemId)
    else:
        # Non-attachment items are saved as rendered pages, hence .html.
        source = "{0}/courses/{1}/modules/items/{2}".format(
            CANVAS_URL, COURSE_ID, item.modId)
        target += ".html"
    download_file(source, session, target)
def download_files(job_name, output_id, output_folder=None):
    """ Downloads the files from the output of the job locally

    Parameters
    ----------
    job_name: [str] The name of the job e.g run_cntk, run_pytorch
    output_id: [str] The id of the output you want to download the files
               from e.g stdOuterr, notebooks
    output_folder: [str|None] Local directory to place the files in; when
               None the files are written under their bare names.
    """
    if output_folder:
        logger.info('Downloading files to {}'.format(output_folder))
    files = client.jobs.list_output_files(
        config.group_name, job_name,
        models.JobsListOutputFilesOptions(output_id))
    # `output_file` instead of `file` to avoid shadowing the builtin.
    for output_file in files:
        logger.info('Downloading {}'.format(output_file.name))
        file_name = path.join(
            output_folder, output_file.name) if output_folder else output_file.name
        ut.download_file(output_file.download_url, file_name)
    print("All files Downloaded")
def fetch_url(self, url): filename = url.split('/')[-1] utilities.download_file(url, self.processingDir) xmldoc = minidom.parse(os.path.join(self.processingDir, filename)) MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data itemlist = xmldoc.getElementsByTagName('item') for elt in itemlist: # TODO : Test object first title = elt.getElementsByTagName('title')[0].firstChild.data link = elt.getElementsByTagName('link')[0].firstChild.data.split( "?")[0] pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data if (not os.path.isfile( os.path.join(self.processingDir, link.split('/')[-1]))): print "Downloading %s" % (link) utilities.download_file(link, self.processingDir)
def epoch_init(self):
    """Start a new epoch: download and unpack the next Google Landmarks
    training shard, then rebuild the file index via self.pickfiles.
    """
    self.all_images = []
    self.all_landmarks = []
    # Shard names are zero-padded to three digits; zfill replaces the
    # original hand-rolled if/elif padding chain.
    tarfilestr = str(self.tar_idx).zfill(3)
    download_file(
        "https://s3.amazonaws.com/google-landmark/train/images_{}.tar".
        format(tarfilestr),
        "images.tar",
        bar=False)
    # Context manager guarantees the archive is closed even if
    # extraction raises.
    # NOTE(review): extractall on a downloaded archive without member
    # filtering is vulnerable to path traversal (CVE-2007-4559) -- the
    # source is a fixed S3 bucket, but consider a filter anyway.
    with tarfile.open('images.tar') as tar:
        tar.extractall("imagesfolder")
    self.total = self.pickfiles("imagesfolder")
    self.tar_idx += 1
    print("tar", self.tar_idx - 1, "total:", self.total)
def fetch_url(self, url):
    """Fetch the Progressive Recruitment RSS feed and store every offer
    newer than the last fetch in the database.

    url -- feed address; its last path component names the local copy in
           self.processingDir.
    Returns 0 when the feed is not newer than self.lastFetchDate.
    """
    filename = url.split('/')[-1]
    utilities.download_file(url, self.processingDir)
    xmlfile = os.path.join(self.processingDir, filename)
    # Read the file as UTF-8 text explicitly instead of letting minidom
    # guess the encoding from the raw bytes.
    fileObj = codecs.open(xmlfile, "r", "utf-8")
    content = fileObj.read()
    xmldoc = minidom.parseString(content)
    fileObj.close()
    #xmldoc = minidom.parse(xmlfile)
    MainPubDate = xmldoc.getElementsByTagName(
        'lastBuildDate')[0].firstChild.data
    # Strip the trailing timezone token so strptime can parse the rest.
    MainPubDate = MainPubDate[:MainPubDate.rindex(' ')]
    # NOTE(review): strftime('%s') is platform-dependent and yields a string;
    # presumably self.lastFetchDate is of a comparable type -- confirm.
    epochPubDate = datetime.datetime.strptime(
        MainPubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
    # Feed not updated since the last fetch: nothing to do.
    if (epochPubDate <= self.lastFetchDate):
        return 0
    itemlist = xmldoc.getElementsByTagName('item')
    for elt in itemlist:
        # TODO : Test object first
        title = elt.getElementsByTagName('title')[0].firstChild.data
        # Drop the query string and point at the item's index page.
        link = elt.getElementsByTagName('link')[0].firstChild.data.split(
            "?")[0] + "index.html"
        pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
        # Strip the trailing timezone token, as for MainPubDate above.
        pubDate = pubDate[:pubDate.rindex(' ')]
        if (epochPubDate <= self.lastFetchDate):
            break
        # if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))):
        offer = ProgressiveOffer()
        # The offer reference is the second-to-last path component of the guid.
        guid = elt.getElementsByTagName('guid')[0].firstChild.data
        offer.ref = guid.split('/')[-2]
        print "Processing %s" % (offer.ref)
        offer.date_add = int(time.time())
        loc = Location()
        offer.lat = loc.lat
        offer.lon = loc.lon
        # NOTE(review): encoding to iso-8859-1 raises UnicodeEncodeError for
        # characters outside Latin-1 -- confirm the feed's character range.
        offer.title = title.encode('iso-8859-1')
        offer.url = link
        offer.date_pub = datetime.datetime.strptime(
            pubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
        offer.content = elt.getElementsByTagName(
            'description')[0].firstChild.data
        offer.content = offer.content.encode('iso-8859-1')
        offer.company = 'Progressive Recruitment'
        # Fields absent from the feed are set to 'NA' and normalised by the
        # offer's own clean* helpers.
        offer.location = 'NA'
        offer.cleanLocation()
        offer.contract = 'NA'
        offer.cleanContract()
        offer.salary = 'NA'
        offer.cleanSalary()
        offer.experience = 'NA'
        offer.add_db()