def get_executor_path_or_download(executor_url):
    # Get the filename from the url.
    _executor_name = executor_url.split('/').pop(-1)
    executor_path = os.path.join(os.environ['MARVIN_DATA_PATH'], _executor_name)
    if not os.path.exists(executor_path):
        wget(executor_url, out=executor_path)
    return executor_path
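A minimal usage sketch (Python 3), assuming `wget` here is a download helper that accepts an `out=` target path and that `MARVIN_DATA_PATH` names a writable cache directory; the URL and paths are hypothetical:

import os

os.environ.setdefault('MARVIN_DATA_PATH', '/tmp/marvin-data')  # assumed location
os.makedirs(os.environ['MARVIN_DATA_PATH'], exist_ok=True)

# Downloads on the first call, then returns the cached path on later calls.
path = get_executor_path_or_download('https://example.com/executors/run.jar')
print(path)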
def xhamsterFindGalleryImages(url):
    try:
        data = wget(url, useCache=True, numTries=3)
    except Exception, e:
        print 'exception: %s' % e
        print 'falling back..'
        # Fall back to trying to find a gallery link.
        data = wget(url=xhamsterFindGalleryLink(url), useCache=True)
    d = pq(data)
    imagePageLinks = [
        imagePageLink for imagePageLink in d('table.img a')
        if '/photos/view/' in imagePageLink.attrib['href']
    ]
    out = []
    for imagePageLink in imagePageLinks:
        out.append(imagePageLink.attrib['href'])
    return out
def downloadGallery(galleryLink):
    """Save the passed image link, or attempt to extract and download a
    group of links from the html."""
    try:
        (imageOrGalleryHtml, headers) = wget(
            url=galleryLink,
            includeHeaders=True,
            useCache=True,
            referer=galleryLink,
            numTries=NUM_TRIES
        )
        if 'content-type' in headers and 'image/' in headers['content-type'].lower():
            if not fileAlreadyDownloaded(galleryLink):
                saveDownloadedFile(galleryLink, imageOrGalleryHtml)
        else:
            links = extractMostCommonGroup(url=galleryLink, html=imageOrGalleryHtml)
            if links is not None:
                # Found a group to download.
                cleanedGalleryLink = _cleanupHtmlLinkForStorage(galleryLink)
                if not fileAlreadyDownloaded(cleanedGalleryLink):
                    saveDownloadedFile(cleanedGalleryLink, imageOrGalleryHtml)
                _downloadAndSaveFiles(links=links, referer=galleryLink)
            else:
                print '[warn] no link groups could be extracted from url=%s' % galleryLink
    except WgetError, e:
        print '%s: (%s): %s' % (__file__, type(e), e)
def _downloadAndSaveFiles(links, referer=None):
    """Defaults to using each link as its own referer."""
    r = referer
    for link in links:
        if referer is None:
            r = link
        if not fileAlreadyDownloaded(link):
            try:
                (data, headers) = wget(
                    url=link,
                    includeHeaders=True,
                    useCache=True,
                    referer=r,
                    numTries=NUM_TRIES
                )
                if 'content-type' in headers:
                    if 'image/' in headers['content-type'].lower():
                        saveDownloadedFile(link, data)
                    else:
                        # Not an image; skip it.
                        pass
                else:
                    saveDownloadedFile(link, data)
            except WgetError, e:
                print '%s: (%s): %s' % (__file__, type(e), e)
            except Exception, e:
                print '%s: (%s): %s' % (__file__, type(e), e)
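Several of these snippets call a project-specific `wget(...)` wrapper with keyword arguments like `includeHeaders`, `useCache`, `referer`, and `numTries`. The wrapper itself is not shown anywhere in this collection; the following is a minimal sketch (Python 3) of the interface the call sites appear to assume, built on `urllib.request`. The retry and header behavior is inferred, and `useCache` is accepted but ignored in this sketch:

import urllib.request

class WgetError(Exception):
    """Stand-in for the error type the snippets catch."""

def wget(url, includeHeaders=False, useCache=False, referer=None,
         numTries=1, out=None):
    last_err = None
    for _ in range(max(1, numTries)):
        try:
            req = urllib.request.Request(url)
            if referer:
                req.add_header('Referer', referer)
            with urllib.request.urlopen(req) as resp:
                data = resp.read()
                headers = {k.lower(): v for k, v in resp.headers.items()}
            if out is not None:
                with open(out, 'wb') as f:
                    f.write(data)
            return (data, headers) if includeHeaders else data
        except Exception as e:
            last_err = e
    raise WgetError(last_err)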
def check(self, sub):
    try:
        data = wget(self.templateUrl % sub)
        if self._invalidPageRe.search(data) is not None:
            return False
    except Exception, e:
        print 'EXCEPTION: %s' % e
        return False
    # Page fetched and no "invalid page" marker found.
    return True
def extractLargestImageFromUrl(url, html=None, parentUrl=None, onlyUrl=False):
    """
    @param onlyUrl set to True to have only the url returned.
    """
    if parentUrl is None:
        parentUrl = url
    if html is None:
        html = wget(url=url, referer=parentUrl, useCache=True)
    print html
    d = pq(html)
    imageUrls = [
        urlparse.urljoin(url, img.attrib['src'])
        for img in d('img') if img.attrib.has_key('src')
    ]
    rawImages = []
    largestIdx = -1
    largestSz = 0
    conflictedIdx = -1
    for idx, image_url in enumerate(imageUrls):
        rawImageData = wget(url=imageUrls[idx], referer=parentUrl, useCache=True)
        rawImages.append(rawImageData)
        assert rawImages[idx] == rawImageData
        fakeFile = StringIO.StringIO(rawImages[-1])
        try:
            image = Image.open(fakeFile)
            print image.format, image.size
            sz = image.size[0] * image.size[1]
            print sz
            if sz > largestSz:
                largestIdx = idx
                largestSz = sz
                # Any previous conflict is no longer relevant.
                conflictedIdx = -1
            elif sz == largestSz:
                conflictedIdx = idx
        except IOError, e:
            print '%s: extractLargestImageFromUrl caught exception: (%s) %s' % (__file__, type(e), e)
    # Reconstructed tail, following the docstring (assumed): return only the
    # url when onlyUrl is True, otherwise the url plus the raw image data.
    if largestIdx == -1:
        return None
    if onlyUrl:
        return imageUrls[largestIdx]
    return (imageUrls[largestIdx], rawImages[largestIdx])
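The core trick above is measuring each image entirely in memory with PIL. A condensed, self-contained version of just that step (Python 3, where `io.BytesIO` replaces the old `StringIO`):

import io
from PIL import Image

def pixel_count(raw_bytes):
    """Return width*height for an in-memory image, or 0 if it cannot be parsed."""
    try:
        img = Image.open(io.BytesIO(raw_bytes))
        return img.size[0] * img.size[1]
    except IOError:
        return 0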
def inline_images(domain, html):
    for m in css_url_re.finditer(html):
        url = m.group('url')
        if url[0] == '/':
            url = '%s%s' % (domain, url)
        data = wget(url)
        b64_data = base64.b64encode(data)
        html = html.replace(
            m.group('wholething'),
            'url(data:image/%s;base64,%s)' % (url[-3:], b64_data),
            1)
    for m in img_src_re.finditer(html):
        url = m.group('url')
        if url[0] == '/':
            url = '%s%s' % (domain, url)
        data = wget(url)
        b64_data = base64.b64encode(data)
        html = html.replace(
            m.group('wholething'),
            '<img src="data:image/%s;base64,%s"' % (url[-3:], b64_data),
            1)
    return html
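A quick illustration of the intended effect, assuming `css_url_re` and `img_src_re` (defined further below) are in scope and `wget` returns raw bytes; the domain and paths are made up:

html = '<div style="background: url(/bg.png)"><img src="/logo.png"></div>'
# Both /bg.png and /logo.png get fetched from the domain and rewritten
# in place as base64 data: URIs, so the page renders with no image requests.
standalone = inline_images('http://example.com', html)

Note that the image type is guessed from the last three characters of the url, so query strings or four-letter extensions like `.jpeg` would mislabel the data URI.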
def xhamsterDownloadGallery(url):
    imagePageLinks = xhamsterFindGalleryImages(url)
    for imagePageLink in imagePageLinks:
        try:
            data = wget(url=imagePageLink, useCache=True, numTries=3)
        except WgetError, e:
            raise
        d = pq(data)
        imageElementCandidates = d('td#img img')  # '#imgSized'
        if len(imageElementCandidates) > 0:
            imageElement = imageElementCandidates[0]
            imageUrl = imageElement.attrib['src']
            if not fileAlreadyDownloaded(imageUrl):
                try:
                    imageData = wget(url=imageUrl, referer=imagePageLink, numTries=3)
                except WgetError, e:
                    raise
                saveDownloadedFile(imageUrl, imageData)
            else:
                print 'File already saved: %s' % imageUrl
def xhamsterFindGalleryLink(url):
    try:
        if '/gallery/' in url:
            return url
        m = galleryIdExtractor.match(url)
        if m:
            print 'shortcut!'
            return 'http://xhamster.com/photos/gallery/%s/index.html' % m.group(1)
        try:
            data = wget(url=url, useCache=True)
            d = pq(data)
            link = d('.TitleTable a')[1]
            if '/gallery/' in link.attrib['href']:
                return link.attrib['href']
            else:
                raise Exception('No gallery found for url: %s' % url)
        except Exception, e:
            print 'exception: %s' % e
            return None
    except WgetError, e:
        raise
    # Sometimes this fails with an "http protocol error, bad status line".
    # Maybe from too many requests at once or something. So we retry up to 5 times.
    nattempts = 5
    cmd = 'wget -q --no-check-certificate %s' % (full_file)
    for attempt in range(1, nattempts + 1):
        if os.path.exists(full_file):
            break
        os.system(cmd)  # Assumed stand-in; the actual download call was elided in the original.
    return full_file

# In[5]:

# Download the files we need.  It looks to me that the image file is the full coadd.
image_file = wget('ftp://ftp.star.ucl.ac.uk/whartley/ultraVISTA/',
                  'UVISTA_J_21_01_16_allpaw_skysub_015_dr3_rc_v5.fits.gz')
#row['root'] = root
#row['image_file'] = image_file
# Usually the weight is in the image file, but in this case it's a separate file.
weight_file = wget('ftp://ftp.star.ucl.ac.uk/whartley/ultraVISTA/',
                   'UVISTA_J_21_01_16_allpaw_skysub_015_dr3_rc_v5.weight.fits.gz')

# In[6]:

# Not sure this is necessary, but having this information might be useful for further tests.
def read_image_header(row, img_file):
    """Read some information from the image header and write it into the df row.
    """
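The notebook calls `wget(url_base, fname)` as a two-argument helper rather than the PyPI module. A plausible sketch of such a helper, under the assumption that it shells out to the command-line `wget` and returns the local path (the signature and skip-if-present behavior are inferred from the calls above):

import os
import subprocess

def wget(url_base, fname, wdir='.'):
    """Fetch url_base + fname into wdir unless it is already there."""
    full_file = os.path.join(wdir, fname)
    if not os.path.exists(full_file):
        subprocess.check_call(
            ['wget', '-q', '--no-check-certificate', url_base + fname, '-O', full_file])
    return full_file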
#
# vim: fenc=utf-8:ts=4:et:ai
import os
import shutil
import wget

dir_name = "20170419_F04"
url = "http://nabetani.sakura.ne.jp/hena/ordf04octsp/"
# The wget module itself is not callable; wget.download() does the fetch.
wget.download(url)
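With the `wget` package from PyPI, `wget.download` also accepts an `out=` argument, which would let this script save into `dir_name` (otherwise unused above); a small sketch continuing from the variables already defined:

import os
import wget

if not os.path.isdir(dir_name):
    os.makedirs(dir_name)
# out= may be a directory; wget.download returns the saved file's local path.
local_path = wget.download(url, out=dir_name)
print(local_path)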
def include_bare_minimum_css(domain, html, omit_bad_css=True):
    """
    @param domain string.
    @param html string. HTML document string.
    @param omit_bad_css boolean. Defaults to True. When True, erroneous CSS
        will simply be omitted. When False, any questionable CSS will be
        included.
    """
    d = pq(html)
    links_and_styles = d('link,style')
    favicon = None
    stylesheets = []
    for link_or_style_ele in links_and_styles:
        if link_or_style_ele.tag == 'link':
            if 'rel' in link_or_style_ele.attrib and \
                    link_or_style_ele.attrib['rel'] == 'shortcut icon':
                favicon = link_or_style_ele.attrib['href']
                continue
            elif link_or_style_ele.attrib['href'][0] == '/':
                link_or_style_ele.attrib['href'] = '%s%s' % (
                    domain, link_or_style_ele.attrib['href'])
            stylesheets.append(wget(link_or_style_ele.attrib['href']))
        elif link_or_style_ele.tag == 'style':
            stylesheets.append(str(link_or_style_ele))
    out = ''
    for stylesheet in stylesheets:
        for stmt in stylesheet.split('}'):
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if match:
                specifiers = match.group('specifiers')
                rule = match.group('rule')
                include_specifiers = ''
                for specifier in specifiers.split(','):
                    clean_specifier = re.sub(css_spec_cleaning_re, '', specifier)
                    # Interesting idea, but maybe not the best. -.v
                    #specifier = specifier.replace(' ', '>')
                    include_current = False
                    try:
                        matched_elements = d(clean_specifier)
                        if len(matched_elements):
                            # Then the element should be included.
                            include_current = True
                        #else:
                            # Otherwise it can be fairly safely omitted.
                    except Exception, e:
                        #lxml.cssselect.ExpressionError:
                        # PQ can't handle it; it is likely bad CSS, so omit it
                        # unless inclusion of questionable CSS was requested.
                        if not omit_bad_css:
                            include_current = True
                    if include_current:
                        if len(include_specifiers):
                            include_specifiers += ',' + specifier
                        else:
                            include_specifiers = specifier
                if len(include_specifiers):
                    # Then this rule is used, so include it.
                    out += '%s %s\n' % (include_specifiers, rule)
    d('link,style').replaceWith('')
    d('head').append('<style type="text/css">%s</style>' % out)
    if favicon is not None:
        try:
            favicon_bin = wget(favicon)
            favicon_b64 = base64.b64encode(favicon_bin)
            d('head').append(
                '<link id="favicon" rel="shortcut icon" type="image/png" '
                'href="data:image/png;base64,%s">' % favicon_b64)
        except Exception, e:
            #print 'error: favicon integration failed'
            pass
    return str(d)

css_url_re = re.compile(
    '''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''',
    re.I | re.M)
img_src_re = re.compile(
    '''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''',
    re.I | re.M)
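For reference, a small check of what `css_url_re` captures (illustrative input only):

m = css_url_re.search("body { background: url('/img/bg.png'); }")
print(m.group('wholething'))  # url('/img/bg.png')
print(m.group('url'))         # /img/bg.png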
def GetData2(args):
    wdir = os.path.expanduser(args.outdir)
    print('work dir = ', wdir)
    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except OSError as e:
        print("Ignore OSError from makedirs(work):")
        print(e)
        pass

    url_base = 'https://*****:*****@desar2.cosmology.illinois.edu/DESFiles/desarchive/' % ps()

    all_exp = fitsio.read(args.erinfile)
    all_exp = all_exp.astype(all_exp.dtype.newbyteorder('='))

    if args.filexps != '':
        print('Read file ', args.filexps)
        with open(args.filexps) as fin:
            exps = [line.strip() for line in fin if line[0] != '#']
        print('File includes %d exposures' % len(exps))

    for exp in sorted(exps):
        exp = int(exp)
        #print(exp)
        data = all_exp[all_exp['expnum'] == exp]
        #print(data)
        exp_df = pandas.DataFrame(
            data, columns=['expnum', 'ccdnum', 'band', 'path', 'magzp'])

        # Add some blank columns to be filled in below.
        for k in ['telra', 'teldec', 'telha', 'tiling', 'airmass', 'sat',
                  'fwhm', 'sky', 'sigsky', 'humidity', 'pressure',
                  'dimmseeing', 'dT', 'outtemp', 'msurtemp', 'winddir',
                  'windspd']:
            exp_df[k] = [-999.] * len(data)

        for k, row in exp_df.iterrows():
            ccdnum = row['ccdnum']
            try:
                path = row['path'].strip()
                base_path, _, _, image_file_name = path.rsplit('/', 3)
                root, ext = image_file_name.rsplit('_', 1)
                #print('root, ext = |%s| |%s|'%(root,ext))
                image_file = wget(url_base, base_path + '/red/immask/', wdir,
                                  root + '_' + ext)
                #print('image_file = ',image_file)
                read_image_header(row, image_file)
                remove_temp_files(wdir, root)
                exp_df.iloc[k] = row
            except:
                print("Unexpected error from exp, ccd:", exp, ccdnum)
                print(sys.exc_info()[0])
                continue
            #print('path = ',path)

        #file_name = os.path.join(wdir, '%d_Y3A1_atmos_pos_condition.fits'%exp)
        file_name = os.path.join(wdir, 'Y3A1_extrafields.fits')
        write_fit(exp_df.to_records(index=False), file_name)
        #print('Done with exposure ',exp)

    print('\nFinished processing all exposures')
import pandas as pd
import csv
import wget
import tarfile

# Download Wikipedia entries; take a sampled portion to serve as the "other" class.
dbpedia_url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'
wget.download(dbpedia_url)
with tarfile.open("dbpedia_csv.tar.gz", "r:gz") as tar:
    tar.extractall()

df = pd.read_csv("dbpedia_csv/test.csv", names=["class", "title", "content"])
df2 = df.sample(frac=0.03)
title = df2["title"]
content = df2["content"]

with open("d:/research_paper_classfication/other/train_data.csv", "w",
          newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for t, c in zip(title[:1800], content[:1800]):
        writer.writerow([t, c, "5"])

with open("d:/research_paper_classfication/other/test_data.csv", "w",
          newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for t, c in zip(title[1800:], content[1800:]):
        writer.writerow([t, c, "5"])
def extractMostCommonGroup(url, html=None):
    ## REQUIRING A HOSTNAME MATCH DOESN'T WORK FOR RELATIVE URLS..
    # match = url_hostname_re.match(url)
    # if match is not None:
    #     hostname = match.group(2)
    # else:
    #     raise Exception('Failed to extract hostname from the supplied url?')

    # a = array([[1,2,9,10,99,100], [3,4,10,11,99,150], [99, 100, 10, 13, 400, -3]])
    # mask = array([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
    # clusterid, error, nfound = kcluster(a,
    #     nclusters=2, mask=mask, weight=array([1, 1, 1, 1, 1, 1]),
    #     transpose=0, npass=1000, method='a', dist='e')
    # print clusterid
    # print error
    # print nfound
    # return

    if html is None:
        html = wget(url)
    bodyOnlyMatch = bodyOnlyRe.match(html)
    if bodyOnlyMatch:
        d = pq(bodyOnlyMatch.group(1))
    else:
        print 'HTML: %s' % html
        print 'Warning: <body> tag region could not be extracted - this is bad'
        d = pq(html)

    def _print(x):
        print x
        return True

    #print [urlparse.urljoin(url, a.attrib['href']) for a in d('a')]
    lst = list(excludeDuplicates([
        urlparse.urljoin(url, a.attrib['href']) for a in d('a') if (
            #_print(pq(a).children('img')) and
            len(pq(a).children('img')) > 0 and
            a.attrib.has_key('href') and
            containsNumberRe.search(a.attrib['href'])
        )
    ]))
    #print lst

    domain = urlparse.urlsplit(url).netloc
    diffs = dict([
        (item, levenshtein_distance(lst[i].replace('http://', ''), domain))
        for i, item in enumerate(lst)
    ])
    # avg = mean(diffs)
    # print avg
    # print min(diffs)
    # print max(diffs)
    # pprint(lst)

    def f(a, b):
        return abs(a - b)

    numGroups = len(diffs) / 2
    print 'Num groups:', numGroups
    if len(diffs) == 0:
        print 'LargestImage extract.. error: no diffs!'
        return []
    groups = kmeans(diffs, numGroups, f)

    # Select and return the largest group of similar links.
    maxIdx = -1
    conflictedIdx = -1
    maxSz = 0
    for idx, group in groups.items():
        l = len(group)
        print l
        if l > maxSz:
            maxIdx = idx
            maxSz = l
            # Any previous conflict is no longer relevant.
            conflictedIdx = -1
        elif l == maxSz:
            # Mark conflicted state.
            conflictedIdx = idx
    print 'groups = %s' % groups

    # Make sure we got a result.
    if maxIdx == -1:
        raise Exception('No groups were found? Very odd.. groups = %s' % groups)

    # Check to see if the largest group had conflicts.
    if conflictedIdx != -1:
        print 'WARNING: There was a group of equal size which was not selected.'

    imageLinks = False
    for link in groups[maxIdx]:
        if imageRe.match(link):
            imageLinks = True
            break
    if not imageLinks:
        print 'no image links were found.. for url=%s' % url
        out = []
        for link in groups[maxIdx]:
            out.append(extractLargestImageUrlFromUrl(link))
        return out
    #print 'imageLinks = %s' % imageLinks
    return groups[maxIdx]
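The snippet assumes a `kmeans(values_dict, k, dist)` helper that clusters the dict's values and returns a `{cluster_index: [keys]}` mapping. A toy 1-D sketch under those assumptions (Python 3; the interface is inferred from the call site, not taken from the source):

import random

def kmeans(diffs, numGroups, dist):
    """Toy 1-D k-means over a {link: score} dict.
    Hypothetical stand-in; returns {cluster_index: [links]}."""
    items = list(diffs.items())
    k = max(1, min(numGroups, len(items)))
    centers = random.sample([v for _, v in items], k)
    for _ in range(100):
        # Assign each link to its nearest center.
        groups = {i: [] for i in range(k)}
        for key, val in items:
            nearest = min(range(k), key=lambda c: dist(val, centers[c]))
            groups[nearest].append((key, val))
        # Recompute centers; keep the old one for any empty cluster.
        new_centers = [
            sum(v for _, v in g) / float(len(g)) if g else centers[i]
            for i, g in groups.items()
        ]
        if new_centers == centers:
            break
        centers = new_centers
    return {i: [key for key, _ in g] for i, g in groups.items()}

The real helper presumably handles empty clusters and convergence more carefully; this is only to make the expected return shape concrete.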