def get_executor_path_or_download(executor_url):
    # Get the filename from the last path segment of the URL.
    _executor_name = executor_url.split('/')[-1]

    executor_path = os.path.join(os.environ['MARVIN_DATA_PATH'],
                                 _executor_name)

    if not os.path.exists(executor_path):
        wget(executor_url, out=executor_path)

    return executor_path
def xhamsterFindGalleryImages(url):
    try:
        data = wget(url, useCache=True, numTries=3)
    except Exception, e:
        print 'exception: %s' % e
        print 'falling back..'
        # Fallback to trying to find a gallery link.
        data = wget(url=xhamsterFindGalleryLink(url), useCache=True)

    d = pq(data)
    imagePageLinks = [
        imagePageLink for imagePageLink in d('table.img a')
        if '/photos/view/' in imagePageLink.attrib['href']
    ]
    out = []
    for imagePageLink in imagePageLinks:
        out.append(imagePageLink.attrib['href'])
    return out
def downloadGallery(galleryLink):
    """
    Saves the passed image link or attempts to extract and download a group
    of links from the html."""
    try:
        (imageOrGalleryHtml, headers) = wget(
            url=galleryLink,
            includeHeaders=True,
            useCache=True,
            referer=galleryLink,
            numTries=NUM_TRIES
        )
        if 'content-type' in headers and 'image/' in headers['content-type'].lower():
            if not fileAlreadyDownloaded(galleryLink):
                saveDownloadedFile(galleryLink, imageOrGalleryHtml)
        else:
            links = extractMostCommonGroup(
                url=galleryLink,
                html=imageOrGalleryHtml
            )
            if links is not None:
                # Found a group to download.
                cleanedGalleryLink = _cleanupHtmlLinkForStorage(galleryLink)
                if not fileAlreadyDownloaded(cleanedGalleryLink):
                    saveDownloadedFile(cleanedGalleryLink, imageOrGalleryHtml)
                _downloadAndSaveFiles(links=links, referer=galleryLink)
            else:
                print '[warn] no link groups could be extracted from url=%s' % galleryLink
    except WgetError, e:
        print '%s: (%s): %s' % (__file__, type(e), e)
def _downloadAndSaveFiles(links, referer=None):
    """Defaults to using each link as it's own referer."""
    r = referer
    for link in links:
        if referer is None:
            r = link
        if not fileAlreadyDownloaded(link):
            try:
                (data, headers) = wget(
                    url=link,
                    includeHeaders=True,
                    useCache=True,
                    referer=r,
                    numTries=NUM_TRIES
                )
                if 'content-type' in headers:
                    if 'image/' in headers['content-type'].lower():
                        saveDownloadedFile(link, data)
                else:
                    saveDownloadedFile(link, data)

            except WgetError, e:
                print '%s: (%s): %s' % (__file__, type(e), e)
            except Exception, e:
                print '%s: (%s): %s' % (__file__, type(e), e)
def check(self, sub):
    try:
        data = wget(self.templateUrl % sub)
        if self._invalidPageRe.search(data) is not None:
            return False
    except Exception, e:
        print 'EXCEPTION: %s' % e
        return False
    # No "invalid page" marker matched, so treat the sub as valid.  (The
    # original snippet fell off the end here and implicitly returned None.)
    return True
def extractLargestImageFromUrl(url, html=None, parentUrl=None, onlyUrl=False):
    """
    @param onlyUrl set to True to have only the url returned.
    """
    if parentUrl is None:
        parentUrl = url
    if html is None:
        html = wget(url=url, referer=parentUrl, useCache=True)

    print html
    d = pq(html)

    imageUrls = [
        urlparse.urljoin(url, img.attrib['src'])
        for img in d('img') if 'src' in img.attrib
    ]

    rawImages = []
    largestIdx = -1
    largestSz = 0
    conflictedIdx = -1
    for idx, imageUrl in enumerate(imageUrls):
        rawImageData = wget(
            url=imageUrl,
            referer=parentUrl,
            useCache=True
        )
        rawImages.append(rawImageData)
        fakeFile = StringIO.StringIO(rawImages[-1])
        try:
            image = Image.open(fakeFile)
            print image.format, image.size
            sz = image.size[0] * image.size[1]
            print sz
            if sz > largestSz:
                largestIdx = idx
                largestSz = sz
                # Any previous conflict is no longer relevant.
                conflictedIdx = -1
            elif sz == largestSz:
                conflictedIdx = idx
        except IOError, e:
            print '%s: extractLargestImageFromUrl caught exception: (%s) %s' % (__file__, type(e), e)
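    # The original snippet is truncated here.  A plausible completion, assuming
    # the caller wants the largest image found (or None when nothing decoded):
    if largestIdx == -1:
        return None
    if conflictedIdx != -1:
        print 'WARNING: another image tied for largest size and was not selected.'
    if onlyUrl:
        return imageUrls[largestIdx]
    return imageUrls[largestIdx], rawImages[largestIdx]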
Example No. 7
def inline_images(domain, html):
    """Inline CSS url(...) references and <img> src attributes as base64 data
    URIs so the resulting HTML is self-contained."""
    for m in css_url_re.finditer(html):
        url = m.group('url')
        if url[0] == '/':
            url = '%s%s' % (domain, url)
        data = wget(url)
        b64_data = base64.b64encode(data)
        html = html.replace(
            m.group('wholething'),
            'url(data:image/%s;base64,%s)' % (url[-3:], b64_data), 1)
    for m in img_src_re.finditer(html):
        url = m.group('url')
        if url[0] == '/':
            url = '%s%s' % (domain, url)
        data = wget(url)
        b64_data = base64.b64encode(data)
        html = html.replace(
            m.group('wholething'),
            '<img src="data:image/%s;base64,%s"' % (url[-3:], b64_data), 1)
    return html
def xhamsterDownloadGallery(url):
    imagePageLinks = xhamsterFindGalleryImages(url)
    for imagePageLink in imagePageLinks:
        # WgetError is allowed to propagate to the caller.
        data = wget(url=imagePageLink, useCache=True, numTries=3)
        d = pq(data)
        imageElementCandidates = d('td#img img') # #imgSized')
        if len(imageElementCandidates) > 0:
            imageElement = imageElementCandidates[0]
            imageUrl = imageElement.attrib['src']
            if not fileAlreadyDownloaded(imageUrl):
                imageData = wget(url=imageUrl, referer=imagePageLink, numTries=3)
                saveDownloadedFile(imageUrl, imageData)
            else:
                print 'File already saved: %s' % imageUrl
def xhamsterFindGalleryLink(url):
    try:
        if '/gallery/' in url:
            return url
        m = galleryIdExtractor.match(url)
        if m:
            print 'shortcut!'
            return 'http://xhamster.com/photos/gallery/%s/index.html' % m.group(1)
        try:
            data = wget(url=url, useCache=True)
            d = pq(data)
            link = d('.TitleTable a')[1]
            if '/gallery/' in link.attrib['href']:
                return link.attrib['href']
            else:
                raise Exception('No gallery found for url: %s' % url)
        except Exception, e:
            print 'exception: %s' % e
            return None
    except WgetError, e:
        raise
Example No. 10
        # Sometimes this fails with an "http protocol error, bad status line".
        # Maybe from too many requests at once or something.  So we retry up to 5 times.
        nattempts = 5
        cmd = 'wget -q --no-check-certificate %s' % (full_file)
        for attempt in range(1, nattempts + 1):
            if os.path.exists(full_file):
                break
            # The original snippet never ran cmd; presumably the shell wget
            # call belongs inside the retry loop.
            os.system(cmd)
    return full_file


# In[5]:


# Download the files we need.
# It looks to me that the image file is the full coadd,
image_file = wget('ftp://ftp.star.ucl.ac.uk/whartley/ultraVISTA/','UVISTA_J_21_01_16_allpaw_skysub_015_dr3_rc_v5.fits.gz')

#row['root'] = root
#row['image_file'] = image_file

# Usually the weight is in the image file, but in this case it's a separate file.
weight_file = wget('ftp://ftp.star.ucl.ac.uk/whartley/ultraVISTA/','UVISTA_J_21_01_16_allpaw_skysub_015_dr3_rc_v5.weight.fits.gz')


# In[6]:


# Not sure this is necessary, but having this information might be useful for further tests.

def read_image_header(row, img_file):
    """Read some information from the image header and write into the df row.
Example No. 11
#
# vim: fenc=utf-8:ts=4:et:ai

import os
import shutil
import wget

dir_name = "20170419_F04"
url = "http://nabetani.sakura.ne.jp/hena/ordf04octsp/"
# The wget module itself is not callable; its download() function does the fetch.
wget.download(url)
Example No. 12
def include_bare_minimum_css(domain, html, omit_bad_css=True):
    """
    @param domain string.

    @param html string.  HTML document string.

    @param omit_bad_css boolean.  Defaults to True.  When True, erroneous CSS
    will simply be omitted.  When False, any questionable CSS will be included.
    """
    d = pq(html)
    links_and_styles = d('link,style')
    favicon = None
    stylesheets = []
    for link_or_style_ele in links_and_styles:
        if link_or_style_ele.tag == 'link':
            if 'rel' in link_or_style_ele.attrib and link_or_style_ele.attrib[
                    'rel'] == 'shortcut icon':
                favicon = link_or_style_ele.attrib['href']
                continue
            elif link_or_style_ele.attrib['href'][0] == '/':
                link_or_style_ele.attrib['href'] = '%s%s' % (
                    domain, link_or_style_ele.attrib['href'])
            stylesheets.append(wget(link_or_style_ele.attrib['href']))
        elif link_or_style_ele.tag == 'style':
            # str() on an lxml element gives its repr, not the CSS text.
            stylesheets.append(link_or_style_ele.text or '')

    out = ''

    for stylesheet in stylesheets:
        for stmt in stylesheet.split('}'):
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if match:
                specifiers = match.group('specifiers')
                rule = match.group('rule')
                include_specifiers = ''
                for specifier in specifiers.split(','):
                    clean_specifier = re.sub(css_spec_cleaning_re, '',
                                             specifier)
                    # Interesting idea, but maybe not the best. -.v
                    #specifier = specifier.replace(' ', '>')
                    include_current = False
                    try:
                        matched_elements = d(clean_specifier)
                        if len(matched_elements):
                            # Then the element should be included.
                            include_current = True
                        #else:
                        # Otherwise it can be fairly safely omitted.
                        #pass
                    except Exception, e:  # lxml.cssselect.ExpressionError:
                        # PQ can't handle the specifier; it is probably bad
                        # CSS, so include it only when omit_bad_css is False.
                        if not omit_bad_css:
                            include_current = True
                    if include_current:
                        if len(include_specifiers):
                            include_specifiers += ',' + specifier
                        else:
                            include_specifiers = specifier
                if len(include_specifiers):
                    # Then this rule is used, so include it.
                    out += '%s %s\n' % (include_specifiers, rule)

    d('link,style').replaceWith('')
    d('head').append('<style type="text/css">%s</style>' % out)

    if favicon is not None:
        try:
            favicon_bin = wget(favicon)
            favicon_b64 = base64.b64encode(favicon_bin)
            d('head').append(
                '<link id="favicon" rel="shortcut icon" type="image/png" href="data:image/png;base64,%s">'
                % favicon_b64)
        except Exception, e:
            #print 'error: favicon integration failed'
            pass
    return str(d)
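
# A minimal usage sketch, not from the original source: it assumes the wget
# helper above returns the page HTML as a string, and that the domain passed
# to the helpers is just scheme + host.  The URL and output path below are
# illustrative placeholders.
def _self_contained_snapshot_example(domain='http://example.com',
                                     out_path='snapshot.html'):
    html = wget(domain)
    html = include_bare_minimum_css(domain, html)
    html = inline_images(domain, html)
    with open(out_path, 'w') as fh:
        fh.write(html)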


css_url_re = re.compile(
    '''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''',
    re.I | re.M)
img_src_re = re.compile(
    '''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''',
    re.I | re.M)
Example No. 14
def GetData2(args):
    wdir = os.path.expanduser(args.outdir)
    print('work dir = ',wdir)
    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except OSError as e:
        print("Ignore OSError from makedirs(work):")
        print(e)
        pass

    url_base = 'https://*****:*****@desar2.cosmology.illinois.edu/DESFiles/desarchive/'%ps()
    all_exp = fitsio.read(args.erinfile)
    all_exp = all_exp.astype(all_exp.dtype.newbyteorder('='))
   
    if args.filexps != '':
        print('Read file ',args.filexps)
        with open(args.filexps) as fin:
            exps = [ line.strip() for line in fin if line[0] != '#' ]
        print('File includes %d exposures'%len(exps))

    for exp in sorted(exps):
        exp = int(exp)
        #print(exp)
        data = all_exp[all_exp['expnum'] == exp]
        #print(data)
        exp_df = pandas.DataFrame(data,  columns=['expnum', 'ccdnum', 'band',  'path',  'magzp'])

        # Add some blank columns to be filled in below.
        
        for k in [ 'telra', 'teldec',  'telha' , 'tiling',  'airmass', 'sat', 'fwhm', 'sky',  'sigsky',  'humidity',  'pressure',  'dimmseeing',  'dT', 'outtemp',  'msurtemp',  'winddir',  'windspd']:
            exp_df[k] = [-999.] * len(data)



        for k, row in exp_df.iterrows():
            ccdnum =  row['ccdnum']
            try:
                path = row['path'].strip()
                base_path, _, _, image_file_name = path.rsplit('/',3)
                root, ext = image_file_name.rsplit('_',1)
                #print('root, ext = |%s| |%s|'%(root,ext))
                image_file = wget(url_base, base_path + '/red/immask/', wdir, root + '_' + ext)
                #print('image_file = ',image_file)
                read_image_header(row, image_file)
                remove_temp_files(wdir,  root)
                exp_df.iloc[k] = row
                
            except:
                print("Unxpected error from exp, ccd:",  exp,  ccdnum)
                print(sys.exc_info()[0])
                continue
        
        
        #print('path = ',path)
     
        #file_name = os.path.join(wdir, '%d_Y3A1_atmos_pos_condition.fits'%exp)
        file_name = os.path.join(wdir, 'Y3A1_extrafields.fits')
        write_fit(exp_df.to_records(index=False), file_name)
        #print('Done with exposure ',exp)       
        
    print('\nFinished processing all exposures')
Example No. 15
import pandas as pd
import csv
import wget
import tarfile

# Download Wikipedia (DBpedia) entries and take a sample to use as the "other" class.
dbpedia_url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'
wget.download(dbpedia_url)  # the wget module itself is not callable
with tarfile.open("dbpedia_csv.tar.gz", "r:gz") as tar:
    tar.extractall()

df = pd.read_csv("dbpedia_csv/test.csv", names=["class", "title", "content"])
df2 = df.sample(frac=0.03)
title = df2["title"]
content = df2["content"]

with open("d:/research_paper_classfication/other/train_data.csv",
          "w",
          newline="",
          encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for t, c in zip(title[:1800], content[:1800]):
        writer.writerow([t, c, "5"])

with open("d:/research_paper_classfication/other/test_data.csv",
          "w",
          newline="",
          encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for t, c in zip(title[1800:], content[1800:]):
        writer.writerow([t, c, "5"])
def extractMostCommonGroup(url, html=None):
    """Find the largest group of similar-looking candidate image links on the
    page at url (fetching it unless html is supplied), by clustering the links'
    edit distances to the page's domain."""
## REQUIRING A HOSTNAME MATCH DOESN'T WORK FOR RELATIVE URLS..
#    match = url_hostname_re.match(url)
#    if match is not None:
#        hostname = match.group(2)
#    else:
#        raise Exception('Failed to extract hostname from the supplied url?')

#    a=array([[1,2,9,10,99,100], [3,4,10,11,99,150], [99, 100, 10, 13, 400, -3]])
#    mask=array([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
#    clusterid, error, nfound = kcluster(a,
#        nclusters=2, mask=mask, weight=array([1, 1, 1, 1, 1, 1]),
#        transpose=0, npass=1000, method='a', dist='e')
#    print clusterid
#    print error
#    print nfound
#    return
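
# (Why the hostname-match idea above was dropped: relative hrefs carry no
#  hostname at all; urlparse.urljoin below resolves them against the page URL
#  instead, e.g. urljoin('http://example.com/g/1.html', 'p2.html') gives
#  'http://example.com/g/p2.html'.)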

    if html is None:
        html = wget(url)

    bodyOnlyMatch = bodyOnlyRe.match(html)
    if bodyOnlyMatch:
        d = pq(bodyOnlyMatch.group(1))
    else:
        print 'HTML: %s' % html
        print 'Warning: <body> tag region could not be extracted - this is bad'
        d = pq(html)

    def _print(x):
        print x
        return True

    #print [urlparse.urljoin(url, a.attrib['href']) for a in d('a')]
    lst = list(excludeDuplicates([
        urlparse.urljoin(url, a.attrib['href']) for a in d('a') if (
            #_print(pq(a).children('img')) and
            len(pq(a).children('img')) > 0 and
            'href' in a.attrib and
            containsNumberRe.search(a.attrib['href'])
        )
    ]))

    #print lst


    domain = urlparse.urlsplit(url).netloc

    diffs = dict([
        (item, levenshtein_distance(lst[i].replace('http://', ''), domain))
        for i, item in enumerate(lst)
    ])

#     avg = mean(diffs)
#     print avg
#     print min(diffs)
#     print max(diffs)
#     pprint (lst)

    def f(a, b):
        return abs(a - b)

    numGroups = len(diffs) / 2
    print 'Num groups:',numGroups

    if len(diffs) == 0:
        print 'LargestImage extract.. error: no diffs!'
        return []

    groups = kmeans(diffs, numGroups, f)

    # Select and return the largest group of similar links.
    maxIdx = -1
    conflictedIdx = -1
    maxSz = 0
    for idx, group in groups.items():
        l = len(group)
        print l
        if l > maxSz:
            maxIdx = idx
            maxSz = l
            # Any previous conflict is no longer relevant.
            conflictedIdx = -1
        elif l == maxSz:
            # Mark conflicted state.
            conflictedIdx = idx

    print 'groups = %s' % groups
    # Make sure we got a result.
    if maxIdx == -1:
        raise Exception('No groups were found?  Very odd.. groups = %s' % groups)

    # Check to see if the largest group had conflicts.
    if conflictedIdx != -1:
        print 'WARNING: There was a group of equal size which was not selected.'

    imageLinks = False

    for link in groups[maxIdx]:
        if imageRe.match(link):
            imageLinks = True
            break

    if not imageLinks:
        print 'no image links were found.. for url=%s' % url
        out = []
        for link in groups[maxIdx]:
            out.append(extractLargestImageUrlFromUrl(link))
        return out

    #print 'imageLinks = %s' % imageLinks

    return groups[maxIdx]