Example #1
def batch_scrape(session):
    # scrape every file uploaded under this session id and write the
    # results to a matching per-session output directory
    scrape.run(os.path.join(app.config['UPLOAD_FOLDER'], session['sid']),
               batch=True,
               output=os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'],
                                   session['sid']))
    # bundle the output for download, then log completion to stderr
    zip_files(session['sid'])
    print("BATCH DONE", file=sys.stderr)
Example #2
def terms_test():
    label = "test"
    # terms = ["apple", "banana", "carrot", "dragonfruit", "edamame", "fennel", "ginger"]
    terms = [("asparagus", "_"), ("broccoli", "_")]
    # terms = [("hong kong", "香港")]
    # df = DataFrame([{"english": term, "chinese": "test", "label": label} for term in terms])
    # build one row per (english, chinese) term pair, all sharing the same label
    df = DataFrame([{"english": en_term, "chinese": cn_term, "label": label}
                    for en_term, cn_term in terms])
    run(termlist=df, shuffle=True)
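Assuming DataFrame here is pandas.DataFrame, the frame handed to run can be reproduced standalone like this (my own snippet, shown for reference):

from pandas import DataFrame

terms = [("asparagus", "_"), ("broccoli", "_")]
df = DataFrame([{"english": en, "chinese": cn, "label": "test"}
                for en, cn in terms])
print(df)
# prints roughly:
#      english chinese label
# 0  asparagus       _  test
# 1   broccoli       _  test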
Example #3
def single_scrape(session):
    # scrape just the one uploaded file for this session
    # (the original wrapped this path in a redundant nested os.path.join)
    scrape.run(os.path.join(app.config['UPLOAD_FOLDER'], session['sid'],
                            session['filename']),
               batch=False,
               output=os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'],
                                   session['sid']))
    print("DONE", file=sys.stderr)
Example #4
def scrape_func():
    # thin wrapper: invoke the scraper with its defaults
    return scrape.run()
Example #5
def start():
    # this didn't need to be an API, but here we are
    # kick off the scrape from page 1 over the configured date window
    run(page_number=1, min_date=end_date, max_date=from_date)
    return "finished!"
Example #6
# strip hidden "display:none" spans and &#160; entities, then condense
# whitespace (the general tag-stripping pass is commented out in the original)
def striptags(s):
  s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>',
             "", s)
  s = re.sub(r"&#160;", " ", s)
  # return condense(re.sub(r"<[^>]*>", " ", s))
  return condense(s)
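
# A quick illustration of what striptags does as written (my own example
# input): the hidden span is dropped and &#160; becomes a plain space;
# ordinary tags survive because that pass is commented out.
# striptags('ab<span style="display:none">x</span>cd&#160;ef')  ->  'abcd ef'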

def getUrlArgs(parseUrl):
  # pull the two numeric table ids out of a grib2_table4-2-X-Y.shtml url
  # (raw string added and the dot escaped so it matches a literal '.')
  return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
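
# For example (hypothetical input):
# getUrlArgs("grib2_table4-2-0-1.shtml")  ->  ('0', '1')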

if len(sys.argv) == 1:  # called with no arguments
  print("Usage: ", sys.argv[0], " url [n]")
  print("  (where n indicates which html table to parse)")
  sys.exit(1)

url = sys.argv[1]

soup = opensoup(url)
tables = soup.findAll("table")  #, {"class":"wikitable sortable"})

for table in tables:
  for r in table.findAll('tr'):
    # collect the cleaned contents of every cell (td or th) in this row
    rl = [striptags(c.renderContents()) for c in r.findAll(re.compile('td|th'))]
    if len(rl) > 1 and "href" in rl[1]:
      print('! ' + stripurl(rl[1]))
      # rebuild the absolute url for the linked sub-table and scrape it
      x, y = getUrlArgs(rl[1])
      scrapeUrl = ('http://www.nco.ncep.noaa.gov/pmb/docs/grib2/'
                   'grib2_table4-2-' + x + '-' + y + '.shtml')
      scrape.run(scrapeUrl)
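
The script leans on three helpers that aren't shown here: opensoup, condense, and stripurl. Minimal stand-ins under stated assumptions (sketches, not the original implementations; the findAll/renderContents names suggest BeautifulSoup 3, though bs4 keeps findAll as an alias):

import re
import urllib.request
from bs4 import BeautifulSoup

def opensoup(url):
  # fetch the page and parse it into a soup tree (assumes bs4; note that
  # under bs4, renderContents() returns bytes, so the row loop would need
  # c.decode_contents() to hand striptags a str)
  return BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")

def condense(s):
  # collapse runs of whitespace into single spaces
  return re.sub(r"\s+", " ", s).strip()

def stripurl(s):
  # drop any markup from a cell, keeping only its visible text
  # (an assumption about what the original helper does)
  return condense(re.sub(r"<[^>]*>", " ", s))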