Example #1
def main():
    scrape.main()
    step = 25  # tile size in pixels; the higher it is, the more detailed the sub-images appear
    targetWidth = 2500  # width of the output image in pixels
    folder = 'Scraped Images'  # folder containing the tile images
    imageDict = load_images(folder, dimension=(step, step))  # load the tile images to paste onto the mosaic
    imageFile = 'gia.jpg'  # image we'll be making a photo mosaic out of
    editedImage = photo_mosaic(imageFile, imageDict=imageDict, step=step, targetWidth=targetWidth)  # get a photo mosaic
    editedImage.show()  # view image
    save_image(editedImage, imageFile)  # save image
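For context, a minimal sketch of what a load_images helper like the one above might look like, assuming Pillow is available and that tiles are simply resized to dimension and keyed by filename; the real helper may key tiles by average colour or other metadata, so treat this as an illustration only.

import os
from PIL import Image

def load_images(folder, dimension=(25, 25)):
    """Hypothetical helper: load every readable image in `folder`, resized to `dimension`."""
    images = {}
    for name in os.listdir(folder):
        try:
            img = Image.open(os.path.join(folder, name)).convert('RGB')
        except OSError:
            continue  # skip files Pillow cannot open
        images[name] = img.resize(dimension)
    return images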
Example #2
def main(teams, delay, lock):
  start = time.time()
  
  scrape.main(teams, lock)

  # run forever: re-scrape after each delay
  while True:
    # busy-wait until the delay has elapsed
    if (time.time() - start) > delay:
      print("CALL SCRAPE.MAIN() AGAIN.")
      scrape.main(teams, lock)
      start = time.time()
      print(time.ctime(start), "time after delay call to scrape.")

# if __name__ == "__main__":
#   teams = ["mexico","uruguay"]
#   main(teams, 10, Tlock)                    
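Since the loop above spins the CPU while it waits, a common alternative is to sleep between calls; a minimal sketch under the same assumptions (a scrape.main(teams, lock) entry point and a delay given in seconds):

import time

import scrape  # assumed to expose main(teams, lock) as in the example above

def main(teams, delay, lock):
  scrape.main(teams, lock)
  # re-scrape once per `delay` seconds without busy-waiting
  while True:
    time.sleep(delay)
    scrape.main(teams, lock)
    print(time.ctime(), "called scrape.main() again after the delay")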
Example #3
def application():
    print(request)
    companies = request.args.get('companies')
    print(companies)
    companylist = companies.split(',')
    blogs = main(companies)
#    blogs.tags = json.dumps(blogs.tags)
    return render_template('application.html', companies=companylist, blogs=blogs)
Example #4
def test_quote_extractions():
    # TODO: Don't curl the website, instead pull from the checked in HTML file!
    for i, quote in enumerate(scrape.main(year=2015, month=1, page=20)):
        fname = join(dirname(__file__), 'data/{}.json'.format(i))
        with open(fname, 'r') as f:
            expected = f.read()

        observed = quote.json()
        assert expected == observed
Example #5
def get_recent_changes(links, done_links):
    """Scrape links on the Special:RecentChanges page."""
    _, _, _, recent_links = S.main('Special:RecentChanges')
    starting_recent_links_num = len(recent_links)
    #    recent_links = recent_links.difference(done_links) # Bad! What if updated?
    recent_links = recent_links.difference(links)
    links.update(recent_links)
    print('Retrieved {} links from "Special:RecentChanges"; {} of which are new.'.
          format(starting_recent_links_num, len(recent_links)))
    return links
Example #6
def get_recent_changes(links, done_links):
    """Scrape links on the Special:RecentChanges page."""
    _, _, _, recent_links = S.main('Special:RecentChanges')
    starting_recent_links_num = len(recent_links)
#    recent_links = recent_links.difference(done_links) # Bad! What if updated?
    recent_links = recent_links.difference(links)
    links.update(recent_links)
    print('Retrieved {} links from "Special:RecentChanges"; {} of which are new.'.
            format(starting_recent_links_num, len(recent_links)))
    return links
Example #7
def test_dedup_quotes_across_adjacent_pages():
    cross_page_quotes = []
    cross_page_quotes += list(scrape.main(year=2018, month=1, page=1))
    cross_page_quotes += list(scrape.main(year=2018, month=1, page=2))

    quotes_counter = Counter()
    for quote in cross_page_quotes:
        quotes_counter[quote] += 1
        #print('{}  -- hashes to -- {}'.format(
        #    repr(quote)[:10],
        #    hash(quote)
        #))

    fname = join(dirname(__file__),
                 'data/dedup_quotes_across_adjacent_pages.pprint')

    with open(fname, 'r') as f:
        expected = f.read()

    observed = pformat(quotes_counter, indent=4)
    assert expected == observed
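For a Counter-based dedup like the test above to work, the quote objects returned by scrape.main() must be hashable and compare equal when they represent the same quote (the commented-out debug print hints at exactly that). A minimal sketch of such a value object, with field names that are assumptions rather than taken from the original:

from dataclasses import dataclass

@dataclass(frozen=True)  # a frozen dataclass gets __eq__ and __hash__ for free
class Quote:
    author: str
    text: str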
Example #8
def main():

    # Update.
    scrape.main()

    # Add any new files.
    cmd = 'git add .'
    check_call(cmd, shell=True)

    # Commit changes.
    cmd = 'git commit -am"changes as of %s"'
    cmd = cmd % datetime.datetime.now().isoformat()
    check_call(cmd, shell=True)

    # Pull changes.
    cmd = 'git pull origin master'
    check_call(cmd, shell=True) 

    # Push changes.   
    cmd = 'git push origin master'
    check_call(cmd, shell=True)
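One caveat with the example above: git commit exits non-zero when there is nothing to commit, so check_call would raise CalledProcessError. A hedged sketch of one way to guard against that, assuming the same scrape.main() entry point:

import datetime
from subprocess import check_call, check_output

import scrape  # assumed to expose main() as in the example above

def main():
    scrape.main()
    check_call('git add .', shell=True)

    # only commit when the working tree actually changed
    if check_output('git status --porcelain', shell=True).strip():
        msg = 'changes as of %s' % datetime.datetime.now().isoformat()
        check_call('git commit -am "%s"' % msg, shell=True)

    check_call('git pull origin master', shell=True)
    check_call('git push origin master', shell=True)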
Example #9
def match_data(postalcode, year, id, sid):
    ''' Create two CSV files: one with weather data and one with
        solar panel data.
    '''
    solar_file = str(postalcode) + "_" + str(year) + "_S.csv"
    weather_file = str(postalcode) + "_" + str(year) + "_W.csv"

    solarpanel = main(1, year, 1, year + 1, id, sid)
    weather = _main_(postalcode, year)

    weather = add_month(weather, year)
    solarpanel = solar_csv(solarpanel)
    weather = weather_csv(solarpanel, weather)

    weather.to_csv(weather_file)
    solarpanel.to_csv(solar_file)
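A hypothetical call (the argument values here are invented for illustration) writes the two files named after the postal code and year:

# writes 1234_2016_W.csv and 1234_2016_S.csv
match_data(1234, 2016, 42, 7)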
Example #10
def main():
    workouts = scrape.main()
    email = construct_email(workouts)

    send_email(email)
Example #11
# This doesn't make sense but was easiest
import gcintegration
import scrape
import time

scrape.main()
time.sleep(20)
gcintegration.main()


Example #12
def setUpClass(self):
    cmd = "https://tabs.ultimate-guitar.com/tab/ed-sheeran/perfect-chords-1956589 -f test -i -c -a -l -j"
    scrape.main(cmd.split(" "))
Example #13
import save_contents
import scrape
import get_contents

links = []
with open('list_of_algorithms_link.txt', 'r') as f:
    links = f.readlines()

for link in links:
    scrape.main(link.strip())  # strip the trailing newline left by readlines()
save_contents.main()
get_contents.main()
Example #14
def index():
    """
    Present the data in a RESTful API
    """
    return jsonify(main())
Example #15
def scrape_links(time_before_new_changed, title=None, links=None,
        unscraped_links_filename=os.path.join(
            '..', 'data', 'links', 'links_unscraped.txt'), 
        done_links_filename=os.path.join(
            '..', 'data', 'links', 'done_links.txt')):
    """Scrape links from pages on candidate URLs and retrieve any synonyms."""
    start_time = time.time()
    done_links = get_done_links(done_links_filename)
    if links is None:
        links = get_unscraped_links(unscraped_links_filename, done_links)
    syn_count = len(os.listdir(os.path.join('..', 'data', 'synonyms_new')))
    print('Found {} synonym-files at start of while-loop.\n'.format(syn_count))
    while links:
        if time.time() > start_time + time_before_new_changed:
            print('Time {} seconds exceeded; getting new changed links.'.
                    format(time_before_new_changed))
            links = get_recent_changes(links, done_links)
            start_time = time.time()
        title = links.pop()
        try:
            page, _, synonyms, new_links = S.main(title)
        except KeyboardInterrupt:
            print('''\nWe met with KeyboardInterrupt; title: {}. '''.
                    format(title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            return links, done_links
        except TypeError:
            # TypeError: 'NoneType' object is not iterable
            # Usually because "HTTP Error 404: Not Found", so restore title.
            # But temporarily we are leaving titles unrestored, as we think
            # some of these were previously unfiltered "redlink=1" cases.
            links.add(title)
            try:
                done_links.remove(title)
            except KeyError:
                pass
            print('    {}'.format(title))
            continue
        except Exception:
            print('\nWe met with Exception; title: {}.'.
                    format(title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            print('\n')
            continue
        # Do not check whether title is already in done_links;
        # that would defeat the purpose of "get_recent_changes()".
        # Assume new links are checked only when received from S.main().
        with open(done_links_filename, 'a') as f:
            f.write('\n' + title)
        if synonyms:
            _ = U.store_data(
                    json.dumps(synonyms).encode(), title, 
                    target_dir='synonyms_new', tar=False)
            syn_count = len(
                    os.listdir(os.path.join('..', 'data', 'synonyms_new')))
        links, new_links, done_links = update_links(
                links, new_links, done_links, title)
        print('''T: {}; links: + {:>3} => {:>}; done: {} ({}%); '''
              '''syn: + {} => {} ({}%);\n    {}'''.
                format(int(time.time() - start_time), len(new_links), 
                    len(links), len(done_links), 
                    round(
                        100 * len(done_links) / 
                        (len(done_links) + len(links)), 1), 
                    len(synonyms), syn_count, 
                    round(100 * syn_count / len(done_links), 1), 
                    title))
        # Uncomment the following line to save whole pages (compressed).
        # _ = U.store_data(page, title, target_dir='html_new', tar=True)
#         # Write the whole of "links": "title" removed, "new_links" added.
#         try:
#             with open(unscraped_links_filename, 'w') as f:
#                 f.write('\n'.join(links))
#         except KeyboardInterrupt:
#             print('''\nWe met with KeyboardInterrupt; title: {}. '''.
#                     format(title))
#             exc_type, exc_value, exc_traceback = sys.exc_info()
#             traceback.print_exception(exc_type, exc_value, exc_traceback)
#             return links, done_links
    return links, done_links
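A hypothetical invocation of the function above (the 3600 is an invented value for the recent-changes refresh interval in seconds):

# scrape until the unscraped set is empty, refreshing from
# Special:RecentChanges once every 3600 seconds
links, done_links = scrape_links(3600)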
Example #16
def scrape_links(time_before_new_changed,
                 title=None,
                 links=None,
                 unscraped_links_filename=os.path.join('..', 'data', 'links',
                                                       'links_unscraped.txt'),
                 done_links_filename=os.path.join('..', 'data', 'links',
                                                  'done_links.txt')):
    """Scrape links from pages on candidate URLs and retrieve any synonyms."""
    start_time = time.time()
    done_links = get_done_links(done_links_filename)
    if links is None:
        links = get_unscraped_links(unscraped_links_filename, done_links)
    syn_count = len(os.listdir(os.path.join('..', 'data', 'synonyms_new')))
    print('Found {} synonym-files at start of while-loop.\n'.format(syn_count))
    while links:
        if time.time() > start_time + time_before_new_changed:
            print(
                'Time {} seconds exceeded; getting new changed links.'.format(
                    time_before_new_changed))
            links = get_recent_changes(links, done_links)
            start_time = time.time()
        title = links.pop()
        try:
            page, _, synonyms, new_links = S.main(title)
        except KeyboardInterrupt:
            print('''\nWe met with KeyboardInterrupt; title: {}. '''.format(
                title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            return links, done_links
        except TypeError:
            # TypeError: 'NoneType' object is not iterable
            # Usually because "HTTP Error 404: Not Found", so restore title.
            # But temporarily we are leaving titles unrestored, as we think
            # some of these were previously unfiltered "redlink=1" cases.
            links.add(title)
            try:
                done_links.remove(title)
            except KeyError:
                pass
            print('    {}'.format(title))
            continue
        except Exception:
            print('\nWe met with Exception; title: {}.'.format(title))
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type, exc_value, exc_traceback)
            print('\n')
            continue
        # Do not check whether title is already in done_links;
        # that would defeat the purpose of "get_recent_changes()".
        # Assume new links are checked only when received from S.main().
        with open(done_links_filename, 'a') as f:
            f.write('\n' + title)
        if synonyms:
            _ = U.store_data(json.dumps(synonyms).encode(),
                             title,
                             target_dir='synonyms_new',
                             tar=False)
            syn_count = len(
                os.listdir(os.path.join('..', 'data', 'synonyms_new')))
        links, new_links, done_links = update_links(links, new_links,
                                                    done_links, title)
        print('''T: {}; links: + {:>3} => {:>}; done: {} ({}%); '''
              '''syn: + {} => {} ({}%);\n    {}'''.format(
                  int(time.time() - start_time), len(new_links), len(links),
                  len(done_links),
                  round(100 * len(done_links) / (len(done_links) + len(links)),
                        1), len(synonyms), syn_count,
                  round(100 * syn_count / len(done_links), 1), title))
        # Uncomment the following line to save whole pages (compressed).
        # _ = U.store_data(page, title, target_dir='html_new', tar=True)


#         # Write the whole of "links": "title" removed, "new_links" added.
#         try:
#             with open(unscraped_links_filename, 'w') as f:
#                 f.write('\n'.join(links))
#         except KeyboardInterrupt:
#             print('''\nWe met with KeyboardInterrupt; title: {}. '''.
#                     format(title))
#             exc_type, exc_value, exc_traceback = sys.exc_info()
#             traceback.print_exception(exc_type, exc_value, exc_traceback)
#             return links, done_links
    return links, done_links
Example #17
#!/bin/python3

# Used to call scrape.py by passing it a text file with multiple entries
# and categories

# Each input line should be "<category> <url>"; make sure a single space separates the two fields so the line parses correctly.

import sys
import scrape

# print(sys.argv[1])

count = 0

with open(sys.argv[1], "r") as infile:
    for item in infile:
        insplit = item.split(" ")
        category = insplit[0]
        url = insplit[1].strip()  # drop the trailing newline
        #   print(url)
        #   print(category)
        scrape.main(url, category)
        count = count + 1

print(str(count) + " file(s) processed.")
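For illustration, a hypothetical input file for the script above (the file name and URLs here are invented) holds one "<category> <url>" pair per line and is passed as the script's first argument (sys.argv[1]):

news https://example.com/articles
sports https://example.com/scores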
Example #18
# You should have received a copy of the GNU General Public License
# along with roastcalc.  If not, see <http://www.gnu.org/licenses/>.

### Define magic numbers ###
safetyBuffer = 0.005        # Add this percent to roast input weight
cupQuantity = 0.2           # Add to roast for production cupping
emailBool = False           # Email report or no?
emails = ["*****@*****.**"]

### Import data from CSV files ###
import scrape   # Download the spreadsheet as CSV files
import csv      # Parse the CSV files
import os       # Run the email command

# Download the files 
scrape.main()

# Column indices for history.csv:
# ID-Tag, Profile, Date, Component, Start Weight, End Weight, % Loss
# 0,      1,       2,    3,         4,            5,          6
historyCSV = list(csv.reader(open("history.csv")))

# Column indices for products.csv:
# Product, Component, Component #, Roast profile, % of blend
# 0,       1,         2,           3,             4
productsCSV = list(csv.reader(open("products.csv")))
# List of available products
uniqueProducts = set([product[0] for product in productsCSV])

# Column indices for totals.csv:
# Product, 3oz, 12oz, 2lb, 5lb, total lbs, roast/don't roast [1/0]
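The snippet is cut off before totals.csv is actually read; a minimal sketch of how it might be loaded given the column layout described above (the variable names here are assumptions, not from the original code):

import csv  # already imported in the snippet above

totalsCSV = list(csv.reader(open("totals.csv")))

# Keep only the products flagged for roasting, mapped to their total pounds.
poundsToRoast = {}
for row in totalsCSV:
    if row[6] == "1":
        poundsToRoast[row[0]] = float(row[5])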
Example #19
import os
os.chdir(os.getenv("HOME") + "/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/MITprojects")
#import gather
#abs_dict=gather.main(stop=2)
import json
with open('abs_dict.json') as abs_dict_file:    # load the abstracts dict from a JSON file
    abs_dict = json.load(abs_dict_file)

import scrape
scrape.main(abs_dict)
scrape.tokenize()

#Now run LDA; in onlineMIT the path to ../MITprojects/ etc. is hard-coded.
#This creates some .txt files.
#os.chdir(os.getenv("HOME")+"/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/onlineldavb")
# The remaining steps are run manually from a shell (they are not valid Python):
#   cd "/home/kinkyboy/Dropbox/Courses/MIT/1.2 LDA - Finding Themes in Project Description/onlineldavb"
#   python onlineMIT.py 101

#   python printtopics.py dictnostops.txt lambda-100.dat  # NaN


#TODO
#remove symbols from tokens
#check wikirandom for regular expressions: saxon genitives should be removed: "an agent's" in 'Leslie Kaelbling_abs20.txt'

#Notes
#python onlinewikipedia.py 101; python printtopics.py dictnostops.txt lambda-100.dat
#python printtopics.py dictnostops.txt lambda-10.dat
#https://wellecks.wordpress.com/2014/10/26/ldaoverflow-with-online-lda/

#(wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
#printed output is in the return of do_e_step_docs  (gamma, sstats) = self.do_e_step_docs(docs)
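If one wanted to drive those manual shell steps from Python instead, here is a hedged sketch using the standard library's subprocess module; the directory and script names are copied from the comments above, and whether the scripts accept these arguments is assumed from those notes:

import os
import subprocess

# Run the online-LDA step, then print the learned topics, mirroring the
# shell commands noted in the comments above.
lda_dir = os.path.join(os.getenv("HOME"), "Dropbox/Courses/MIT",
                       "1.2 LDA - Finding Themes in Project Description/onlineldavb")
subprocess.run(["python", "onlineMIT.py", "101"], cwd=lda_dir, check=True)
subprocess.run(["python", "printtopics.py", "dictnostops.txt", "lambda-100.dat"],
               cwd=lda_dir, check=True)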