def _initailize_progress(pigeon_key, card_key):
    """Create and persist a fresh Progress record for a pigeon/card pair.

    NOTE(review): the name is misspelled ("initailize"), but it is kept
    as-is because callers elsewhere may reference it.

    Args:
        pigeon_key: key of the pigeon this progress belongs to.
        card_key: key of the card being tracked.
    """
    # A brand-new card starts with no familiarity and the base learn factor.
    Progress(
        pigeon_key=pigeon_key,
        card_key=card_key,
        familiar_factor=0,
        learn_factor=1,
    ).put()
#!/usr/bin/env python
"""Load the persisted scraping progress and print the current crawl index."""
import re

import requests
from bs4 import BeautifulSoup

from models import User, UserPool, Article, ArticlePool, Progress

# Restore the crawler's saved state and report where it left off.
# (Progress.load() presumably reads the persisted state — confirm in models.)
prg = Progress()
prg.load()
print(prg.current_index)
contributors = list(contributors) organization.contributors = contributors try: session.add(organization) session.commit() except: session.rollback() if __name__ == "__main__": logging.basicConfig(filename="scraper.log", level=logging.INFO) G = Github(ACCESS_TOKEN) progress = session.query(Progress).first() if not progress: progress = Progress(id="progress", value=0) with open('government.github.com/_data/governments.yml') as infile: _data = yaml.load(infile) data = reshape_data(_data) organizations_government = set( [organization['entity'].lower() for organization in data]) with open('government.github.com/_data/civic_hackers.yml') as infile: _data_civic = yaml.load(infile) data_civic = reshape_data(_data_civic) organizations_civic = set( [organization['entity'].lower() for organization in data_civic]) for i in xrange(progress.value, len(data)): logging.info("{} {} {}".format(i, data[i]['entity'],
print("[Info]: Retriving ...") page = requests.get("{}{}".format(ROOT_URL, url), cookies=COOKIES) if page.status_code != requests.codes.ok: error_message = "[Error {}]: Unable Retrive URL: {}\n".format( page.status_code, url) log_error_message(error_message) return round_trip = page.elapsed.total_seconds() request_time_total += round_trip request_counter += 1 print("[Info]: Time: {}s \t Counter: {} \t Average: {}s".format( round_trip, request_counter, request_time_total / request_counter)) return BeautifulSoup(page.content, "html.parser") progress = Progress() user_pool = UserPool() article_pool = ArticlePool() progress.load() user_pool.load() article_pool.load() for index_url in url_generator(progress): print("[Info]: ".ljust(60, "=")) print("[Index Page]: {}{}".format(ROOT_URL, index_url)) index_html = retrive_html_from_url(index_url) for article_item_div in index_html.find_all(class_="title"): article_item_a = article_item_div.find("a") if not article_item_a: