Example #1
def _initialize_progress(pigeon_key, card_key):
    # Create a fresh Progress record with default learning factors
    # and persist it to the datastore.
    progress = Progress(pigeon_key=pigeon_key,
                        card_key=card_key,
                        familiar_factor=0,
                        learn_factor=1)
    progress.put()
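The `put()` call suggests a Google App Engine datastore model. As a hypothetical usage sketch (the `Card` model, its `query()` call, the `pigeon` entity, and the `.key` attributes are assumptions, not part of the original):

# Hypothetical caller: seed one Progress record per card for a learner.
def _initialize_all_progress(pigeon):
    for card in Card.query():
        _initialize_progress(pigeon.key, card.key)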
Example #2
#!/usr/bin/env python

from models import User, UserPool, Article, ArticlePool, Progress
import requests
import re
from bs4 import BeautifulSoup

#page = requests.get("https://www.ptt.cc/bbs/Gossiping/index20373.html", cookies={"over18":"1"})
#bs = BeautifulSoup(page.content, "html.parser")
#print(list(div.find("a") for div in list(bs.find_all(class_="title"))))

#ap = ArticlePool()
#ap.load()
# Restore the saved crawl state and report where the scraper left off.
prg = Progress()
prg.load()
#print(len(ap.articles))
print(prg.current_index)
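`models.Progress` itself is not shown in this excerpt. A minimal sketch of what a JSON-backed equivalent could look like, assuming a `progress.json` snapshot file, a `current_index` attribute, and a `save()` counterpart (all assumptions):

import json
import os

class Progress:
    # Hypothetical JSON-backed crawl-progress model (sketch only).
    PATH = "progress.json"  # assumed storage location

    def __init__(self):
        self.current_index = 0

    def load(self):
        # Restore the last saved index if a snapshot exists.
        if os.path.exists(self.PATH):
            with open(self.PATH) as f:
                self.current_index = json.load(f).get("current_index", 0)

    def save(self):
        # Persist the current index so the crawl can resume later.
        with open(self.PATH, "w") as f:
            json.dump({"current_index": self.current_index}, f)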
Example #3
    contributors = list(contributors)
    organization.contributors = contributors

    try:
        session.add(organization)
        session.commit()
    except Exception:
        # Roll back so a failed commit does not leave the session broken.
        session.rollback()


if __name__ == "__main__":
    logging.basicConfig(filename="scraper.log", level=logging.INFO)
    G = Github(ACCESS_TOKEN)
    progress = session.query(Progress).first()
    if not progress:
        progress = Progress(id="progress", value=0)

    with open('government.github.com/_data/governments.yml') as infile:
        _data = yaml.safe_load(infile)
    data = reshape_data(_data)
    organizations_government = {
        organization['entity'].lower() for organization in data}

    with open('government.github.com/_data/civic_hackers.yml') as infile:
        _data_civic = yaml.safe_load(infile)
    data_civic = reshape_data(_data_civic)
    organizations_civic = {
        organization['entity'].lower() for organization in data_civic}

    for i in range(progress.value, len(data)):
        logging.info("{} {} {}".format(i, data[i]['entity'],
Example #4
    print("[Info]: Retriving ...")
    page = requests.get("{}{}".format(ROOT_URL, url), cookies=COOKIES)
    if page.status_code != requests.codes.ok:
        error_message = "[Error {}]: Unable Retrive URL: {}\n".format(
            page.status_code, url)
        log_error_message(error_message)
        return
    round_trip = page.elapsed.total_seconds()
    # Running totals for the average response time; these are module-level
    # counters, so the enclosing function must declare them with `global`.
    request_time_total += round_trip
    request_counter += 1
    print("[Info]: Time: {}s \t Counter: {} \t Average: {}s".format(
        round_trip, request_counter, request_time_total / request_counter))
    return BeautifulSoup(page.content, "html.parser")
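# Sketch (assumption, not from the original excerpt): `url_generator` is
# called below but never defined here. A minimal version, assuming
# PTT-style board index pages and that `progress.current_index` holds the
# last visited page number, could look like this:
def url_generator(progress):
    index = progress.current_index
    while index > 0:
        yield "/bbs/Gossiping/index{}.html".format(index)
        index -= 1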


progress = Progress()
user_pool = UserPool()
article_pool = ArticlePool()

progress.load()
user_pool.load()
article_pool.load()

for index_url in url_generator(progress):
    print("[Info]: ".ljust(60, "="))
    print("[Index Page]: {}{}".format(ROOT_URL, index_url))
    index_html = retrive_html_from_url(index_url)

    for article_item_div in index_html.find_all(class_="title"):
        article_item_a = article_item_div.find("a")
        if not article_item_a: