Example #1
def crawlers_update():
    # FIXME ugly code to do both edit/update
    if request.method == 'GET':
        id = request.args.get('id', '')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(None, crawler)
    else:
        id = request.form.get('id')
        crawler = Crawler.get(Crawler.id == id)
        form = CrawlersForm(request.form)
        form.validate()
        if not form.errors:
            now = datetime.utcnow()
            crawler.name = request.form.get('name')
            crawler.runnable = request.form.get('runnable')
            crawler.gspread_link = request.form.get('gspread_link')
            crawler.url = None
            crawler.updated_at = now
            crawler.save()
            flash('Crawler was updated')
            return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=False,
                           id=id)
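
These views build CrawlersForm either from request.form or from a model instance via CrawlersForm(None, crawler), which matches the WTForms Form(formdata, obj) convention. A minimal sketch of the form class they assume, with field names inferred from the request.form keys above (the field types and validators are guesses, not the original definition):

# Hypothetical CrawlersForm, assuming WTForms; only the fields the
# views actually read are sketched here.
from wtforms import Form, StringField, BooleanField, validators

class CrawlersForm(Form):
    name = StringField('Name', [validators.InputRequired()])
    runnable = BooleanField('Runnable')
    gspread_link = StringField('Google Spreadsheet link')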
Example #2
def db_create():
    db.connect()
    User.create_table(True)  # True means fail silently if table exists
    SocialCounter.create_table(True)
    SocialCount.create_table(True)
    ChannelCounter.create_table(True)
    Channel.create_table(True)
    UserFunction.create_table(True)
    FunctionResult.create_table(True)
    Crawler.create_table(True)
    CrawlerPage.create_table(True)
    Mozscape.create_table(True)
    MozscapeResult.create_table(True)
    MozscapeIndexMetadata.create_table(True)
    db.close()
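
create_table(True) follows the older Peewee 2.x signature, where the positional True means fail_silently. A minimal sketch of the shared database setup these models imply, assuming Peewee with SQLite (the actual backend and field definitions are not shown in the examples):

# Assumed Peewee setup; the backend choice and field types are guesses
# based on how the views use Crawler.
from peewee import SqliteDatabase, Model, CharField, DateTimeField

db = SqliteDatabase('app.db')  # hypothetical database file

class BaseModel(Model):
    class Meta:
        database = db  # every table created in db_create() shares this db

class Crawler(BaseModel):
    name = CharField()
    runnable = CharField(null=True)
    gspread_link = CharField(null=True)
    url = CharField(null=True)
    crawl_status = CharField(null=True)
    crawled_at = DateTimeField(null=True)
    created_at = DateTimeField()
    updated_at = DateTimeField()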
Example #3
def crawlers_create():
    form = CrawlersForm(request.form)
    form.validate()
    new_crawler = True
    if not form.errors:
        now = datetime.utcnow()
        crawler = Crawler.create(
            name=request.form.get('name'),
            runnable=request.form.get('runnable'),
            gspread_link=request.form.get('gspread_link'),
            # url=request.form.get('url'),
            url=None,
            crawl_status=None,
            crawled_at=None,
            created_at=now,
            updated_at=now)
        flash('Crawler was created')
        return redirect(url_for('crawlers.crawlers_list'))
    return render_template('crawler.html',
                           current_user=current_user,
                           form=form,
                           new_crawler=new_crawler)
Example #4
def crawlers():
    crawlers = Crawler.select()
    for crawler in crawlers:
        if crawler.is_runnable():
            # delete crawled pages before crawling again:
            dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
            deleted_count = dq.execute()
            pages = CrawlerPage.crawl(crawler)
    print("%s job: crawlers processed" % datetime.utcnow())
Example #5
def crawlers_summary():
    id = request.args.get('id', None)
    if id is None:
        flash('Error: id is missing for Crawler summary page!')
        return redirect(url_for('crawlers.crawlers_list'))
    c = Crawler.get(Crawler.id == id)
    pages = CrawlerPage.select().where(CrawlerPage.name == c.name).order_by(
        CrawlerPage.timestamp.desc())
    return render_template('crawlers_summary.html',
                           current_user=current_user,
                           pages=pages,
                           id=id)
Example #6
def async_spider(app, crawler_id):
    now = datetime.utcnow()
    print("%s async_spider started..." % datetime.utcnow())
    print("\tPID=%s" % os.getpid())
    print("\tcrawler_id=%s" % crawler_id)
    crawler = Crawler.get(Crawler.id == crawler_id)
    if crawler.is_runnable():
        # delete crawled pages before crawling it again:
        dq = CrawlerPage.delete().where(CrawlerPage.name == crawler.name)
        deleted_count = dq.execute()
        crawler.crawled_at = now
        crawler.crawl_status = 'crawling'
        crawler.save()
        pages_len = 0
        try:
            pages_len = CrawlerPage.crawl(crawler)
        finally:
            crawler.crawl_status = "crawled %s pages" % pages_len
            crawler.save()
            print("\tnumber of pages crawled=%s" % pages_len)
    print("%s async_spider ended" % datetime.utcnow())
Example #7
def crawlers_list():
    crawlers = Crawler.select()
    return render_template('crawlers_list.html',
                           current_user=current_user,
                           crawlers=crawlers)