Exemplo n.º 1
0
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))

    curs = conn.cursor()

    # Start by indexing the main website
    log("Starting indexing of main website")
    SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip"), True).crawl()
    conn.commit()

    # Skip id=1, which is the main site..
    curs.execute("SELECT id, hostname FROM sites WHERE id>1")
    for siteid, hostname in curs.fetchall():
        log("Starting indexing of %s" % hostname)
        GenericSiteCrawler(hostname, conn, siteid).crawl()
        conn.commit()

    curs.execute(
        "WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site"
    )
    conn.commit()

    time.sleep(1)


if __name__ == "__main__":
    # Script entry point: load the crawler configuration, then hand doit to
    # threadwrapper.
    # Local import: only needed for the early exit below.
    import sys

    # The crawl code in this file reads its settings through this
    # module-level parser (cp.get("search", ...)), so it has to be populated
    # before the crawl is started.
    cp = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed. An empty list means nothing was
    # loaded, which would otherwise only surface later as a NoSectionError.
    if not cp.read("search.ini"):
        sys.exit("Could not read configuration file search.ini")

    threadwrapper(doit)
Exemplo n.º 2
0
    curs.execute(
        "WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list"
    )
    # Indicate when we crawled
    curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
    conn.commit()

    log("Indexed %s messages" % n)
    time.sleep(1)


if __name__ == "__main__":
    # Script entry point: parse the command-line options, reject conflicting
    # ones, fill in the interval defaults and pass doit plus the options to
    # threadwrapper.
    parser = OptionParser()
    parser.add_option("-l", "--list", dest="list", help="Crawl only this list")
    parser.add_option("-m", "--month", dest="month", help="Crawl only this month")
    parser.add_option("-f", "--full", dest="full", action="store_true", help="Make a full crawl")
    parser.add_option("-t", "--status-interval", dest="status_interval", help="Seconds between status updates")
    parser.add_option("-c", "--commit-interval", dest="commit_interval", help="Messages between each commit")

    (opt, args) = parser.parse_args()

    if opt.full and opt.month:
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print("Can't use both full and specific month!")
        sys.exit(1)

    # Assign default values. The options arrive as strings (None when not
    # given); a missing, empty or zero value falls back to the default, and a
    # non-numeric value raises ValueError from int().
    opt.status_interval = int(opt.status_interval or 0) or 30
    opt.commit_interval = int(opt.commit_interval or 0) or 500

    threadwrapper(doit, opt)
Exemplo n.º 3
0
    conn = psycopg2.connect(cp.get("search", "db"))

    curs = conn.cursor()

    # Start by indexing the main website
    log("Starting indexing of main website")
    SitemapSiteCrawler("www.postgresql.org", conn, 1,
                       cp.get("search", "frontendip"), True).crawl()
    conn.commit()

    # Skip id=1, which is the main site..
    curs.execute("SELECT id, hostname, https FROM sites WHERE id>1")
    for siteid, hostname, https in curs.fetchall():
        log("Starting indexing of %s" % hostname)
        GenericSiteCrawler(hostname, conn, siteid, https).crawl()
        conn.commit()

    curs.execute(
        "WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site"
    )
    conn.commit()

    time.sleep(1)


if __name__ == "__main__":
    # Script entry point: load the crawler configuration, then hand doit to
    # threadwrapper.
    # Local import: only needed for the early exit below.
    import sys

    # The crawl code in this file reads its settings through this
    # module-level parser (cp.get("search", ...)), so it has to be populated
    # before the crawl is started.
    cp = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed. An empty list means nothing was
    # loaded, which would otherwise only surface later as a NoSectionError.
    if not cp.read("search.ini"):
        sys.exit("Could not read configuration file search.ini")

    threadwrapper(doit)
Exemplo n.º 4
0
                      "--month",
                      dest='month',
                      help="Crawl only this month")
    parser.add_option("-f",
                      "--full",
                      dest='full',
                      action="store_true",
                      help="Make a full crawl")
    parser.add_option("-t",
                      "--status-interval",
                      dest='status_interval',
                      help="Seconds between status updates")
    parser.add_option("-c",
                      "--commit-interval",
                      dest='commit_interval',
                      help="Messages between each commit")

    (opt, args) = parser.parse_args()

    if opt.full and opt.month:
        print "Can't use both full and specific month!"
        sys.exit(1)

    # assign default values
    opt.status_interval = opt.status_interval and int(
        opt.status_interval) or 30
    opt.commit_interval = opt.commit_interval and int(
        opt.commit_interval) or 500

    threadwrapper(doit, opt)