Example #1
def export_domains(site):
    db_export = CrawlerDb(site)
    db_export.connect()

    logger.info("=" * 40)
    logger.info("Processing...")
    domains = db_export.get_all_domains()
    logger.info("There are %d domains" % len(domains))
    with open(DOMAINS_FILENAME, "w") as f:
        f.write("\n".join(domains))
    logger.info("All domains saved to %s" % DOMAINS_FILENAME)
    logger.info("=" * 40)
Example #2
def export_emails(site):
    # Set up the database
    db_export = CrawlerDb(site)
    db_export.connect()

    logger.info("=" * 40)
    logger.info("Processing...")
    emails = db_export.get_all_emails()
    logger.info("There are %d emails" % len(emails))
    with open(EMAILS_FILENAME, "w") as f:
        f.write("\n".join(emails))
    logger.info("All emails saved to %s" % EMAILS_FILENAME)
    logger.info("=" * 40)
Example #3
def crawl(site, keywords, output_ui: OutputUIInterface = None):
    """
    This method will

    1) Google the keywords, and extract MAX_SEARCH_RESULTS
    2) For every result (aka website), crawl the website 2 levels deep.
        That is the homepage (level 1) and all its links (level 2).
        But if level 1 has the email, then skip going to level 2.
    3) Store the html in /data/html/ and update the database of the crawled emails

    crawl(keywords):
        Extract Google search results and put all in database
        Process each search result, the webpage:
            Crawl webpage level 1, the homepage
            Crawl webpage level 2, a link away from the homepage
            Update each crawled page in the database with has_crawled = True immediately
            Store the HTML
    """
    # Set up the database
    global db
    db = CrawlerDb(site)
    db.connect()
    logger.info("-" * 40)
    # logger.info("Keywords to Google for: %s" % keywords.decode('utf-8'))
    logger.info("Keywords to Google for: %s" % keywords)
    logger.info("-" * 40)

    # Step 1: Crawl Google Page
    # eg http://www.google.com/search?q=singapore+web+development&start=0
    # Next page: https://www.google.com/search?q=singapore+web+development&start=10
    # Google search results are paged with 10 urls each. There are also adurls
    for page_index in range(0, MAX_SEARCH_RESULTS, 10):
        query = {'q': keywords}
        url = 'http://%s/search?' % site
        url = url + urllib.parse.urlencode(query) + '&start=' + str(page_index)
        # query = {'wd': keywords}
        # url = 'http://www.baidu.com/s?' + urllib.parse.urlencode(query) + '&pn=' + str(page_index)

        try:
            data = retrieve_html(url)
            # 	print("data: \n%s" % data)
            for url in google_url_regex.findall(data):
                db.enqueue(str(url))
            for url in google_adurl_regex.findall(data):
                db.enqueue(str(url))
        except Exception as e:
            logger.error(e)

        # for url in baidu_url_regex.findall(data):
        # 	db.enqueue(str(url))
        # for url in baidu_adurl_regex.findall(data):
        # 	db.enqueue(str(url))

    # Step 2: Crawl each of the search results
    # We search up to 2 levels deep
    while True:
        # Dequeue an uncrawled webpage from the db
        uncrawled = db.dequeue()
        if uncrawled is False:
            break
        email_set = find_emails_2_level_deep(uncrawled.url, output_ui)
        if email_set:
            db.crawled(uncrawled, ",".join(list(email_set)))
            if output_ui:
                output_ui.append(list(email_set))
        else:
            db.crawled(uncrawled, None)
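These crawl() examples call a retrieve_html(url) helper that is not shown in this listing. A minimal sketch of what such a helper might look like, purely an assumption (the project's real version may differ): plain urllib with a browser-like User-Agent, since Google tends to reject the default one.

# Hypothetical stand-in for the retrieve_html() helper used above.
import urllib.request

def retrieve_html(url, timeout=10):
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (compatible; email-crawler)'}
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        # Decode permissively; crawled pages declare all sorts of charsets.
        return resp.read().decode('utf-8', errors='ignore')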
Example #4
google_adurl_regex = re.compile(r'adurl=(.*?)"')
google_url_regex = re.compile(r'url\?q=(.*?)&sa=')
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
url_regex = re.compile(r'<a\s.*?href=[\'"](.*?)[\'"].*?>')
# The commented-out url_regex below runs into 'Catastrophic Backtracking'!
# http://stackoverflow.com/questions/8010005/python-re-infinite-execution
# url_regex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

# Maximum number of search results to start the crawl
MAX_SEARCH_RESULTS = 150

EMAILS_FILENAME = 'data/emails.csv'
DOMAINS_FILENAME = 'data/domains.csv'

# Set up the database
db = CrawlerDb()
db.connect()


def crawl(keywords):
	"""
	This method will

	1) Google the keywords, and extract MAX_SEARCH_RESULTS
	2) For every result (aka website), crawl the website 2 levels deep.
		That is the homepage (level 1) and all its links (level 2).
		But if level 1 has the email, then skip going to level 2.
	3) Store the html in /data/html/ and update the database of the crawled emails

	crawl(keywords):
		Extract Google search results and put all in database
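The module-level regexes above do the extraction work. A short, self-contained illustration of what they pull out of raw HTML; the snippets below are invented for the example:

# Illustration only: invented HTML with the shape these regexes expect.
import re

google_url_regex = re.compile(r'url\?q=(.*?)&sa=')
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)

sample_serp = '<a href="/url?q=http://www.example.com/&sa=U&ved=0">Example</a>'
sample_page = '<p>Contact us at info@example.com for quotes.</p>'

print(google_url_regex.findall(sample_serp))  # ['http://www.example.com/']
print(email_regex.findall(sample_page))       # ['info@example.com']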
Example #5
google_adurl_regex = re.compile(r'adurl=(.*?)"')
google_url_regex = re.compile(r'url\?q=(.*?)&amp;sa=')
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
url_regex = re.compile(r'<a\s.*?href=[\'"](.*?)[\'"].*?>')
# The commented-out url_regex below runs into 'Catastrophic Backtracking'!
# http://stackoverflow.com/questions/8010005/python-re-infinite-execution
# url_regex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

# Maximum number of search results to start the crawl
MAX_SEARCH_RESULTS = 500

EMAILS_FILENAME = 'data/emails.csv'
DOMAINS_FILENAME = 'data/domains.csv'

# Set up the database
db = CrawlerDb()
db.connect()


def crawl(keywords):
	"""
	This method will

	1) Google the keywords, and extract MAX_SEARCH_RESULTS
	2) For every result (aka website), crawl the website 2 levels deep.
		That is the homepage (level 1) and all its links (level 2).
		But if level 1 has the email, then skip going to level 2.
	3) Store the html in /data/html/ and update the database of the crawled emails

	crawl(keywords):
		Extract Google search results and put all in database
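The crawl() variants differ mainly in MAX_SEARCH_RESULTS. Since Google pages its results 10 at a time, that constant directly determines how many search pages are requested (50 requests for 500 results). A small sketch of the URLs the paging loop generates, for illustration only:

# Illustration only: the search URLs the paging loop would request for
# MAX_SEARCH_RESULTS = 500 and placeholder keywords.
import urllib.parse

MAX_SEARCH_RESULTS = 500
keywords = 'singapore web development'
urls = [
    'http://www.google.com/search?' + urllib.parse.urlencode({'q': keywords})
    + '&start=' + str(page_index)
    for page_index in range(0, MAX_SEARCH_RESULTS, 10)
]
print(len(urls))  # 50 pages of 10 results each
print(urls[0])    # ...&start=0
print(urls[1])    # ...&start=10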
Example #6
google_adurl_regex = re.compile(r'adurl=(.*?)"')
google_url_regex = re.compile(r'url\?q=(.*?)&amp;sa=')
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
url_regex = re.compile(r'<a\s.*?href=[\'"](.*?)[\'"].*?>')
# The commented-out url_regex below runs into 'Catastrophic Backtracking'!
# http://stackoverflow.com/questions/8010005/python-re-infinite-execution
# url_regex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

# Maximum number of search results to start the crawl
MAX_SEARCH_RESULTS = 150

EMAILS_FILENAME = 'data/emails.csv'
DOMAINS_FILENAME = 'data/domains.csv'

# Set up the database
db = CrawlerDb()
db.connect()


def crawl(keywords):
	"""
	This method will

	1) Google the keywords, and extract MAX_SEARCH_RESULTS
	2) For every result (aka website), crawl the website 2 levels deep.
		That is the homepage (level 1) and all its links (level 2).
		But if level 1 has the email, then skip going to level 2.
	3) Store the html in /data/html/ and update the database of the crawled emails

	crawl(keywords):
		Extract Google search results and put all in database
Example #7
def crawl(keywords, name):
	"""
	This method will

	1) Google the keywords, and extract MAX_SEARCH_RESULTS
	2) For every result (aka website), crawl the website 3 levels deep.
		That is the homepage (level 1), all its links (level 2), and all its links' links (level 3).
	3) Store the html in /data/html/ and update the database of the crawled emails

	crawl(keywords):
		Extract Google search results and put all in database
		Process each search result, the webpage:
			Crawl webpage level 1, the homepage
			Crawl webpage level 2, a link away from the homepage
			Crawl webpage level 3, two links away from the homepage
			Update each crawled page in the database with has_crawled = True immediately
			Store the HTML
	"""
	if not internet_on():
		print("*******************NO INTERNET CONNECTION*******************")
		sys.exit(0)

	db = CrawlerDb()
	db.connect()
	logger.info("-"*40)
	logger.info("Keywords to Google for: %s" % keywords)
	logger.info("-"*40)

	# Step 1: Crawl Google Page
	# eg http://www.google.com/search?q=singapore+web+development&start=0
	# Next page: https://www.google.com/search?q=singapore+web+development&start=10
	# Google search results are paged with 10 urls each. There are also adurls
	for page_index in range(0, MAX_SEARCH_RESULTS, 10):
		query = {'q': keywords}
		url = 'http://www.google.com/search?' + urllib.urlencode(query) + '&start=' + str(page_index)
		data = retrieve_html(url)
		# 	print("data: \n%s" % data)
		# Note: the break means only the first result URL on each page is enqueued
		for url in google_url_regex.findall(data):
			db.enqueue(url)
			break

	# Step 2: Crawl each of the search results
	# We search up to 3 levels deep
	while True:
		# Dequeue an uncrawled webpage from the db
		uncrawled = db.dequeue()
		if uncrawled is False:
			break
		email_set = find_emails_3_level_deep(uncrawled.url, db)
		if email_set:
			db.crawled(uncrawled, ",".join(list(email_set)))
		else:
			db.crawled(uncrawled, None)
	write(name,db)
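Example #7 also relies on helpers that are not shown here, notably internet_on(). A minimal sketch of how such a connectivity check is commonly written, purely an assumption about the missing helper (written in Python 3 style, while Example #7 itself is Python 2 code):

# Hypothetical sketch of an internet_on() connectivity check.
import urllib.request

def internet_on(test_url='http://www.google.com', timeout=5):
    try:
        urllib.request.urlopen(test_url, timeout=timeout)
        return True
    except Exception:
        # Any network error (DNS failure, timeout, ...) counts as "offline".
        return False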