Example #1
    def load_components(self, count=0):
        if count >= 2:  # If the loader fails twice in a row, exit the engine
            custom_logger().log_message("Unable to load the engine. Exiting.",
                                        logger_handler.log_level_CRITICAL)
            print("Unable to load the engine. Exiting.")
            sys.exit(0)
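        # Try to restore the previously pickled PageRank graph and indexer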
        graph_obj = pagerank().unpickle_graph()
        indexer_obj = indexer().unpickle_indexer()
        if graph_obj is not None and indexer_obj is not None:
            lda_obj = lda()
            lda_obj.load_model()
            return graph_obj, indexer_obj, lda_obj

        if count > 0:
            custom_logger().log_message(
                "Unexpected error while trying to launch engine. Trying again.",
                logger_handler.log_level_ERROR)
            print(
                "Unexpected error while trying to launch engine. Trying again."
            )

        # Prompt the user to rebuild the models by running the crawler again
        val = input(
            "Processed information not found. Do you want to crawl again? (y/n) "
        )
        val = val.lower()
        if val == 'y' or val == 'yes':
            try:
                print(
                    "Launching web crawler. This operation will take a while to complete."
                )
                crawler()
                print(
                    "Web crawler operations completed! Launching components.")
                return self.load_components(count + 1)
            except Exception as e:
                custom_logger().log_message(
                    "Unexpected error while crawling the web. Exiting.\n" +
                    str(e), logger_handler.log_level_CRITICAL)
                print("Unexpected error while crawling the web. Exiting.")
                sys.exit(0)
        else:
            custom_logger().log_message(
                "Unable to load the search engine without the required components. Exiting.",
                logger_handler.log_level_CRITICAL)
            print(
                "Unable to load the search engine without the required components. Exiting."
            )
            sys.exit(0)
Example #2
def populateData(start_date, end_date):
    events = crawler(start_date, end_date)
    events = processGeoInfo(events)

    # populate data into db
    for e in events:
        try:
            add_activity(db_handler, e['Name'], e['CreatorId'], e['Location'],
                         e['latlon']['lat'], e['latlon']['lng'], e['Date'],
                         e['Time'], e['Description'], e['Tag'])
        except Exception:
            # Skip events that fail to insert and move on to the next one
            continue
Example #4
def facebook_crawl():
    for i in range(20):
        try:
            c = crawler(0)
            print 'Crawler object created'
            print '              CRAWL_STARTING'
            c.start()
        except Exception as e:
            print e
            try:
                del c
            except:
                pass
            time.sleep(20)
Example #5
def work_thread():
    #print('enter',threading.current_thread().name)
    global detail_urls
    while detail_urls:
        url = detail_urls.get_random_item()

        try:
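            # Sleep for a short random interval, then fetch the page through a proxy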
            sleep_time = random.random()*6
            time.sleep(sleep_time)

            cra = crawler()
            #print('crawler url:%s'%url)
            flag, res = cra.get_by_proxy(url=url, call_back=get_pagedetail_callback)
            if flag:
                print(threading.current_thread().name,'get detail done')
                after_process(url,res)
            else:
                print(threading.current_thread().name,'get detail failed')

        except Exception as e:
            print(threading.current_thread().name, 'get detail failed, error ending')
            print(threading.current_thread().name, e)
Example #6
def run_crawler(database_file, url_file, crawler_id, number_processes):
    bot = crawler(database_file, url_file, crawler_id, number_processes)
    bot.crawl(depth=1)
Example #7
import MySQLdb
from crawler import *


if __name__ == '__main__':
    

    crawl = crawler()
    
    rep = 'Y'
    while rep == 'y' or rep == 'Y':
        print ""
        print "-----------------------------------------------------------------------------"
        print "1. Add New Website"
        print "2. Delete Website"
        print "3. Parse News (Just for Demo)"
        print "4. Parse All News from existing xml(s)"
        print "-----------------------------------------------------------------------------"

        choice = int(raw_input("\t\t\t\tEnter Your Choice\n-----------------------------------------------------------------------------\n"))
        print "-----------------------------------------------------------------------------"
        if choice == 1:
            
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawl.add_web(link)
            
        elif choice == 2:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawl.delete_web(link)

        elif choice == 3:
Example #8
import socket
import sys
from crawler import *
import string
server_address = ('localhost', 10003)

# Create a TCP/IP socket
socks = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Connect the socket to the port where the server is listening
print >> sys.stderr, 'connecting to %s port %s' % server_address
socks.connect(server_address)

x = 1
data = 'aditya'
while x == 1:
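    # Read a 2-character length prefix, then receive a payload of that many bytes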
    l = socks.recv(2)
    length = int(l)
    data = socks.recv(length)
    urls = crawler(data)
    length = len(urls)
    socks.send(chr(length))
    socks.send(urls)
    print data
    x = 2
socks.close()
Example #9
File: tests.py  Project: sanglech/CSC326
def test_emptyUrlList(self):
    self.bot = crawler(None, "empty.txt ")
    self.bot.crawl(depth=1)
    self.assertFalse(self.bot == -1, 'this text file exists!')
Example #10
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from crawler import *
import json, sys
# example : crawler(['-b', 'PublicServan', '-i', '100', '101'])
argvs = sys.argv[1:]
filename = crawler(argvs)
with open('ptt.kcm', 'w', encoding='utf-8') as f:
    for article in json.load(open(filename, 'r'))['articles']:
        f.write(article.get('content', '') + '\n')
Example #11
def setUp(self):
    self.con = lite.connect("dbFile.db")
    self.bot = crawler(self.con, "urls.txt")
    self.bot.crawl(depth=1)
    self.bot.generate_page_ranks(self.bot._links)
    self.curs = self.con.cursor()
Example #12
from crawler import *
from pagerank import page_rank
import redis
import time

# Get crawler object and crawl on urls found in urls.txt
crawler = crawler(None, 'urls.txt')
start = time.time()
crawler.crawl()
print "Elapsed Time: %s" % (time.time() - start)
# Get the data structures generated by the crawler
lexicon = crawler.get_lexicon()
inverted_index = crawler.get_inverted_index()
resolved_inverted_index = crawler.get_resolved_inverted_index()
document_index = crawler.get_document_index()
# Run pagerank on the links generated by the crawler
pagerank = page_rank(crawler._links)
# Store data on persistent storage i.e. Redis
rdb = redis.Redis()
rdb.flushdb()

all_words = ''

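# Store each lexicon entry under a 'lexicon:' key and build a comma-separated list of all words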
for word in lexicon:
    rdb.set('lexicon:' + str(word), lexicon[word])
    all_words = all_words + str(word) + " , "

rdb.set('all_words', all_words.strip(' , '))

for word_id in inverted_index:
    rdb.set('inverted_index:' + str(word_id),
Example #13
def course_info():

    # build a request object
    # get related important factors from query result
    req = request.get_json(force=True)
    query_result = req.get('queryResult')
    # debug use 
    # print(query_result)
    action = query_result.get('action')
    params = query_result.get('parameters')
    intents = query_result.get('intent').get('displayName')

    # Extract the course id for every intent except the knowledge-base lookup
    if intents != 'askKB':
        CourseId = params.get('CourseId')
        # CourseId = re.sub(r'([a-zA-Z]{4}) - ([0-9]) - ([0-9]{3})', r'\1\2\3', CourseId)

    # Each intent produces a different feedback message
    if intents == "courseInfo":
        info = get_data(CourseId)
        message = "\n".join(i+': '+str(info[i]) for i in info.keys())
        # debug use
        # print("message:", message)

    elif intents == 'courseInfo - custom':
        message = 'For more information about this course, click handbook link: ' + crawler(CourseId, 'handbook Link')

    elif intents == "Overview":
        message = crawler(CourseId, intents)

    elif intents == 'Timetable':
        if check_t(CourseId):
            message = 'You can click this link to see timetable: ' + crawler(CourseId, 'timetable Link')
        else:
            message = 'This course is not offered in this term!'

    elif intents == "Lecturer":
        if check_t(CourseId):
                message = "The Lecturer of this course is: " + crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'

    elif intents == 'Census':
        if check_t(CourseId):
            message = 'The Census Date is: ' + crawler(CourseId, 'timetable')['Census Date'] + ', which is the last day to drop this course.'
        else:
            message = 'This course is not offered in this term!'

    elif intents == 'Enrols':
        if check_t(CourseId):
            message = crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'

    elif intents == 'Status':
        if check_t(CourseId):
            message = "The status of this course is: " + crawler(CourseId, intents)
        else:
            message = 'This course is not offered in this term!'

    elif intents == 'Faculty':
        faculty = crawler(CourseId, intents)
        school = crawler(CourseId, 'School')
        message = "Faculty: " + faculty + ", School: " + school

    elif intents == 'OfferTerms':
        message = "This course is offered in " + crawler(CourseId, 'Offering Terms')

    elif intents == 'Prerequisite':
        prerequisite = crawler(CourseId, intents)
        if prerequisite:
            message = 'The Prerequisites of this course are: ' + prerequisite
        else:
            message = 'There is no prerequisite for this course.'

    elif intents == 'Study Level':
        message = "The Study Level of this course is: "+ crawler(CourseId, intents)

    elif intents == 'Unit of Credit':
        message = "Unit of Credit for this course is: " + crawler(CourseId, intents)

    elif intents == 'askKB':
        message = manager.find(params['Concept'])

    return {'fulfillmentText': message}    
Example #14
File: tests.py  Project: sanglech/CSC326
def test_urllistDoesNotExist(self):
    self.bot = crawler(None, "bad.txt ")
    self.bot.crawl(depth=1)
    self.assertFalse(self.bot == -1, 'this text file exists!')
Example #15
import MySQLdb
from crawler import *


if __name__ == '__main__':
    crawlerObj = crawler()
    
    rep = 'Y'
    while rep == 'y' or rep == 'Y':
        print ""
        print "-----------------------------------------------------------------------------"
        print "1. Add New Website"
        print "2. Delete Website"
        print "3. Parse News (Just for Demo)"
        print "4. Parse All News from existing xml(s)"
        print "-----------------------------------------------------------------------------"

        choice = int(raw_input("\t\t\t\tEnter Your Choice\n-----------------------------------------------------------------------------\n"))
        print "-----------------------------------------------------------------------------"
        if choice == 1:
            
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawlerObj.add_web(link)
            
        elif choice == 2:
            link = raw_input('\t\t\t\tEnter a link\n-----------------------------------------------------------------------------\n')
            crawlerObj.delete_web(link)

        elif choice == 3:
            crawlerObj.parse_one('http://rss.cnn.com/rss/edition_sport.rss')
Example #16
# CSC326 Lab 3

# Python script that runs the crawler and the PageRank algorithm,
# then pretty-prints PageRank scores in greatest-to-least order

# Note: this script assumes that urls.txt (containing the list of URLs to be
# crawled) and pagerank.py are in the current directory

# See crawler.py for function implementations
from crawler import *
import pprint

if __name__ == "__main__":
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=1)

    pprint.pprint(bot.get_sorted_pagerank())
    pprint.pprint(bot.get_sorted_pagerank_url())
Example #17
import threading
from queue import Queue
from parsing import *
from demo import *
from general import *
from crawler import *
from scrape import *

PROJECT_NAME = 'TheNewBoston'
HOMEPAGE = 'https://www.patreon.com/thenewboston'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.text'
CRAWLED_FILE = PROJECT_NAME + '/crawled.text'
NUMBER_OF_THREADS = 8
queue = Queue()
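# Initialize the crawler for this project; the worker threads below process queued URLs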
crawler(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        crawler.crawl_page(threading.current_thread().name, url)