Example #1
class Page():

    def __init__(self, link, response):
        self.link = link
        self.frontier = Frontier(pop='random')
        self.response = response
        self.read_resp = self.response.read()
        self.soup = BeautifulSoup(self.read_resp)
        self.body = self.soup.body
        self.get_links()

    def __str__(self):
        return self.link

    def get_links(self):
        try:
            article = self.body.find("div", {"id": "mw-content-text"})
            for link in article.findAll('a'):
                try:
                    if self.is_valid_link(link['href']):
                        self.frontier.append(link['href'])
                        print(link['href'])
                except KeyError:
                    continue  # anchor without an href attribute
        except AttributeError:
            print("No body")

    def is_valid_link(self, link):
        if re.match('/wiki/.*', link) and ":" not in link:
            return link
Example #2
class Crawler:
    '''
    This Crawler class crawls the website and extracts the text and the links on each page
    '''
    def __init__(self):
        self.frontier = Frontier()
        self.count = 0
        self.last_domain = ''
        self.store = Store()

    def crawl(self):
        '''
        pop a URL from the frontier, fetch its header, HTML, text and out-links, push the out-links
        back into the frontier, and insert the document into Elasticsearch
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)

                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch(
                            '*', url)):
                    continue

            except Exception as e:
                print('current_domain_exception: {}'.format(e))
                continue

            print('current url {}'.format(url))

            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print('downloader exception: {}'.format(e))
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print(e)
                continue
Example #3
class Crawler:
    '''
    This Crawler class crawls the website and extracts the text and the links on each page
    '''

    def __init__(self):
        self.frontier = Frontier()
        self.count = 0
        self.last_domain = ''
        self.store = Store()

    def crawl(self):
        '''
        pop a URL from the frontier, fetch its header, HTML, text and out-links, push the out-links
        back into the frontier, and insert the document into Elasticsearch
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)

                if current_domain in self.frontier.robot_dict and not (self.frontier.robot_dict[current_domain].can_fetch('*', url)):
                    continue

            except Exception as e:
                print('current_domain_exception: {}'.format(e))
                continue

            print('current url {}'.format(url))

            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print('downloader exception: {}'.format(e))
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print(e)
                continue
Example #4
 def __init__(self):
     self.seed_urls = None
     self.frontier = Frontier()
     self.canonicalizer = Canonicalizer()
     self.all_links = None
     self.crawled_links = set()
     self.count = 0
     self.all_out_links = {}
     self.redirected_map = {}
     self.robots = {}
     self.robots_delay = {}
     self.robots_timer = {}
     self.time_out = 3
     self.total_count = 40000
Example #5
    def pick_clusters(self, clusters, nonleaves):
        """
    """
        _logger.debug("compute initial cluster errors. %d clusters",
                      len(clusters))
        start = time.time()
        for c in clusters:
            c.error = self.influence_cluster(c)
            c.c_range = list(self.c_range)
            c.inf_func = self.create_inf_func(c)
        self.stats['init_cluster_errors'] = [time.time() - start, 1]

        self.update_status("computing frontier")
        _logger.debug("compute initial frontier")
        frontier, _ = Frontier(self.c_range, 0.001)(clusters)

        ret = list(frontier)
        _logger.debug("get nonleaves containing frontier")
        for nonleaf in nonleaves:
            for c in frontier:
                if nonleaf.contains(c):
                    nonleaf.error = self.influence_cluster(nonleaf)
                    ret.append(nonleaf)
                    break

        self.update_status("expanding frontier (%d rules)" % len(ret))
        _logger.debug("second merger pass")
        return ret
Example #6
 def __init__(self, link, response):
     self.link = link
     self.frontier = Frontier(pop='random')
     self.response = response
     self.read_resp = self.response.read()
     self.soup = BeautifulSoup(self.read_resp)
     self.body = self.soup.body
     self.get_links()
Example #7
def greedy(matrix, start, goal):
    """
    Find the path from start to the goal using Greedy Best-first Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()
    partially_expanded = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, manhattan_dist(start, goal))

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_distance = frontier.nearest
        if current == goal:
            print('Analytics: ' + str(len(partially_expanded)) +
                  ' expanded nodes out of ' + str(count_nodes(matrix)) +
                  ' nodes , among which ' + str(len(visited)) +
                  ' are fully expanded (all successors evaluated)')
            return reconstruct_path(came_from, current)

        partially_expanded.add(current)
        is_interrupted = False
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                neighbor_distance = manhattan_dist(neighbor, goal)
                if neighbor not in frontier:  # Discover a new node
                    came_from[neighbor] = current
                    frontier.add(neighbor, neighbor_distance)
                    if current_distance > neighbor_distance:
                        is_interrupted = True
                        break

        if not is_interrupted:
            frontier.pop_nearest()
            visited.add(current)

    return None
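
The snippet above (like the A* variant in Example #31 below) relies on helpers that are not shown: manhattan_dist, expand, count_nodes and reconstruct_path. A minimal sketch of what they might look like, assuming the grid is a 2D list whose truthy cells are walkable and that nodes are (row, col) tuples (these encodings are assumptions, not taken from the original project):

# Hypothetical helpers, inferred only from how the search functions call them.
def manhattan_dist(a, b):
    # L1 distance between two (row, col) tuples.
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

def expand(node, matrix):
    # Yield the walkable 4-neighbours of a node (walkable = truthy cell, assumed).
    row, col = node
    for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
        r, c = row + dr, col + dc
        if 0 <= r < len(matrix) and 0 <= c < len(matrix[r]) and matrix[r][c]:
            yield (r, c)

def count_nodes(matrix):
    # Total number of walkable cells, used only for the analytics printout.
    return sum(1 for row in matrix for cell in row if cell)

def reconstruct_path(came_from, current):
    # Walk the came_from chain back to the start and reverse it.
    path = [current]
    while current in came_from:
        current = came_from[current]
        path.append(current)
    return list(reversed(path))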
Example #8
    def __init__(self, num_threads, seeds, cont_to_crawl):
        self.num_workers = num_threads
        self.dash = Dashboard(num_threads)
        self.workers = []
        self.frontier = Frontier(num_threads, self.dash)
        self.db = Storage()

        # Create the workers
        for i in range(num_threads):
            self.workers.append(CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
        # print("Workers created")
        # insert the seeds into the to-serve queue
        if not cont_to_crawl:
            self.frontier.push_to_serve(seeds, 0)
            # print("seeds pushed")
            self.frontier.distribute()
            # print("seeds distributed")
        else:
            self.frontier.load_to_crawl()
Example #9
def main():
    from frontier import Frontier
    from options import parse_command_line

    parse_command_line()

    ft = Frontier([
        ('http://localhost/', 1),
    ])
    Master(ft).start()
    IOLoop.instance().start()
Example #10
    def _a_star_search(self, heuristic_func):
        if self.state == Puzzle.goal_state:
            return [self.state], 0

        # heap of tuple(path_total_cost, path)
        # path_total_cost = path_cost + heuristic
        frontier = Frontier()
        frontier.push((heuristic_func(self.state), [self.state]))
        explored_list = []
        node_gen = 0

        while True:
            if frontier.empty():
                break

            # pop state with min cost from the frontier
            cur_path_total_cost, cur_path = frontier.pop()
            cur_state = cur_path[-1]

            # check lazy delete
            if cur_state in explored_list:
                continue

            # test goal condition
            if cur_state == Puzzle.goal_state:
                return cur_path, node_gen

            # add current state to explored list
            explored_list.append(cur_state)

            # get all neighbours of current state in asc order
            neighbours = Puzzle._get_neighbours(cur_state)

            for neighbour in neighbours:
                if neighbour not in explored_list:
                    # new path to go to a neighbour
                    path_to_neighbour = cur_path.copy()
                    path_to_neighbour.append(neighbour)

                    # calc path_total_cost (include heuristic)
                    path_to_neighbour_total_cost = cur_path_total_cost - heuristic_func(cur_state) \
                                                    + 1 + heuristic_func(neighbour)

                    node_gen += 1

                    # if neighbour already in frontier or not
                    # -> use lazy delete
                    frontier.push(
                        (path_to_neighbour_total_cost, path_to_neighbour))

        return None, node_gen
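
Here the Frontier is only exercised through push((cost, path)), pop() and empty(); a minimal heap-based sketch that satisfies those calls (an assumption, not the project's actual implementation) could be:

import heapq

class Frontier:
    """Minimal priority queue over (cost, payload) pairs; lowest cost pops first."""

    def __init__(self):
        self._heap = []
        self._counter = 0  # tie-breaker so payloads never need to be comparable

    def push(self, item):
        cost, payload = item
        self._counter += 1
        heapq.heappush(self._heap, (cost, self._counter, payload))

    def pop(self):
        cost, _, payload = heapq.heappop(self._heap)
        return cost, payload

    def empty(self):
        return not self._heap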
Example #11
    def __init__(self):
        self.url_waiting_queue = mp.Queue()
        self.url_result_queue = mp.Queue()

        self.config = Config()
        self.frontier = Frontier(self.config)
        self.agents = []
        for id in range(self.config.agentCount):
            self.agents.append(Agent(self.config, id))

        #setting seed url on waiting queue
        self.url_waiting_queue.put(self.config.seedURL)
Example #12
class Controller:
    """The Controller of the hole Crawling process"""

    def __init__(self, num_threads, seeds, cont_to_crawl):
        self.num_workers = num_threads
        self.dash = Dashboard(num_threads)
        self.workers = []
        self.frontier = Frontier(num_threads, self.dash)
        self.db = Storage()

        # Create the workers
        for i in range(num_threads):
            self.workers.append(CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
        # print("Workers created")
        # insert the seeds into the to-serve queue
        if not cont_to_crawl:
            self.frontier.push_to_serve(seeds, 0)
            # print("seeds pushed")
            self.frontier.distribute()
            # print("seeds distributed")
        else:
            self.frontier.load_to_crawl()

    def run(self):
        """The main Program"""
        try:
            for i in range(self.num_workers):
                self.workers[i].start()
            # print("All Workers started")
            self.saver_to_crawl = PeriodicThread(self.frontier.save_to_crawl, 3600.0)
            self.saver_to_crawl.start()
            while True:
                self.frontier.distribute()
        except:
            self.dash.print_frontier_stat("saving before exit")
            self.frontier.save_to_crawl()
Example #13
    def wake_generative_with_pyccg(self,
                    grammar, tasks, 
                    maximumFrontier=None,
                    enumerationTimeout=None,
                    CPUs=None,
                    solver=None,
                    evaluationTimeout=None):
        """
        Dreamcoder wake_generative using PYCCG enumeration to guide exploration.

        Enumerates from PyCCG with a timeout and blindly from the EC grammar.
        Updates PyCCG using both sets of discovered meanings.
        Converts the meanings into EC-style frontiers to be handed off to EC.
        """
        # Enumerate PyCCG meanings and update the word learner.
        pyccg_meanings = {t : [] for t in tasks}
        if self.use_pyccg_enum:
            pyccg_meanings = self._update_pyccg_with_distant_batch(tasks, enumerationTimeout)
       
        # Enumerate the remaining tasks using EC-style blind enumeration.
        unsolved_tasks = [task for task in tasks if len(pyccg_meanings[task]) == 0]
        fallback_frontiers, fallback_times = [], None
        if self.use_blind_enum:
            fallback_frontiers, fallback_times = multicoreEnumeration(grammar, unsolved_tasks, 
                                                       maximumFrontier=maximumFrontier,
                                                       enumerationTimeout=enumerationTimeout,
                                                       CPUs=CPUs,
                                                       solver=solver,
                                                       evaluationTimeout=evaluationTimeout)

        # Log enumeration results.
        print("PyCCG model parsing results")
        self._describe_pyccg_results(pyccg_meanings)
        print("Non-language generative model enumeration results:")
        print(Frontier.describe(fallback_frontiers))

        # Update PyCCG model with fallback discovered frontiers.
        self._update_pyccg_with_supervised_batch(fallback_frontiers) # TODO(catwong, jgauthier): does not yet update.

        # Convert and consolidate PyCCG meanings and fallback frontiers for handoff to EC.
        pyccg_frontiers = self._pyccg_meanings_to_ec_frontiers(pyccg_meanings)
        fallback_frontiers = {frontier.task : frontier for frontier in fallback_frontiers}
        all_frontiers = {t : pyccg_frontiers[t] if t in pyccg_frontiers else fallback_frontiers[t] for t in tasks}
        all_times = {t : enumerationTimeout if t in pyccg_frontiers else fallback_times[t] for t in tasks}

        return list(all_frontiers.values()), all_times
Example #14
def main():

    corpus = Corpus(
        output="stack_without_dupes/result-{}.csv".format(CORPUS_SIZE))
    frontier = Frontier(corpus,
                        10,
                        8,
                        duplicate_identification=True,
                        verbose=VERBOSE,
                        debug=DEBUG)

    crawler = Crawler(SEEDS,
                      corpus,
                      frontier,
                      corpuse_max_size=CORPUS_SIZE,
                      duplicate_identification=True,
                      verbose=VERBOSE,
                      debug=DEBUG)
    print("Starting at {}".format(datetime.now()))
    crawler.start()
    print("Done at {}".format(datetime.now()))
Example #15
def reweightbasegrammar(basegrammar,
                        pseudoCounts,
                        filter_depth=None,
                        size=None):
    frontiers = []
    for datum in islice(
            batchloader('train',
                        batchsize=1,
                        compute_sketches=False,
                        filter_depth=filter_depth), size):  #TODO
        #class Task(object):
        #def __init__(self, name, request, examples, features=None, cache=False):
        frontiers.append(
            Frontier([
                FrontierEntry(datum.p,
                              logPrior=basegrammar.logLikelihood(
                                  datum.tp, datum.p),
                              logLikelihood=0)
            ], Task('dummyName', datum.tp, [])))

    return basegrammar.insideOutside(frontiers, pseudoCounts, iterations=1)
Example #16
    def _pyccg_meanings_to_ec_frontiers(self, pyccg_meanings):
        """
        Ret:
            pyccg_frontiers: dict from task -> Dreamcoder frontiers.
        """
        pyccg_frontiers = {}
        for task in pyccg_meanings:
            if len(pyccg_meanings[task]) > 0:
                frontier_entries = []
                for (meaning, log_prob) in pyccg_meanings[task]:
                    ec_sexpr = self.pyccg_learner.ontology.as_ec_sexpr(meaning)
                    if self.ec_ontology_translation_fn:
                        ec_sexpr = self.ec_ontology_translation_fn(ec_sexpr, is_pyccg_to_ec=True)

                    # Uses the p=1.0 likelihood for programs that solve the task.
                    frontier_entry = FrontierEntry(
                        program=Program.parse(ec_sexpr),
                        logPrior=log_prob, 
                        logLikelihood=0.0)
                    frontier_entries.append(frontier_entry)

                pyccg_frontiers[task] = Frontier(frontier_entries, task)
        return pyccg_frontiers
Example #17
import datetime
import hashlib
from colorama import Style
from colorama import Fore
from colorama import init
from tldextract import extract
import time
import traceback
import sys

init()



# Frontier object for frontier interaction
frontier = Frontier()

domains = ["gov.si", "evem.gov.si", "e-uprava.gov.si", "e-prostor.gov.si"]
urls = ["https://www.gov.si", "http://evem.gov.si/evem/drzavljani/zacetna.evem", "https://e-uprava.gov.si/", "https://www.e-prostor.gov.si/"]

allowed_domain = 'gov.si'
type_codes = {
	'application/msword' : 'doc',
	'application/vnd.openxmlformats-officedocument.wordprocessingml.document' : 'docx',
	'application/pdf' : 'pdf',
	'application/vnd.ms-powerpoint' : 'ppt',
	'application/vnd.openxmlformats-officedocument.presentationml.presentation' : 'pptx',
	'text/html' : 'html'
}
request_rate_sec = 5
user_agent = "fri-ieps-nasagrupa"
Example #18
    "http://www.dce.harvard.edu",
    "http://hsdm.harvard.edu",
    "http://www.fas.harvard.edu",
    "http://hds.harvard.edu",
    "http://www.gsd.harvard.edu",
    "http://www.gse.harvard.edu",
    "http://www.gsas.harvard.edu",
    "http://www.seas.harvard.edu",
    "https://www.hks.harvard.edu",
    "http://hls.harvard.edu",
    "http://www.radcliffe.harvard.edu",
    "http://hms.harvard.edu",
    "https://www.hsph.harvard.edu"
]

frontier = Frontier(SEEDS)
crawler = Crawler()

FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page"
FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier"
# frontier.restore(open(FRONTIER_BACKUP))

crawled = 0
MIN_CRAWL = 35000

purl = None
DOMAIN_TIMESTAMP = {}

while not frontier.empty() and crawled < MIN_CRAWL:
    if crawled % 100 == 0:
        print str(crawled) + " pages crawled   " + str(
Example #19
class RouteFinder(object):
    def __init__(self, space):
        self.__space = space
        self.__frontier = Frontier()
        self.__goal = None
        self.__origin = None

    def __calculate_distance(self, from_xy, to_xy):
        """
        Calculate the distance between the given coordinates using MATH.

        :param from_xy: a tuple of the form (x, y)
        :param to_xy: a tuple of the form (x, y)
        :returns: float
        """
        diff_x = math.fabs(from_xy[0] - to_xy[0])
        diff_y = math.fabs(from_xy[1] - to_xy[1])
        return math.sqrt(math.pow(diff_x, 2) + math.pow(diff_y, 2))

    def __initialize_goal(self, goal_id):
        """
        Initialise the goal node, identified by the given ID.

        :param goal_id: the goal id
        """
        self.__goal = self.__space.nodes[goal_id]
        self.__goal.distance_to_goal = 0

    def __initialize_origin(self, origin_id):
        """
        Initialise the origin node, identified by the given ID.

        :param origin: the origin id
        """
        self.__origin = self.__space.nodes[origin_id]
        self.__origin.route_distance = 0

    def __add_to_frontier(self, from_node, to_node):
        """
        Attempt to add the "to" node to the frontier.

        :param from_node: the previous node
        :param to_node: the next node
        """
        if to_node.visited:
            return

        if from_node is None:
            distance_to_node = 0
        else:
            distance_to_node = self.__calculate_distance(
                (from_node.x, from_node.y),
                (to_node.x, to_node.y)) + from_node.distance_to_here

        # Have we found a shorter route to this node?
        if distance_to_node < to_node.distance_to_here:
            to_node.distance_to_here = distance_to_node
            to_node.via = from_node

        if math.isinf(to_node.distance_to_goal):
            # Always low-ball the estimated distance, otherwise A* won't work
            to_node.distance_to_goal = self.__calculate_distance(
                (to_node.x, to_node.y), (self.__goal.x, self.__goal.y)) * 0.9

        self.__frontier.add_or_update(to_node.id,
                                      to_node.total_distance_to_goal)

    def __get_route_ids(self):
        """
        Get the IDs of the best route from the origin to the goal.

        :returns: a list of node IDs
        """
        route = []
        node = self.__goal
        while node is not None:
            route.append(node.id)
            node = node.via
        return list(reversed(route))

    def __find_route(self):
        """
        Find the shortest route between the origin and the goal.

        :returns: a list of IDs representing the steps from the origin to the goal, or None
        """
        active_node = self.__space.nodes[self.__frontier.remove()]

        # Exit conditions
        if active_node is None:
            return None
        if active_node == self.__goal:
            return self.__get_route_ids()

        active_node.visited = True

        # Add the linked nodes to the frontier
        for linked_id in self.__space.find_linked_nodes(active_node.id):
            if linked_id not in self.__space.nodes:
                # Something is awry with the map data
                return None
            self.__add_to_frontier(active_node, self.__space.nodes[linked_id])

        return self.__find_route()

    def find_route(self, origin_id, goal_id):
        """
        Find the shortest route between the specified nodes.

        :param origin_id: the ID of the "origin" node
        :param goal_id: the ID of the "goal" node
        :returns: a list of IDs representing the steps from the origin to the goal, or None
        """
        if origin_id not in self.__space.nodes or goal_id not in self.__space.nodes:
            return None

        if origin_id == goal_id:
            return [origin_id]

        self.__space.reset()
        self.__frontier.reset()
        self.__initialize_origin(origin_id)
        self.__initialize_goal(goal_id)
        self.__add_to_frontier(None, self.__origin)

        return self.__find_route()
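
RouteFinder never constructs its own nodes, so their shape has to be inferred from the attributes it touches (id, x, y, visited, via, distance_to_here, distance_to_goal, total_distance_to_goal, route_distance). A hypothetical Node that would satisfy that usage, assuming total_distance_to_goal is simply the A* sum of cost so far and heuristic:

import math

class Node:
    """Hypothetical node shape inferred from the attributes RouteFinder reads and writes."""

    def __init__(self, node_id, x, y):
        self.id = node_id
        self.x = x
        self.y = y
        self.visited = False
        self.via = None                   # previous node on the best known route
        self.distance_to_here = math.inf  # best known cost from the origin
        self.distance_to_goal = math.inf  # heuristic estimate to the goal
        self.route_distance = math.inf

    @property
    def total_distance_to_goal(self):
        # A* priority: cost so far plus the heuristic estimate.
        return self.distance_to_here + self.distance_to_goal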
Example #20
 def __init__(self, space):
     self.__space = space
     self.__frontier = Frontier()
     self.__goal = None
     self.__origin = None
Example #21
class Crawler:
    def __init__(self):
        self.seed_urls = None
        self.frontier = Frontier()
        self.canonicalizer = Canonicalizer()
        self.all_links = None
        self.crawled_links = set()
        self.count = 0
        self.all_out_links = {}
        self.redirected_map = {}
        self.robots = {}
        self.robots_delay = {}
        self.robots_timer = {}
        self.time_out = 3
        self.total_count = 40000

    def initialize(self, seed_urls):
        self.all_links = set(seed_urls)
        self.seed_urls = seed_urls
        self.frontier.initialize(seed_urls)

    def crawl_control(self):
        file_io.initialize_log()

        current_wave = 0
        while True:
            # if empty, move to next wave
            if self.frontier.is_empty():
                self.frontier.change_wave(current_wave + 1)
            # if still empty, finished
            if self.frontier.is_empty():
                self.finish()
                return "Finished"
            current_wave, score, url = self.frontier.frontier_pop()

            # get protocol, domain
            domain = self.canonicalizer.get_domain(url)

            # check robots.txt
            if domain not in self.robots:
                try:
                    robots = Robots("http://" + domain + "/robots.txt")
                    self.robots[domain] = robots
                    if robots.delay > self.time_out:
                        self.robots_delay[domain] = self.time_out
                    else:
                        self.robots_delay[domain] = robots.delay
                    self.robots_timer[domain] = datetime.now()
                except Exception as e:
                    error = "Read robots.txt error:\n{0}\nError: {1}\n\n".format(
                        "http://" + domain + "/robots.txt", e)
                    file_io.write_error_info(error)
                    continue

            delay = self.robots_delay[domain]

            # check if can fetch
            if not self.robots[domain].can_fetch(url):
                not_allowed = "Not Allowed: {}\n".format(url)
                print(not_allowed)
                file_io.write_not_allowed(not_allowed)
                continue
            else:
                # politeness
                since_last_crawl = datetime.now() - self.robots_timer[domain]
                if since_last_crawl.total_seconds() < delay:
                    time.sleep(delay - since_last_crawl.total_seconds())
                print("Current: " + url)
                file_io.write_current_link(url)
                # print time interval
                # print((datetime.now() - self.robots_timer[domain]).total_seconds())

                # get page header
                try:
                    url_head = self.get_head(url)
                    if url_head.status_code == 404:
                        error = "Status error:\n{0}\nError code: {1}\n\n".format(
                            url, url_head.status_code)
                        file_io.write_error_info(error)
                        continue
                except Exception as e:
                    error = "Read head error:\n{0}\nError: {1}\n\n".format(
                        url, e)
                    file_io.write_error_info(error)
                    self.robots_timer[domain] = datetime.now()
                    continue
                header = dict(url_head.headers)

                # get content type
                if "content-type" in url_head.headers:
                    content_type = url_head.headers["content-type"]
                else:
                    content_type = "text/html"
                # crawl html type
                if "text/html" not in content_type:
                    continue
                else:
                    # read page
                    try:
                        soup, raw_html, base_url, lang = self.get_page(url)
                        self.robots_timer[domain] = datetime.now()
                        # whether we should crawl, language, black list
                        if not self.page_should_crawl(base_url, lang):
                            continue
                        # multiple redirected url
                        if base_url in self.crawled_links:
                            self.frontier.objects[base_url].in_links.update(
                                self.frontier.objects[url].in_links)
                            error = "Multiple redirected URL:\nURL: {0}\nRedirected URL: {1}\n\n".format(
                                url, base_url)
                            file_io.write_error_info(error)
                            continue
                        else:
                            self.crawled_links.add(base_url)
                            frontier_item = FrontierItem(base_url)
                            frontier_item.in_links = self.frontier.objects[
                                url].in_links
                            self.frontier.objects[base_url] = frontier_item
                            self.redirected_map[url] = base_url
                    except Exception as e:
                        error = "Read page error:\n{0}\nError: {1}\n\n".format(
                            url, e)
                        file_io.write_error_info(error)
                        self.robots_timer[domain] = datetime.now()
                        continue

                    raw_out_links = self.get_out_links(soup)
                    out_links = []

                    # write as ap format
                    text = self.extract_text(soup)
                    if len(soup.select("title")) != 0:
                        title = soup.select("title")[0].get_text()
                    else:
                        title = None
                    file_io.write_ap(base_url, text, header, title)
                    file_io.write_raw_html({base_url: raw_html})

                    for link in raw_out_links:
                        processed_link = self.canonicalizer.canonicalize(
                            base_url, domain, link)
                        file_io.write_canonicalization(link, processed_link)
                        # if link is not empty
                        if len(processed_link) != 0:
                            out_links.append(processed_link)
                            if processed_link not in self.all_links:
                                # new frontier item
                                frontier_item = FrontierItem(
                                    processed_link, link)
                                frontier_item.update_in_links(base_url)

                                self.frontier.frontier_put(
                                    frontier_item, current_wave + 1)
                                self.all_links.add(processed_link)
                            else:
                                # update in links
                                if processed_link in self.redirected_map:
                                    redirected = self.redirected_map[
                                        processed_link]
                                    self.frontier.frontier_update_inlinks(
                                        redirected, base_url)
                                else:
                                    self.frontier.frontier_update_inlinks(
                                        processed_link, base_url)
                    file_io.write_all_out_links({base_url: out_links})
                self.count += 1
                print(self.count, current_wave, url, score)
                file_io.write_log(self.count, current_wave, url, score)
                file_io.write_final_info(len(self.crawled_links),
                                         len(self.all_links))
                if self.count == self.total_count:
                    self.finish()
                    print("Finished")
                    return

    def finish(self):
        for url in self.crawled_links:
            file_io.write_crawled_links(url)
            file_io.write_all_in_links(
                {url: list(self.frontier.objects[url].in_links)})
        file_io.write_all_links(self.all_links)

    def get_out_links(self, soup):
        a = soup.select('a')
        out_links = []
        for item in a:
            if item.get('href'):
                out_links.append(item['href'])
        return out_links

    def get_page(self, url: str):
        headers = {"Connection": "close"}
        res = requests.get(url=url, headers=headers, timeout=self.time_out)
        soup = BeautifulSoup(res.text, "lxml")
        try:
            if soup.select("html")[0].has_attr("lang"):
                lang = soup.select("html")[0]['lang']
            else:
                lang = "en"
        except Exception as e:
            error = "Read language error:\n{0}\nError: {1}\n\n".format(url, e)
            file_io.write_error_info(error)
            lang = "en"
        base_url = res.url
        return soup, res.text, base_url, lang

    def get_head(self, url: str):
        headers = {"Connection": "close"}
        head = requests.head(url=url,
                             headers=headers,
                             timeout=self.time_out,
                             allow_redirects=True)
        return head

    def extract_text(self, soup: BeautifulSoup):
        output = ""
        text = soup.find_all("p")
        for t in text:
            new_t = t.get_text()
            new_t = re.sub("\n", "", new_t)
            new_t = re.sub("  +", " ", new_t)
            if len(new_t) == 0:
                continue
            output += "{} ".format(new_t)
        return output

    def page_should_crawl(self, base_url, lang):
        result = True
        # check language
        if "en" not in lang.lower():
            error = "Language error: {0}\nLanguage = {1}\n\n".format(
                base_url, lang)
            file_io.write_error_info(error)
            result = False
        # check black list
        black_list = [
            ".jpg", ".svg", ".png", ".pdf", ".gif", "youtube", "edit",
            "footer", "sidebar", "cite", "special", "mailto", "books.google",
            "tel:", "javascript", "www.vatican.va", ".ogv", "amazon", ".webm"
        ]
        block = 0
        key = ""
        for key in black_list:
            if key in base_url.lower():
                block = 1
                break
        if block == 1:
            error = "Page type error: {0}\nkeyword = {1}\n\n".format(
                base_url, key)
            file_io.write_error_info(error)
            result = False
        return result
Example #22
            if rp.can_fetch("*", url):
                get_urls(driver, frontier, page_id)
        elif is_html:
            # no robots.txt => parse everything :)
            # Write site to database without
            get_urls(driver, frontier, page_id)

        if not frontier.has_urls():
            print(th_num + " sleep")
            time.sleep(10)

    driver.close()


if __name__ == "__main__":
    frontier = Frontier(seed)
    robots = []
    rp = RobotFileParser()
    sp = SitemapParser()
    db = Database(use_database)

    init_sites()
    print(robots)
    start = time.time()

    # Read thread num argument
    thread_num = 1
    print(sys.argv)
    if len(sys.argv) > 1:
        thread_num = int(sys.argv[1])
Example #23
import atexit
import logging

from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if exists
    frontier = Frontier()
    frontier.load_frontier()
    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier)
    crawler.start_crawling()
    frontier.data_dump()
    crawler.data_dump2()
Example #24
from frontier import Frontier
from parser import Parser
from graph import Graph
from pagerank import Ranker
from indexer import Indexer
from scorer import Scorer

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html'
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)

    # add document to indexer
    indexer.add_document(url, body)

    # build our webgraph
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)

    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)
Example #25
from frontier import Frontier
from node import Node

frontier = Frontier()
frontier.add_or_update('a', 40)
frontier.add_or_update('b', 20)
frontier.add_or_update('c', 30)

assert frontier.remove() == 'b'
assert frontier.remove() == 'c'

frontier.add_or_update('d', 10)
frontier.add_or_update('e', 50)

assert frontier.remove() == 'd'
assert frontier.remove() == 'a'
assert frontier.remove() == 'e'
assert frontier.remove() is None
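
The assertions above pin down the expected behaviour: remove() always returns the key with the smallest priority, add_or_update can re-prioritise an existing key, and an empty frontier yields None. A minimal sketch that satisfies them, assuming nothing about the real frontier module beyond these calls:

import heapq

class Frontier:
    """Minimal min-priority frontier with lazy deletion of stale entries."""

    def __init__(self):
        self._heap = []      # (priority, key) pairs, possibly stale
        self._priority = {}  # current priority per key

    def add_or_update(self, key, priority):
        self._priority[key] = priority
        heapq.heappush(self._heap, (priority, key))

    def remove(self):
        # Skip heap entries whose priority no longer matches (they were updated).
        while self._heap:
            priority, key = heapq.heappop(self._heap)
            if self._priority.get(key) == priority:
                del self._priority[key]
                return key
        return None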
Example #26
import atexit
import logging

import sys

from corpus import Corpus
from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if exists
    frontier = Frontier()
    frontier.load_frontier()

    # Instantiates corpus object with the given cmd arg
    corpus = Corpus(sys.argv[1])

    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier, corpus)
    crawler.start_crawling()
    crawler.analytics()
Example #27
 def __init__(self, num_of_workers=1, seed=False, seed_path=None):
     self.n_workers = num_of_workers
     self.frontier = Frontier(seed=seed, seed_path=seed_path)
Example #28
 def __init__(self):
     self.frontier = Frontier()
     self.count = 0
     self.last_domain = ''
     self.store = Store()
Example #29
class Crawler:
    '''
    Crawl the website and extract the text and the links on each page
    '''
    def __init__(self):
        self.count = 0
        self.last_domain = ''
        self.frontier = Frontier()
        self.store = Store()

    def initial_seeds(self):
        self.frontier.initial_queue()

    def parseRobot(self, domain):
        robot_url = 'http://' + domain + '/robots.txt'
        
        try:
            robot_file = urllib2.urlopen(robot_url).read()
            robot_content = ''
            for l in robot_file.split('\n'):
                if l.replace(' ','') != '':
                    robot_content += l + '\n'
            robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
            robot_parser.parse(robot_content)

            try:
                crawler_delay = robot_parser.get_crawl_delay('*')
            except Exception as e:
##                print 'crawler_delay exception: {}'.format(e)
                crawler_delay = None
            
            return robot_parser, crawler_delay
        except Exception as e:
##            print 'robot parse exception: {}'.format(e)
            return None, None

    def crawl(self):
        '''
        Pop a URL from the frontier and fetch its header, HTML, text and out-links.
        Push the out-links back into the frontier and insert the document into Elasticsearch.
        '''
        while self.count < MAX_COUNT:
            level, url = self.frontier.pop_url()

            try:
                current_domain = urlparse(url).netloc

##                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
##                    self.frontier.add_robot_dict(url)
##
##                if current_domain in self.frontier.robot_dict and not (self.frontier.robot_dict[current_domain].can_fetch('*', url)): 
##                    continue

                robot_parser, crawler_delay = self.parseRobot(current_domain)
                if robot_parser is not None:
                    if not robot_parser.is_allowed('*', url):
                        print('not allowed to crawl: {}'.format(url))
                        continue
                    if crawler_delay is not None:
                        time.sleep(crawler_delay)
                
            except Exception as e:
                print('current_domain_exception: {}'.format(e))
                print(url)
                continue

            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print('downloader exception: {}'.format(e))
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print('parse exception: {}'.format(e))
                continue

            if text or links:
                self.count += 1
                out_links = []
                
                for link in links:
                    try:
                        if len(self.frontier.pq) > MAX_COUNT:
                            break
                        if self.frontier.check_push_url(link, url):
                            out_links.append(link)
                    except Exception as e:
                        continue
                
                print('FINISHED: {}'.format(self.count))

                self.store.insert(self.count, url, header, title, text,
                                  raw_html, [], out_links, level)

                self.write_to_file(self.count, url, header, title, text,
                                  raw_html, out_links, level)
            else:
                continue

        self.frontier.write_in_links()
        self.store.write_urls()
Example #30
 def __init__(self):
     self.count = 0
     self.last_domain = ''
     self.frontier = Frontier()
     self.store = Store()
Example #31
def a_star(matrix, start, goal, estimate=manhattan_dist):
    """
    Find the path from start to the goal using the A* Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    :param estimate: Heuristics used in a_star search
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    f_score = {start: estimate(start, goal)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current == goal:
            print('Analytics: ' + str(len(visited)) +
                  ' expanded nodes, out of ' + str(count_nodes(matrix)) +
                  ' nodes')
            # draw_expanded_nodes(matrix, visited)
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                g_through_current = g_score[
                    current] + 1  # every neighbor has distance 1

                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = (g_score[neighbor] +
                                         estimate(neighbor, goal))
                    frontier.add(neighbor, f_score[neighbor])

    return None
Example #32
def a_star_multidots(edges,
                     start: tuple,
                     goals: tuple,
                     estimate=mst_estimator):
    """
    Find the path from start through all the goals using the A* Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    :param estimate: Heuristics used in a_star search
    :param edges: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goals: Goal points, as a set of all dots
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', dots node ' + str(goals))

    goals_to_indices = {g: i for i, g in enumerate(goals, 2)}

    start = init_state(start, goals)
    if start[0:2] in goals:
        start = mark_visited(start[0:2], goals_to_indices, start)

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the dots
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    # f_score = {start: naive_estimator(start, dots_visited[start], goals)}
    f_score = {start: estimate(start, goals, edges)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current[2:].count(1) == len(current) - 2:
            print('Analytics: ' + str(len(visited)) +
                  ' expanded nodes, out of ' +
                  str(len(edges) * (2**(len(current) - 2))) + ' nodes')
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand_multidots(current, edges):
            if neighbor[0:2] in goals:
                neighbor = mark_visited(neighbor[0:2], goals_to_indices,
                                        neighbor)

            if neighbor not in visited:
                # Subtract 1 here because the edge_maps contains both start and end for
                # the shortest path between dots
                g_through_current = g_score[current] + len(
                    edges[current[0:2]][neighbor[0:2]]) - 1

                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path

                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current

                    f_score[neighbor] = (g_score[neighbor] +
                                         estimate(neighbor, goals, edges))
                    frontier.add(neighbor, f_score[neighbor])

    return None