Exemplo n.º 1
0
    def https_duplicate(self, old_url):
        """Resolve http/https duplicates for a url.

        Decide which scheme variant of the url to keep and which one
        (if any) must be deleted from the database:

        - old url is https: keep it; delete the http twin if stored.
        - old url is http: prefer the https twin when it is stored and
          mark the http one for deletion.

        :param old_url: url to check
        :type old_url: str
        :return: tuple (url to keep, url to delete or None)

        """
        tell('url to send: ' + old_url, severity=-1)
        new_url = database.convert_secure(old_url)
        twin_in_db = self.doc_exists(new_url)

        if not database.url_is_secure(old_url):
            # old_url uses http
            if twin_in_db:
                # The https twin is stored: keep it, drop http if present
                if self.doc_exists(old_url):
                    return new_url, old_url
                return new_url, None
            return old_url, None
        # old_url already uses https
        if twin_in_db:
            # convert_secure produced the http twin: delete it
            return old_url, new_url
        return old_url, None
Exemplo n.º 2
0
    def send_doc(self, webpage_infos):
        """Send document informations to database.

        Insert the document when its url is unknown; otherwise update
        it if the last crawl is older than ``CRAWL_DELAY``.

        :param webpage_infos: informations to send to database
        :type webpage_infos: dict
        :return: True if an error occured

        """
        error = False  # no error
        response = self.connection()
        result, response = self.send_command(
            "SELECT popularity, last_crawl, domain FROM {} WHERE url = %s".
            format(self.t[0]), (webpage_infos['url'],), True)  # params must be a 1-tuple
        if 'error' in response:
            tell('Popularity and last_crawl query failed: ' + response)
            error = True
        if result != ():
            # Url found in database, there is an answer:
            last_crawl = result[0][1]  # datetime.datetime object
            if (datetime.now() - last_crawl) > CRAWL_DELAY:
                # Crawled long enough ago: refresh the stored document
                error = self.update(webpage_infos, result)
            else:
                tell('Recently crawled: ' + webpage_infos['url'])
        else:
            # Url not found in database: insert a new document
            error = self.insert(webpage_infos)
        self.close_connection()
        return error
Exemplo n.º 3
0
	def read_links(self):
		"""Return the url of the next webpage to crawl.

		Pick the links file and line to read from the saved domains,
		then return the matching url, ``'error'`` when the links file
		is missing, or ``'#level_complete#'`` when the file is done.

		:return: url of webpage to crawl

		"""
		self.domains = swiftea_bot_links.get_domains()
		filename_ptr, reading_line_number = swiftea_bot_links.get_filename_read(
			self.domains, self.crawl_option)
		filename = data.DIR_LINKS + str(filename_ptr)

		if not path.exists(filename):
			tell('Reading file not found in get_url: ' + filename, 4)
			return 'error'
		with open(filename, 'r', errors='replace', encoding='utf8') as links_file:
			list_links = links_file.read().splitlines()  # one url per line

		# Last link of the file already read: mark the file completed
		if len(list_links) == reading_line_number - 1:
			self.domains[filename_ptr]['completed'] = 1
			return '#level_complete#'

		tell('File {0}, line {1}'.format(
			str(filename_ptr),
			str(reading_line_number)), severity=0)
		return list_links[reading_line_number - 1]
Exemplo n.º 4
0
	def tell_progress(self, upload=True):
		"""Log transfer progress as a percentage, or a fallback message."""
		action = 'Uploading' if upload else 'Downloading'
		if self.nb_files == 0:
			tell('No progress data')
		else:
			percent = round(self.downuploaded_files * 100 / self.nb_files, 2)
			tell('{} {}% ({}/{})'.format(
				action, percent, self.downuploaded_files, self.nb_files))
Exemplo n.º 5
0
	def download_lists_words(self):
		"""Download stopwords and badwords."""
		tell('download list of words')
		self.connection()
		words_files = (
			'en.stopwords.txt', 'fr.stopwords.txt',
			'en.badwords.txt', 'fr.badwords.txt')
		for filename in words_files:
			# 'en.stopwords.txt' -> remote subdirectory 'stopwords/'
			type_ = filename[3:-4] + '/'
			self.cd(self.FTP_DATA + type_)
			self.get(type_ + filename, filename)
		self.disconnect()
Exemplo n.º 6
0
	def get_url(self):
		"""Return the next url to crawl, advancing the level when a
		links file is exhausted."""
		url = self.read_links()
		if url != '#level_complete#':
			return url
		tell('Level complete, new level: ' + str(self.crawl_option['level']))
		self.crawl_option['level'] += 1
		swiftea_bot_links.save_domains(self.domains)
		if self.crawl_option['level'] >= self.crawl_option['target-level']:
			return '#target-reached#'
		return self.read_links()
Exemplo n.º 7
0
	def save_inverted_index_json(self, inverted_index):
		"""Save inverted-index in local.

		Dump the inverted-index into a json file so it can be restored
		later, when sending it is not possible.

		:param inverted_index: inverted-index
		:type inverted_index: dict

		"""
		tell('Save inverted-index in save file')
		with open(data.FILE_INDEX, 'w') as save_file:
			json.dump(inverted_index, save_file, ensure_ascii=False)
Exemplo n.º 8
0
	def get_inverted_index(self):
		"""Get inverted-index in local.

		Called after a connection error: load the inverted-index from
		its json save file, then delete the file.

		:return: inverted-index

		"""
		tell('Get inverted-index from save file')
		with open(data.FILE_INDEX, 'r') as save_file:
			inverted_index = json.load(save_file)
		remove(data.FILE_INDEX)
		return convert_keys(inverted_index)
Exemplo n.º 9
0
    def get_doc_id(self, url):
        """Get id of a document in database.

		:param url: url of webpage
		:type url: str
		:return: id of webpage or None if not found

		"""
        result, response = self.send_command(
            "SELECT id FROM {} WHERE url = %s".format(self.t[0]), (url))
        if 'error' in response[1]:
            tell('Failed to get id: ' + response)
            return None
        return str(result[0])
Exemplo n.º 10
0
	def save_inverted_index(self, inverted_index):
		"""Save inverted index in `.sif` files."""
		tell('Save inverted index in `.sif` files.')
		for language, letters in inverted_index.items():
			language_dir = self.DIR_INDEX + language
			if not path.isdir(language_dir):
				mkdir(language_dir)
			for first_letter, pairs in letters.items():
				letter_dir = language_dir + '/' + first_letter
				if not path.isdir(letter_dir):
					mkdir(letter_dir)
				for two_letters, index in pairs.items():
					filepath = letter_dir + '/' + two_letters + '.sif'
					with open(filepath, 'w', encoding='utf-8') as index_file:
						json.dump(index, index_file, ensure_ascii=False)
Exemplo n.º 11
0
    def indexing(self):
        """Index crawled webpages.

        Fetch the database id of each crawled document and feed its
        keywords to the index manager.

        """
        module.tell('Indexing', severity=2)
        for webpage_infos in self.infos:
            doc_id = self.database.get_doc_id(webpage_infos['url'])
            if doc_id is None:
                # Cannot index without an id: abort cleanly
                module.safe_quit()
            module.tell('Indexing {0} {1}'.format(doc_id, webpage_infos['url']))
            self.index_manager.add_doc(
                webpage_infos['keywords'], doc_id, webpage_infos['language'])
Exemplo n.º 12
0
def check_connection(url='https://github.com'):
	"""Test internet connection.

	Try to fetch a known website; any request failure means offline.

	:param url: url used to test the connection
	:return: True if connected to internet

	"""
	try:
		requests.get(url)
		return True
	except requests.exceptions.RequestException:
		tell('No connection')
		return False
Exemplo n.º 13
0
    def del_one_doc(self, url, table=None):
        """Delete document corresponding to url.

        :param url: url of webpage
        :type url: str
        :param table: table to delete from, defaults to the main one
        :return: status message

        """
        if table is None:
            table = self.t[0]
        tell('Delete from {} doc: {}'.format(table, url))
        response = self.send_command(
            "DELETE FROM {} WHERE url = %s".format(table), (url,))  # params as 1-tuple
        # NOTE(review): response[1] is treated as a container whose
        # second element is the status string — confirm send_command's
        # exact return structure.
        if 'error' in response[1] or response[1][1] != 'Send command: ok':
            tell('Doc not removed: {0}, {1}'.format(url, response[1]))
        return response[1]
Exemplo n.º 14
0
 def test_check_size_files(self):
     """Exercise FileManager.check_size_files before and after lowering the size limit."""
     # First call with the default limit: nothing should be archived yet.
     file_manager.FileManager.check_size_files(self)
     # Lower the limit so the next checks should trigger archiving —
     # TODO(review): confirm check_size_files actually reads max_size_file.
     self.max_size_file = 1
     module.tell('Simple message')
     module.tell('Simple message')
     file_manager.FileManager.check_size_files(self)
     module.tell('Simple message')
     module.tell('Simple message')
     file_manager.FileManager.check_size_files(self)
Exemplo n.º 15
0
    def doc_exists(self, url, table=None):  # TODO: refacto: une get_doc_id
        """Check if `url` is in database.

        :param url: url corresponding to doc
        :type url: str
        :param table: table to search, defaults to the main one
        :return: True if doc exists, None on error

        """
        table = self.t[0] if table is None else table
        result, response = self.send_command(
            "SELECT EXISTS(SELECT * FROM {} WHERE url=%s)".format(table),
            (url))
        if 'error' not in response:
            return result[0] == 1
        tell('Failed to check row: ' + response)
        return None
Exemplo n.º 16
0
    def send_to_db(self):
        """Send all informations about crawled webpages to database.

        May delete some documents to avoid http and https duplicates.

        """
        module.tell('Send to database', severity=2)
        for webpage_infos in self.infos:
            kept_url, url_to_del = self.database.https_duplicate(
                webpage_infos['url'])
            webpage_infos['url'] = kept_url
            if url_to_del:
                self.delete_bad_url(url_to_del)
            module.tell('New url (to add): ' + kept_url, severity=-1)
            if self.database.send_doc(webpage_infos):
                module.safe_quit()
Exemplo n.º 17
0
	def read_inverted_index(self):
		"""Get inverted-index in local.

		Rebuild the inverted-index from the `.sif` files previously
		written on disk.

		:return: inverted-index

		"""
		tell('Get inverted-index in local')
		inverted_index = dict()
		for language in listdir(self.DIR_INDEX):
			inverted_index[language] = dict()
			for first_letter in listdir(self.DIR_INDEX + language):
				letter_dir = self.DIR_INDEX + language + '/' + first_letter
				inverted_index[language][first_letter] = dict()
				for filename in listdir(letter_dir):
					with open(letter_dir + '/' + filename, 'r', encoding='utf-8') as index_file:
						# strip the '.sif' extension to get the key
						inverted_index[language][first_letter][filename[:-4]] = json.load(index_file)
		return convert_keys(inverted_index)
Exemplo n.º 18
0
    def suggestions(self):
        """Get the five first URLs from Suggestion table and delete them.

        :return: list of suggested urls, or None on error

        """
        result, response = self.send_command(
            "SELECT url FROM suggestion LIMIT 5", fetchall=True)
        # `response` is the unpacked status message; the old check
        # `'error' in response[1]` only looked at one character and
        # could never match (siblings doc_exists/send_doc check the
        # whole string).
        if 'error' in response or result is None:
            tell('Failed to get url: ' + response)
            return None

        suggested_links = list()
        for element in result:
            if len(suggested_links) < 5:
                suggested_links.append(element[0])
                self.del_one_doc(element[0], self.t[1])
        return suggested_links
Exemplo n.º 19
0
	def check_size_files(self):
		"""Archive oversized log files into sibling zip archives.

		A log file longer than ``data.MAX_SIZE`` lines is moved into a
		``.zip`` archive under an incrementing numeric member name, so
		logging restarts from an empty file.
		"""
		for filelog in [data.FILE_EVENTS, data.FILE_ERRORS]:
			# Replace the 3-char extension, e.g. 'events.log' -> 'events.zip'
			filearchive = filelog[:-3] + 'zip'
			if not path.exists(filelog):
				continue
			with open(filelog, 'r') as myfile:
				content = myfile.readlines()
			if len(content) > data.MAX_SIZE:
				if not path.exists(filearchive):
					# No archive yet: create an empty one; first member is '0'
					ZipFile(file=filearchive, mode='w').close()
					filename = '0'
				else:
					# Members are numeric strings in insertion order
					with ZipFile(filearchive, 'r') as myzip:
						filename = str(int(myzip.namelist()[-1])+1)  # The last one +1
				# Move the log out of the way, store it in the archive,
				# then drop the temporary on-disk copy.
				rename(filelog, filename)
				with ZipFile(filearchive, 'a') as myzip:
					myzip.write(filename)
				remove(filename)
				tell('Archiving ' + filelog + ': ' + filename, severity=-1)
Exemplo n.º 20
0
    def insert(self, infos):
        """Insert a new document in database.

		:param infos: doc infos
		:type infos: dict()
		:return: True is an arror occured

		"""
        tell('Adding ' + infos['url'])
        response = self.send_command(
            """INSERT INTO {} (title, description, url, first_crawl, last_crawl, language,
popularity, score, homepage, sanesearch, favicon, domain)
VALUES (%s, %s, %s, NOW(), NOW(), %s, 1, %s, %s, %s, %s, %s)""".format(
                self.t[0]),
            (infos['title'], infos['description'], infos['url'],
             infos['language'], infos['score'], infos['homepage'],
             infos['sanesearch'], infos['favicon'], self.domain))
        if 'error' in response[1][1]:
            tell('Failed to add: ' + str(response))
            return True
        return False
Exemplo n.º 21
0
    def suggestions(self):
        """Handle suggested urls.

        Get 5 urls from database, delete them, crawl them,
        send all informations about them, index them.

        """
        suggestions = self.database.suggestions()
        if suggestions is None:
            module.tell('Failed to get suggestions')
            return
        suggestions = data_processing.clean_links(suggestions)
        if not suggestions:
            module.tell('No suggestions')
            return
        module.tell('Suggestions', severity=2)
        for url in suggestions:
            result = self.crawl_webpage(url)
            if result:
                # result[0]: webpage_infos ; result[1]: links
                self.infos.append(result[0])
                self.file_manager.save_links(result[1])
        self.send_to_db()
        self.indexing()
        self.infos.clear()  # reset the collected webpage informations
Exemplo n.º 22
0
    def crawl_webpage(self, url):
        """Crawl the given url.

        Fetch the webpage source code, feed it to the parser, extract
        its informations and handle redirections; bad urls found along
        the way are deleted from the database.

        :param url: url of webpage
        :type url: str
        :return: (webpage infos, links) or None

        """
        module.tell('Crawling ' + url)
        # Fetch the page source:
        new_url, html_code, nofollow, score, all_urls = self.web_connection.get_code(url)
        if html_code is None:
            # Failed to get code: purge these urls from the database
            self.delete_bad_url(all_urls)
            return None
        if html_code == 'no connection':
            module.safe_quit()
        if html_code == 'ignore':
            # Something went wrong, maybe a redirection
            self.delete_bad_url(all_urls)
            return None
        module.tell('New url: ' + new_url, severity=0)
        webpage_infos, list_links = self.site_informations.get_infos(
            new_url, html_code, nofollow, score)
        self.delete_bad_url(all_urls, webpage_infos['language'])  # except new url
        webpage_infos['url'] = new_url

        if webpage_infos['title'] == '':
            # No usable title: drop the document
            self.delete_bad_url(new_url, webpage_infos['language'])
            return None
        # Check for duplicate (by url only):
        if not module.can_add_doc(self.infos, webpage_infos):
            return None
        self.crawled_websites += 1
        return webpage_infos, list_links
Exemplo n.º 23
0
    def sane_search(self, keywords, language, max_ratio=.2):
        """Filter pages not suitable for a young audience.

        :param keywords: webpage's keywords
        :type keywords: list
        :param language: found website language
        :type language: str
        :param max_ratio: badword ratio above which the page is flagged
        :type max_ratio: float
        :return: True if the page is flagged as not sane

        """
        badwords = self.BADWORDS[language]
        nb_words = len(keywords)
        if nb_words == 0:
            # No keywords: nothing to judge
            return False
        nb_badwords = sum(1 for keyword in keywords if keyword in badwords)
        if nb_badwords / nb_words >= max_ratio:
            tell('bad site detected')
            return True
        return False
Exemplo n.º 24
0
    def update(self, infos, result):
        """Update a document in database.

		:param infos: doc infos
		:type infos: dict()
		:param popularity: new doc popularity
		:type popularity: int
		:return: True if an arror occured

		"""
        tell('Updating ' + infos['url'])

        cmd = """
UPDATE {} SET title=%s, description=%s, last_crawl=NOW(), language=%s,
popularity=%s, score=%s, homepage=%s, sanesearch=%s, favicon=%s
WHERE url = %s""".format(self.t[0])
        response = self.send_command(
            cmd, (infos['title'], infos['description'], infos['language'],
                  result[0][0] + 1, infos['score'], infos['homepage'],
                  infos['sanesearch'], infos['favicon'], infos['url']))
        if 'error' in response[1]:
            tell('Failed to [update: ' + response[1], -2)
            return True
        return False
Exemplo n.º 25
0
    def delete_bad_url(self, urls, language='*'):
        """Delete bad doc if exists.

        For each given url present in the database, remove it from both
        the database and the inverted-index.

        :param urls: url(s) to delete
        :type urls: str or list
        :param language: language of the doc in the inverted-index

        """
        url_list = [urls] if isinstance(urls, str) else urls
        for url in url_list:
            exists = self.database.doc_exists(url)
            if exists is None:
                # Database error
                module.safe_quit()
            elif not exists:
                module.tell('Ignore: ' + url, severity=-1)
            else:
                doc_id = self.database.get_doc_id(url)
                if doc_id:
                    self.database.del_one_doc(url)
                    self.index_manager.delete_doc_id(doc_id, language)
                else:
                    module.safe_quit()
Exemplo n.º 26
0
	def get_inverted_index(self):
		"""Get inverted-index.

		Download every `.sif` file from the FTP server and rebuild the
		inverted-index, mirroring the remote directory tree locally.

		:return: inverted-index and True if an error occured

		"""
		tell('Get inverted-index from server')
		self.downuploaded_files = 0
		inverted_index = dict()
		self.connection()
		self.cd(self.FTP_INDEX)
		self.nb_files = self.countfiles()  # count files on server (prepare to download)

		for language in self.listdir():
			self.cd(language)
			local_language_dir = self.DIR_INDEX + language
			if not path.isdir(local_language_dir):
				mkdir(local_language_dir)
			inverted_index[language] = dict()
			for first_letter in self.listdir():
				self.tell_progress(False)
				self.cd(first_letter)
				local_letter_dir = local_language_dir + '/' + first_letter
				if not path.isdir(local_letter_dir):
					mkdir(local_letter_dir)
				inverted_index[language][first_letter] = dict()
				for filename in self.listdir():
					# strip the '.sif' extension to get the key
					inverted_index[language][first_letter][filename[:-4]] = self.download(
						language, first_letter, filename)
				self.cd('..')
			self.cd('..')

		self.disconnect()
		if inverted_index:
			tell('Transfer complete', severity=0)
		else:
			tell('No inverted-index on server', severity=0)
		return inverted_index
Exemplo n.º 27
0
    def start(self):
        """Start main loop of crawling.

        Crawl webpages in batches of `l2`, send documents to database,
        index them and save the configurations (line number in links
        file, ...). Every `l1` batches, send the inverted-index and
        check for suggestions.

        Runs until the user asks to stop or an error occurs (both exit
        through module.safe_quit(); the `run` flag itself is never
        cleared here).

        """
        module.tell('Starting with base urls')
        self.get_inverted_index()
        if not path.exists(data.FILE_LINKS):
            # No links file yet: seed the domains list with a sentinel entry
            links.save_domains([{
                'domain': '',
                'level': -1,
                'completed': 0,
                'line': 1,
                'file': 0
            }])
        run = True
        while run:
            stats_crawl = time()
            self.suggestions()
            for _ in range(self.l1):
                module.tell('Crawl', severity=2)
                begining = time()
                while len(self.infos) < self.l2:
                    begining = time()
                    # Start of crawling loop
                    url = self.file_manager.get_url()
                    if url == 'error':
                        module.safe_quit()

                    result = self.crawl_webpage(url)
                    # result[0]: webpage_infos, result[1]: links

                    if result:
                        self.infos.append(result[0])
                        # save links and get next url:
                        self.file_manager.save_links(result[1])

                    # Record how long this single webpage took
                    with open(data.DIR_STATS + 'stat_crawl_one_webpage',
                              'a') as myfile:
                        myfile.write(str(time() - begining) + '\n')

                    # End of crawling loop

                module.tell('{} new documents!'.format(self.crawled_websites),
                            severity=-1)

                self.send_to_db()
                self.indexing()

                module.stats_webpages(begining, time())

                self.infos.clear(
                )  # Reset the list of dict of informations of websites.
                self.file_manager.check_stop_crawling()
                self.file_manager.save_config()
                if self.file_manager.run == 'false':
                    module.tell('User wants stop program')
                    module.safe_quit()

            # End of loop range(n)
            self.suggestions()
            self.send_inverted_index()
            self.file_manager.check_size_files()
            module.stats_crawl(stats_crawl, time())
Exemplo n.º 28
0
	def upload(self, language, first_letter, two_letters, index):
		"""Upload one `.sif` index file to the FTP server."""
		remote_name = two_letters + '.sif'
		local_path = self.DIR_INDEX + language + '/' + first_letter + '/' + remote_name
		tell('uploading {} in {}'.format(local_path, remote_name))
		self.put(local_path, remote_name)
		self.downuploaded_files += 1
Exemplo n.º 29
0
	def send_inverted_index(self, inverted_index):
		"""Send inverted-index.

		Mirror the inverted-index dict onto the FTP server, creating
		missing remote directories and uploading one `.sif` file per
		two-letters entry.

		:param inverted_index: inverted-index to send
		:type inverted_index: dict
		:return: True if an error occured

		"""
		tell('send inverted-index')
		self.downuploaded_files = 0
		self.nb_files = count_files_index(inverted_index)  # files to upload
		self.connection()
		if self.FTP_INDEX not in self.listdir():
			self.mkdir(self.FTP_INDEX)
		self.cd(self.FTP_INDEX)
		tell('go to ' + self.FTP_INDEX)

		for language in inverted_index:
			if language not in self.listdir():
				self.mkdir(language)
			self.cd(language)
			tell('go to ' + language)
			for first_letter in inverted_index[language]:
				self.tell_progress()
				if first_letter not in self.listdir():
					self.mkdir(first_letter)
				self.cd(first_letter)
				tell('go to ' + first_letter)
				for two_letters, index in inverted_index[language][first_letter].items():
					self.upload(language, first_letter, two_letters, index)
				self.cd('..')
				tell('go back')
			self.cd('..')
			tell('go back')

		self.disconnect()
		tell('Transfer complete', severity=0)
		return False
Exemplo n.º 30
0
    def get_infos(self, url, code, nofollow, score):
        """Manage all searches of webpage's informations.

        :param url: url of webpage
        :type url: str
        :param code: source code of webpage
        :type code: str
        :param nofollow: if we take links of webpage
        :type nofollow: bool
        :param score: score of webpage
        :type score: int
        :return: results dict (title, description, keywords, language,
            sanesearch, favicon, homepage, score) and list of links

        """
        results = dict()
        results['homepage'] = 1 if searches.is_homepage(url) else 0

        self.parser.feed(code)

        results['title'] = searches.clean_text(
            searches.capitalize(self.parser.title))  # Find title and clean it

        keywords = searches.clean_text(self.parser.keywords.lower()).split()

        # Language: trust the page's declared language, else detect it
        if self.parser.language != '':
            language = self.parser.language
            score += 1  # declaring the language is a quality signal
        else:
            language = self.detect_language(keywords)

        if language in self.STOPWORDS and self.parser.title != '':
            keywords = self.clean_keywords(keywords, language)
            keywords.extend(
                self.clean_keywords(results['title'].lower().split(),
                                    language))
            infos_url = urlparse(url)
            # Strip the file extension from the url path; keep the whole
            # path when there is no dot (rfind == -1 previously dropped
            # the last character by mistake). Local renamed from `path`
            # to avoid shadowing the module-level `path` import.
            dot_position = infos_url.path.rfind('.')
            url_path = (infos_url.path[:dot_position]
                        if dot_position != -1 else infos_url.path)
            # NOTE(review): url_path is a str while the other
            # clean_keywords calls pass lists — confirm this is intended.
            keywords.extend(self.clean_keywords(url_path, language))

            results['sanesearch'] = self.sane_search(keywords, language)
            results['language'] = language
            results['keywords'] = keywords

            # Description: fall back to the first title when missing
            if self.parser.description == '':
                results['description'] = searches.clean_text(
                    searches.capitalize(self.parser.first_title))
            else:
                results['description'] = searches.clean_text(
                    searches.capitalize(self.parser.description))

            # Having a stylesheet is a quality signal:
            if self.parser.css:
                score += 1

            base_url = searches.get_base_url(url)

            # Links:
            if nofollow:
                links = list()
            else:
                links = data_processing.clean_links(self.parser.links,
                                                    base_url)
                searches.stats_links(len(links))
            if self.parser.favicon != '':
                results['favicon'] = self.clean_favicon(
                    self.parser.favicon, base_url)
            else:
                results['favicon'] = ''
        else:
            tell('No language or title', severity=0)
            results = {'title': ''}
            links = list()
            results['language'] = '*'

        results['score'] = score
        return results, links