def crawl(configuration_file_path):
    """The main process to crawl the target websites.

    Crawl the websites or element data according to the configuration.

    :param configuration_file_path: The file path of the configuration, named spider.conf.
    :return: The websites or pics saved in ./output/, in the form of .html or .jpg.
    """
    # Create logger for debugging.
    output_log.init_log('./log/crawl_html')
    # Catch the raised error from the parse_config function.
    parameter_dictionary = read_config.parse_config(configuration_file_path)
    if parameter_dictionary == 1:
        logging.error('\nThe error occurs in the read_config.py module.')
        return 1
    # Create the queue and import the seed html addresses into it.
    try:
        url_list_file = parameter_dictionary['url_list_file']
    except KeyError as e:
        logging.error(
            '\nThe caught error is:\nKeyError in parameter_dictionary '
            'with %s.' % str(e.args))
        return 1
    unique_html_queue, unique_html_set = save_seed_html.save_seed_to_list(
        url_list_file)
    # Create the output directory if it does not exist.
    try:
        if not os.path.exists(parameter_dictionary['output_directory']):
            os.mkdir(parameter_dictionary['output_directory'])
    except OSError as e:
        logging.error('\nThe caught error is:\nThe directory '
                      '%s does not exist.' % str(e.filename))
        return 1
    # Set and start multiple threads for crawling websites.
    threads_num = int(parameter_dictionary['thread_count'])
    start_time = time.time()
    logging.info('Start time is %s.' % start_time)
    spider_threads = create_threads.ConsumerAndProducerThreads(
        unique_html_set, threads_num, unique_html_queue,
        crawl_and_parse_website.crawl_and_save_html, parameter_dictionary)
    spider_threads.start_threads()
    # Block until all tasks in the queue are done, i.e. until the
    # unfinished task count drops to zero.
    logging.info('\nThreads join.')
    unique_html_queue.join()
    # Record the running time.
    time_interval = time.time() - start_time
    logging.info('\nTime interval is %s seconds.' % time_interval)
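# --- Example entry point (a minimal sketch, not part of the original module) ---
# It assumes the configuration file sits next to the script and is named
# spider.conf, as mentioned in the docstring of crawl(). At minimum it has to
# provide the url_list_file, output_directory and thread_count keys read
# above; the exact file format is whatever read_config.parse_config() expects
# (e.g. an INI-style file), which is an assumption here.
if __name__ == '__main__':
    result = crawl('spider.conf')
    if result == 1:
        logging.error('\nCrawling aborted because of a configuration error.')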
def test_read_config(self):
    """Test the module 'read_config.py'.

    Returns:
        Boolean.
    """
    # Import the logger.
    output_log.init_log(self.log_file)
    # Import parameter_dictionary.
    parameter_dictionary = read_config.parse_config(self.config_file_name)
    # Test results.
    self.assertTrue('thread_count' in parameter_dictionary.keys())
def test_output_log(self):
    """Test the module 'output_log.py'.

    Returns:
        Boolean.
    """
    # Import the logger.
    output_log.init_log(self.log_file)
    # Execute the function.
    log_address = '%s.log' % self.log_file
    logging.info('Here is the INFO information.')
    # Test results.
    self.assertTrue(os.path.exists(log_address))
def main():
    """The main function to check the normal url format.

    :return: The output file contains the urls with their results.
             The content format is 'url\tresult'.
    """
    # Parse the input and output file names.
    parser = argparse.ArgumentParser(
        description='This is the script to check the normal url format.')
    parser.add_argument('-i', required=True, help='The input file path.')
    parser.add_argument('-o', required=True, help='The output file path.')
    # Create logger for debugging.
    ol.init_log('./log/url_check')
    # Read, parse and check the urls in the file.
    args = parser.parse_args()
    file_path = args.i
    output_path = args.o
    # Count the total number of records.
    with open(file_path) as f:
        total_records = len(f.readlines())
    with open(file_path) as f:
        # Record the starting time.
        start_time = time.time()
        logging.info('Start analyzing the url format and outputting results. '
                     'Start time is %s.' % start_time)
        count = 0.0
        with open(output_path, 'a') as fr:
            for line in f.readlines():
                # Add to the count.
                count += 1
                if count % 1000 == 0:
                    logging.info('The fraction of processed records is %s'
                                 % (count / total_records))
                line = line.strip()
                result = check_url_format(line)
                # Output the content in the format 'url\tresult'.
                fr.writelines(line + '\t' + result + '\n')
        # Record the ending time.
        end_time = time.time()
        logging.info('End analyzing the url format and outputting results. '
                     'End time is %s.\nAnd the duration is %s seconds.'
                     % (end_time, end_time - start_time))
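# --- Example invocation (a minimal sketch) ---
# The guard below is an assumption; the original script may already define
# its own entry point. With it in place, the checker is run as, for example:
#
#   python url_check.py -i urls.txt -o url_results.txt
#
# where url_check.py, urls.txt and url_results.txt are hypothetical names.
# Every line of the output file then follows the documented 'url\tresult'
# format, with the result label being whatever check_url_format() returns.
if __name__ == '__main__':
    main()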
def test_save_html_into_file(self):
    """Test the module 'save_html_into_file.py'.

    Returns:
        Boolean.
    """
    # Import the logger.
    output_log.init_log(self.log_file)
    # Import parameter_dictionary.
    parameter_dictionary = read_config.parse_config(self.config_file_name)
    # Execute the function.
    save_html_into_file.save_html_to_file('test_file.html',
                                          'This is the test content.',
                                          '1', **parameter_dictionary)
    target_address = parameter_dictionary['output_directory'] + '/' + \
        'test_file.html'.replace('/', '_').replace(':', '-')
    # Test results.
    self.assertTrue(os.path.exists(target_address))
def test_crawl_and_parse_website(self):
    """Test the module 'crawl_and_parse_website.py'.

    Returns:
        Boolean.
    """
    # Import the logger.
    output_log.init_log(self.log_file)
    # Import parameter_dictionary.
    parameter_dictionary = read_config.parse_config(self.config_file_name)
    # Execute the function.
    html_address = 'https://www.cnblogs.com/buptzym/p/6933868.html'
    sublink_list, html_content = crawl_and_parse_website.save_sublinks_to_queue(
        html_address, **parameter_dictionary)
    # Test results.
    self.assertTrue(len(sublink_list) != 0)
    self.assertTrue(html_content != 'e.code')
    self.assertTrue(html_content != '')
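# --- Running the tests (a minimal sketch) ---
# The test methods above are assumed to live in a unittest.TestCase subclass
# whose setUp() provides self.log_file and self.config_file_name (the class
# definition is not shown here). Under that assumption the tests can be run
# with, for example:
#
#   python -m unittest test_mini_spider
#
# where test_mini_spider is a hypothetical module name, or through the usual
# `if __name__ == '__main__': unittest.main()` guard at the bottom of the file.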
import logging
import warnings

import scipy
import scipy.stats as spstats
from scipy.stats import kstest
from scipy.stats import shapiro
from sklearn.preprocessing import *

import data_clean as dc
from config import output_log
from util import PowerTransformer

warnings.filterwarnings('ignore')
# Create logger for debugging.
output_log.init_log('./log/crawl_html')


class CategoryCombiner(object):
    """The class to combine categories with few instances."""

    def __init__(self, cate_columns, discard_ratio=0.01):
        """Initialize the class with the given parameters.

        :param cate_columns: List. The list of category columns to be processed.
        :param discard_ratio: Float. The ratio used to decide which rare
                              categories should be combined.
        """
        # Assign values of parameters.
        self.cate_columns = cate_columns
import os
import os.path as osp
import warnings

from keras import backend as K
from keras.models import load_model
import tensorflow as tf
from tensorflow.python.framework import graph_io
from tensorflow.python.framework.graph_util import convert_variables_to_constants

from config import output_log

# Set the necessary environment.
warnings.filterwarnings("ignore")
# Create logger for debugging.
output_log.init_log('./log/update_url')


def freeze_session(session, keep_var_names=None, output_names=None,
                   clear_devices=True):
    """Freezes the state of a session into a pruned computation graph.

    Creates a new computation graph where variable nodes are replaced by
    constants taking their current value in the session. The new graph is
    pruned so that sub-graphs which are not necessary to compute the
    requested outputs are removed.

    :param session: The TensorFlow session to be frozen.
    :param keep_var_names: A list of variable names that should not be frozen,
                           or None to freeze all the variables in the graph.
    :param output_names: Names of the relevant graph outputs.
    :param clear_devices: Remove the device directives from the graph for
                          better portability.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import necessary packages.
import argparse
import logging
import os

from config import output_log

# Create logger for debugging.
output_log.init_log('./log/converter')


def convert_to_num(ip_address):
    """Convert an ip address into a number.

    Convert an ip address into a 32-bit integer number.

    :param ip_address: The ip address, e.g. 127.125.5.1.
    :return: The 32-bit number converted from the ip address.
    """
    # Walk the ip address from the last character to the first.
    number = 0
    part_num = 0
    part_str = ''
    for i in xrange(len(ip_address)):
        num = len(ip_address) - i - 1
        if ip_address[num] == '.' or num == 0:
            if num == 0:
                part_str = ip_address[num] + part_str