示例#1
0
def crawl(configuration_file_path):
    """Crawl the target websites described by a configuration file.

    Parses the configuration, validates the options this function needs,
    loads the seed urls, makes sure the output directory exists, starts the
    crawler threads and blocks until the url queue has been fully processed.

    :param configuration_file_path: The file path of configuration, named spider.conf.

    :return: 1 when the configuration cannot be used or the output
        directory cannot be created; otherwise None once the queue has
        been joined. Crawled content is not returned by this function.
    """
    # Create logger for debugging.
    output_log.init_log('./log/crawl_html')

    # A value of 1 coming back from parse_config is treated as a failure.
    parameter_dictionary = read_config.parse_config(configuration_file_path)
    if parameter_dictionary == 1:
        logging.error('\nThe error occurs in read_config.py module.')
        return 1

    # Read every required option up front so that a missing key is always
    # reported through the log and the error code, instead of only
    # 'url_list_file' being guarded while the others raise later on.
    try:
        url_list_file = parameter_dictionary['url_list_file']
        output_directory = parameter_dictionary['output_directory']
        thread_count = parameter_dictionary['thread_count']
    except KeyError as e:
        logging.error(
            '\nThe caught error is:\nKeyError in parameter_dictionary '
            'with %s.', str(e.args))
        return 1

    # The thread count is converted separately so that a malformed value is
    # reported as such rather than as a missing key.
    try:
        threads_num = int(thread_count)
    except (TypeError, ValueError) as e:
        logging.error(
            '\nThe caught error is:\nThe thread_count in '
            'parameter_dictionary is not an integer: %s.', e)
        return 1

    # Create queue and import seed html address into queue.
    unique_html_queue, unique_html_set = save_seed_html.save_seed_to_list(
        url_list_file)

    # Create output directory if not exist. os.makedirs also creates the
    # missing parent directories, which os.mkdir cannot do.
    if not os.path.exists(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            # The directory may have been created by someone else between
            # the check and the call; only a still-missing one is fatal.
            if not os.path.isdir(output_directory):
                logging.error('\nThe caught error is:\nThe directory '
                              '%s is not existed.', str(e.filename))
                return 1

    # Set and start multiple threads for crawling websites.
    start_time = time.time()
    logging.info('Start time is %s.', start_time)
    spider_threads = create_threads.ConsumerAndProducerThreads(
        unique_html_set, threads_num, unique_html_queue,
        crawl_and_parse_website.crawl_and_save_html,
        parameter_dictionary)
    spider_threads.start_threads()

    # Join the multiple threads until all the threads finish the task in queue,
    # meaning when the task count equals to zero.
    logging.info('\nThreads join.')
    unique_html_queue.join()

    # Record the running time.
    time_interval = time.time() - start_time
    logging.info('\nTime interval is %s seconds.', time_interval)
    def test_read_config(self):
        """Check that 'read_config.py' parses a 'thread_count' option.

        Initialises the logger with ``self.log_file``, parses the
        configuration file named by ``self.config_file_name`` and asserts
        that 'thread_count' is among the keys of the parsed result.
        """
        # Set up logging before the module under test is exercised.
        output_log.init_log(self.log_file)

        # Parse the configuration under test.
        parsed_options = read_config.parse_config(self.config_file_name)

        # The thread count option must be one of the parsed keys.
        has_thread_count = 'thread_count' in parsed_options.keys()
        self.assertTrue(has_thread_count)
    def test_output_log(self):
        """Check that 'output_log.py' leaves a log file on disk.

        Initialises the logger with ``self.log_file``, emits one INFO
        record and asserts that the file ``<self.log_file>.log`` exists.
        """
        # Set up logging and write a single record through it.
        output_log.init_log(self.log_file)
        logging.info('Here is the INFO information.')

        # The log file is expected at the log name with a '.log' suffix.
        expected_log_path = '%s.log' % self.log_file
        log_file_exists = os.path.exists(expected_log_path)
        self.assertTrue(log_file_exists)
示例#4
0
def main():
    """Check the format of every url in a file and write the results.

    Command-line options: ``-i`` is the input file, read line by line with
    each stripped line treated as one url, and ``-o`` is the output file,
    which is opened in append mode.

    :return: None. Every input line is written to the output file in the
            format 'url<TAB>result'.
    """
    # Input the file name.
    parser = argparse.ArgumentParser(
        description='This is script to check the normal url format.')
    parser.add_argument('-i', required=True, help='The input file path.')
    parser.add_argument('-o', required=True, help='The output file path.')

    # Create logger for debugging.
    ol.init_log('./log/url_check')

    # Read, parse and check the urls in file.
    args = parser.parse_args()
    file_path = args.i
    output_path = args.o

    # Total records number, counted by streaming the file so that it is
    # never held in memory as a whole.
    with open(file_path) as f:
        total_records = sum(1 for _ in f)

    # Record the starting time.
    start_time = time.time()
    logging.info('Start analyzing the url format and outputing results. '
                 'Start time is %s.', start_time)

    with open(file_path) as f, open(output_path, 'a') as fr:
        for count, line in enumerate(f, 1):
            # Report progress every 1000 records. The value is handed to
            # logging as an argument: '%' and '/' share the same precedence,
            # so "fmt % count / total" would try to divide the formatted
            # string and raise TypeError.
            if count % 1000 == 0 and total_records:
                logging.info('The percent of running records is %s',
                             100.0 * count / total_records)
            line = line.strip()
            result = check_url_format(line)

            # Output the content with format 'url+result'.
            fr.write(line + '\t' + result + '\n')

    # Record ending time.
    end_time = time.time()
    logging.info('End analyzing the url format and outputing results. '
                 'End time is %s. \nAnd the duration is %s',
                 end_time, end_time - start_time)
    def test_save_html_into_file(self):
        """Check that 'save_html_into_file.py' writes the expected file.

        Initialises the logger, parses the configuration, saves a small
        test document through ``save_html_to_file`` and asserts that a file
        with the expected name exists under the configured
        'output_directory'.
        """
        # Set up logging and load the crawler options.
        output_log.init_log(self.log_file)
        options = read_config.parse_config(self.config_file_name)

        # Save a test document under a fixed name.
        file_name = 'test_file.html'
        save_html_into_file.save_html_to_file(
            file_name, 'This is the test content.', '1', **options)

        # Build the path where the document is expected.
        # NOTE(review): the two replace calls presumably mirror the name
        # clean-up done inside save_html_to_file - confirm against it.
        saved_name = file_name.replace('/', '_').replace(':', '-')
        expected_path = options['output_directory'] + '/' + saved_name

        # The document must be present on disk.
        self.assertTrue(os.path.exists(expected_path))
    def test_crawl_and_parse_website(self):
        """Check that 'crawl_and_parse_website.py' returns links and content.

        Initialises the logger, parses the configuration, calls
        ``save_sublinks_to_queue`` on a fixed page address and asserts that
        the returned link list is not empty and that the returned content
        is neither the string 'e.code' nor the empty string.
        """
        # Set up logging and load the crawler options.
        output_log.init_log(self.log_file)
        options = read_config.parse_config(self.config_file_name)

        # Run the function under test against a fixed address.
        page_url = 'https://www.cnblogs.com/buptzym/p/6933868.html'
        crawl_result = crawl_and_parse_website.save_sublinks_to_queue(
            page_url, **options)
        sublink_list, html_content = crawl_result

        # At least one sub-link must have been found.
        link_total = len(sublink_list)
        self.assertTrue(link_total != 0)

        # The content must not be one of the rejected values.
        # NOTE(review): 'e.code' is compared as a literal string; confirm
        # that this is what the crawler hands back on an HTTP error.
        for rejected_content in ('e.code', ''):
            self.assertTrue(html_content != rejected_content)
示例#7
0
import scipy
import warnings
import logging

from scipy.stats import kstest
from scipy.stats import shapiro
import scipy.stats as spstats
from sklearn.preprocessing import *

import data_clean as dc
from config import output_log
from util import PowerTransformer

# Install a global filter that ignores all warnings.
warnings.filterwarnings('ignore')
# Create logger for debugging. This runs at import time.
# NOTE(review): the log path is named 'crawl_html' although this module
# imports statistics and preprocessing tools - confirm the intended path.
output_log.init_log('./log/crawl_html')


class CategoryCombiner(object):
    """Combine categories that have only a few instances.

    """
    def __init__(self, cate_columns, discard_ratio=0.01):
        """Initialize the combiner with the given parameters.

        :param cate_columns: List. The category columns to be processed.
        :param discard_ratio: The ratio used to filter out the categories
            that should be combined. Defaults to 0.01.
        """
        # Remember the category columns on the instance.
        # NOTE(review): discard_ratio is not stored in the lines shown
        # here - confirm that it is assigned further on.
        self.cate_columns = cate_columns
示例#8
0
import os
import os.path as osp
import warnings

from keras import backend as K
from keras.models import load_model
import tensorflow as tf
from tensorflow.python.framework import graph_io
from tensorflow.python.framework.graph_util import convert_variables_to_constants

from config import output_log

# Install a global filter that ignores all warnings.
warnings.filterwarnings("ignore")
# Create logger for debugging. This runs at import time.
# NOTE(review): the log path is named 'update_url' although this module
# imports Keras and TensorFlow graph tools - confirm the intended path.
output_log.init_log('./log/update_url')


def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """  Freezes the state of a session into a pruned computation graph.

    Creates a new computation graph where variable nodes are replaced by
    constants taking their current value in the session. The new graph will be
    pruned so sub-graphs that are not necessary to compute the requested
    outputs are removed.

    :param session: The TensorFlow session to be frozen.
    :param keep_var_names: A list of variable names that should not be frozen,
                         or None to freeze all the variables in the graph.
    :param output_names: Names of the relevant graph outputs.
    :param clear_devices: Remove the device directives from the graph for better portability.
示例#9
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Import necessary packages.
import argparse
import logging
import os

from config import output_log

# Create logger for debugging. This runs at import time, before any
# function of the module is called.
output_log.init_log('./log/converter')


def convert_to_num(ip_address):
    """Convert ip into number.

    Convert ip address into 32-bit integer number.

    :param ip_address: The ip address, eg. 127.125.5.1.

    :return: The 32-bit number converted from ip address.
    """
    # Iterate through ip.
    number = 0
    part_num = 0
    part_str = ''
    for i in xrange(len(ip_address)):
        num = len(ip_address) - i - 1
        if ip_address[num] == '.' or num == 0:
            if num == 0:
                part_str = ip_address[num] + part_str