class WordCloudHelper():
    """Class that is responsible for data masks and fonts."""

    def __init__(self, work_dir=None):
        """
        (obj, str) -> None

        Initializing the class.
        """
        # assert work_dir is not None, \
        #     "work_dir should not be None"
        self.dir_helper = DirectoryHelper(work_dir)
        self.save_dir = self.dir_helper.work_dir
        self.dir_helper.prepare_working_directory()
        print '[i] working directory prepared'

    def load_fonts(self, selected_fonts=None):
        """
        (obj, list) -> dict

        Loading fonts as specified in the list or by iterating
        over the folder with fonts.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'fonts\\'
        fonts = {}
        if selected_fonts is not None:
            for font in selected_fonts:
                fonts[font] = BASE_FOLDER + font + '.ttf'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.ttf':
                    fonts[f[:-4]] = BASE_FOLDER + f
        return fonts

    def load_masks(self, selected_masks=None):
        """
        (obj, list) -> dict

        Loading masks as specified in the list or by iterating
        over the folder with masks.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'masks\\'
        masks = {}
        if selected_masks is not None:
            for mask in selected_masks:
                masks[mask] = BASE_FOLDER + mask + '.png'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.png':
                    masks[f[:-4]] = BASE_FOLDER + f
        return masks
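# WordCloudHelper above only needs a few things from DirectoryHelper: a work_dir
# attribute, prepare_working_directory() and upper_directory(). The real helper is
# not shown in this snippet; the class below is a hypothetical stand-in that merely
# illustrates the assumed interface.
import os


class DirectoryHelperSketch(object):
    """Hypothetical stand-in for the DirectoryHelper used above."""

    def __init__(self, work_dir=None):
        # assumption: fall back to a local '__temp__' folder, as the crawlers below do
        self.work_dir = work_dir if work_dir is not None else '__temp__'

    def prepare_working_directory(self):
        # create the working directory if it does not exist yet
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)

    def upper_directory(self):
        # assumption: parent of the working directory, with a trailing separator,
        # so that 'fonts\\' or 'masks\\' can be appended directly
        return os.path.abspath(os.path.join(self.work_dir, os.pardir)) + os.path.sep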
    def config_stopwords(self):
        """
        (obj) -> None

        Collecting custom stop words from the configured files and
        passing them to the parent class.
        """
        # processing custom stop words from the specified files
        custom_stopwords = list()
        dh = DirectoryHelper()
        for sw_file in stop_words_files:
            sw_fullpath = stop_words_folder + '\\' + sw_file
            sw_content = dh.read_file_utf8(sw_fullpath)
            for sw_single in sw_content.split('\n'):
                custom_stopwords.append(sw_single.strip())
        # adding the fetched stop words
        super(HabrServiceToWC, self).config_stopwords(custom_stopwords)
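# config_stopwords() above relies on module-level stop_words_folder and
# stop_words_files that are not shown in this snippet. A hypothetical
# configuration (folder and file names are placeholders) and the resulting call:
#
#   stop_words_folder = 'stopwords'
#   stop_words_files = ['russian.txt', 'english.txt', 'custom.txt']
#
#   service = HabrServiceToWC()
#   service.config_stopwords()   # reads each UTF-8 file, one stop word per line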
class BSCrawler():
    """Crawling HTML pages and fetching their data into table form."""

    UA = ('Mozilla/5.0 (X11; U; FreeBSD i386; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')

    def __init__(self):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.logger = hlp.custom_logger()
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        self.logger.info('[i] files will be saved into folder "{0}"'.format(self.work_dir))

    def crawl(self, mergedata, urlfile=None):
        """Extracting URLs from the given file and processing them.

        Args:
            mergedata: if True, all processed data is merged into one CSV file
            urlfile: file that contains the URLs to be processed
        """
        self.logger.info('[i] given URLs will be processed')
        # data buffer for all processed URLs
        data = []
        if urlfile:
            with codecs.open(urlfile, 'r', encoding='utf8') as f_urls:
                for line in f_urls:
                    stripped = line.strip()
                    if not stripped.startswith('#') and len(stripped) != 0 and stripped.startswith('http'):
                        self.logger.info('[i] following URL is going to be parsed:\n {0}'.format(stripped))
                        try:
                            doc = self.download_document(stripped)
                            data += self.process_uol_bibliography_tbl(doc)
                            sleep(SLEEP_TIME_IN_SECONDS)
                        except Exception as ex:
                            self.logger.error('[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args))
                    # else:
                    #     self.logger.info('[i] following URL was ignored:\n {0}'.format(stripped))
        # processing graduated PhDs of Computer Science
        url_graduated = 'http://www.uni-oldenburg.de/informatik/studium-lehre/promotion/promotionen/'
        sleep(SLEEP_TIME_IN_SECONDS)
        doc_graduated = self.download_document(url_graduated)
        self.process_uol_graduated_phds(doc=doc_graduated, output_file_name='cs-graduated-phds')
        # merging together all processed data
        if mergedata:
            csv = self.data_as_csv(data)
            self.helper.save_file(os.path.join(self.work_dir, 'uolbibliography-merged.csv'), csv)
        self.logger.info('[i] given URLs were processed')

    def download_document(self, url):
        """Downloading an HTML page and storing it in a string.

        Args:
            url: URL to be downloaded

        Returns:
            downloaded HTML
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            html = hdl.read()
        except Exception as ex:
            self.logger.error('[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args))
        return html

    def get_data_from_table(self, table_body):
        """Getting data from an HTML table with BeautifulSoup."""
        data = []
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() + " " for ele in cols]  # add extra space for empty values
            #data.append([elem for ele in cols if ele])  # get rid of empty values
            data.append([ele.strip() for ele in cols])  # take all values
        return data

    def validate_file_name(self, file_name):
        """Removing all symbols that are not file-name conform.

        Args:
            file_name: name of the file to be validated

        Returns:
            valid name of the file
        """
        valid_chars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        file_name = ''.join(c for c in file_name if c in valid_chars)
        return file_name

    # def save_images(self, img_list, folder):
    #     """
    #     (obj, list, str) -> None
    #
    #     Saving images to the given folder.
    #     """
    #     img_dir = self.work_dir + folder + '\\'
    #     self.helper.create_directory_ondemand(img_dir)
    #     for img in img_list:
    #         img_full_url = img['src']
    #         if img['src'][:4] != 'http':
    #             img_full_url = self.main_url + img['src']
    #         img_extention = img_full_url[img_full_url.rfind('.'):]
    #         image = self.download_document(img_full_url)
    #         if image is not None:
    #             gen_img_name = img_dir + self.helper.gen_file_name(extention='')
    #             self.helper.save_img_file(gen_img_name + img_extention, image)
    #         else:
    #             self.logger.info('[i] this image is not found: {0}'.format(img_full_url))

    def decode_abbreviations(self, row, size=8, lang='DE'):
        """Decoding particular abbreviations within the given data."""
        if size != 8:
            return None
        mapping = {}
        mapping['DE'] = {'AU': 'Aufsatz', 'MO': 'Monographie',
                         'ZS': 'Zeitschrift', 'SW': 'Sammelwerksbeitrag'}
        mapping['EN'] = {'AU': 'Article', 'MO': 'Monograph',
                         'ZS': 'Journal', 'SW': 'ContributionsToACollectiveWork'}
        if row[3] in mapping[lang]:
            row[3] = mapping[lang][row[3]]
        return row

    def data_as_csv(self, data, size=8):
        """Getting data as CSV."""
        resulting_csv = ''
        DELIMETER = '","'
        # adding the header
        header_values = ['Fach', 'Autor/in', 'Titel', 'Typ', 'Meldetag',
                         'Punktzahl', 'ZahlOldenburgerAutoren', 'Jahr']
        header_row = DELIMETER.join(header_values)
        resulting_csv = '"' + header_row + '"' + '\n'
        for row in data:
            if len(row) == size:
                csv_row = DELIMETER.join(value.replace('"', "") for value in self.decode_abbreviations(row))
                resulting_csv += '"' + csv_row + '"' + '\n'
        return resulting_csv

    def process_uol_graduated_phds(self, doc, output_file_name=None):
        """Processing the given HTML to extract graduated PhDs.

        Args:
            doc: document to be processed
        """
        bs_html = BeautifulSoup(doc, 'html5lib')
        # getting the name of the file from HTML
        if output_file_name is None:
            output_file_name = self.helper.gen_file_name(extention='')
        # validating
        output_file_name = self.validate_file_name(output_file_name)
        # getting data from the HTML table
        table = bs_html.find('table', attrs={'class': 'farbe_lichtblau breite100'})
        table_body = table.find('tbody')
        data = self.get_data_from_table(table_body)
        # data from list to CSV
        csv_data = ''
        for row in data:
            row_as_csv = '","'.join(row)
            row_as_csv = '"' + row_as_csv + '"' + '\n'
            csv_data += row_as_csv
        self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.csv'), csv_data)

    def process_uol_bibliography_tbl(self, doc, output_file_name=None):
        """Processing the given HTML to extract publication information
        from UOL's Hochschulbibliografie.

        Args:
            doc: document to be processed

        Returns:
            table as a collection of Python lists
        """
        def is_valid_row(data_chunk):
            """Validating any particular row given as input.

            Args:
                data_chunk: some random row

            Returns:
                True/False according to the validity of the data
            """
            if len(data_chunk) < 1:
                return False
            if 'Gesamtpunkte' in data_chunk[0]:
                return False
            return True

        bs_html = BeautifulSoup(doc, 'html5lib')
        # getting the name of the file from HTML
        if output_file_name is None:
            div_tag = bs_html.find('div', attrs={'id': 'inhalt', 'class': 'floatbox'})
            h1_tag = div_tag.find('h1')
            if h1_tag is not None:
                output_file_name = h1_tag.get_text(separator=u' ')
            else:
                output_file_name = self.helper.gen_file_name(extention='')
        # validating the file name
        output_file_name = self.validate_file_name(output_file_name)
        # getting data from the HTML table
        table = bs_html.find('table', attrs={'class': 'infotabelle'})
        table_body = table.find('tbody')
        data = self.get_data_from_table(table_body)
        # cleaning the table data
        cleaned_data = []
        for row in data:
            if is_valid_row(row):
                cleaned_data.append(row)
        csv_data = self.data_as_csv(cleaned_data)
        # text data for debugging, uncomment if needed
        # prettified_html = bs_html.prettify()
        # self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.html'), prettified_html)
        # text_from_html = bs_html.get_text()
        # self.helper.save_file(os.path.join(self.work_dir, output_file_name + '.txt'), text_from_html)
        # keeping the file name length within limits
        target_file = os.path.join(self.work_dir, output_file_name + '.csv')
        if len(target_file) > 250:
            target_file = target_file[:250] + '.csv'
        #self.logger.info(target_file)
        self.helper.save_file(target_file, csv_data)
        return cleaned_data
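# A minimal usage sketch for the crawler above. It assumes the module-level
# dependencies (hlp, DirectoryHelper, SLEEP_TIME_IN_SECONDS, codecs, urllib2,
# BeautifulSoup) are importable and that 'urls.txt' is a hypothetical file with
# one Hochschulbibliografie URL per line; lines starting with '#' are skipped.
if __name__ == '__main__':
    crawler = BSCrawler()
    # one CSV per processed page plus a merged uolbibliography-merged.csv
    crawler.crawl(mergedata=True, urlfile='urls.txt')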
class BSCrawler():

    UA = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'

    def __init__(self):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        print '[i] files will be saved into folder "{0}"'.format(self.work_dir)

    def crawl(self):
        """
        (obj) -> None

        Extracting URLs from the 'dispatcher' dictionary and processing them.
        """
        for url in self.dispatcher:
            print '[i] following url is going to be parsed:\n {0}'.format(url)
            self.main_url = url
            doc = self.download_document(url)
            self.dispatcher[url](self, doc)

    def crawl_dynamic(self):
        """
        (obj) -> None

        Extracting URLs from 'dispatcher_dynamic' and processing them dynamically,
        i.e. the URL template should contain a '%s' pattern to be formatted.
        """
        for url in self.dispatcher_dynamic:
            print '[i] following url is going to be parsed in a dynamic way:\n {0}'.format(url)
            self.dynamic_url = url
            tag = self.dispatcher_dynamic[url][1]
            self.dispatcher_dynamic[url][0](self, tag)

    def download_document(self, url):
        """
        (obj, str) -> (str)

        Downloading an HTML page and storing it in a string.
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            html = hdl.read()
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
        return html

    def save_images(self, img_list, folder):
        """
        (obj, list, str) -> None

        Saving images to the given folder.
        """
        img_dir = self.work_dir + folder + '\\'
        self.helper.create_directory_ondemand(img_dir)
        for img in img_list:
            img_full_url = img['src']
            if img['src'][:4] != 'http':
                img_full_url = self.main_url + img['src']
            img_extention = img_full_url[img_full_url.rfind('.'):]
            image = self.download_document(img_full_url)
            if image is not None:
                gen_img_name = img_dir + self.helper.gen_file_name(extention='')
                self.helper.save_img_file(gen_img_name + img_extention, image)
            else:
                print '[i] this image is not found: {0}'.format(img_full_url)

    def process_biblio_oldb(self, doc):
        """
        (obj, str) -> None

        Processing the given HTML to extract information about publication rates.
        """
        soup = BeautifulSoup(doc)
        gen_new_name = self.helper.gen_file_name(extention='')
        prettified_html = soup.prettify()
        text_from_html = soup.get_text()
        rows = soup.findAll('tr')
        section = ''
        dataset = {}
        # setting the locale
        try:
            locale.setlocale(locale.LC_ALL, 'de_DE')
        except:
            locale.setlocale(locale.LC_ALL, 'deu_deu')
        #years = ['2010:', '2011:', '2012:', '2013:', '2014:']
        years = ['2012:', '2013:', '2014:']
        for row in rows:
            if row.attrs:
                section = row.text
                dataset[section] = {}
            else:
                cells = row.findAll('td')
                for cell in cells:
                    if 'Gesamtpunkte' in cell.text:
                        text = cell.text.replace(u'\xa0', u' ')
                        splitted_ = text.split(u' ')
                        filtered_ = filter(lambda a: a != u'', splitted_)
                        filtered_ = filtered_[1:]
                        name = ''

                        def biblio_get_name(_input_list, years):
                            """
                            (list, list) -> str

                            Building a proper name for the biblio link.
                            """
                            name = ''
                            for value in _input_list:
                                if value in years:
                                    break
                                name = name + value + ' '
                            name_splitted = name.split(',')
                            name_result = name_splitted[1].strip() + ' ' + name_splitted[0].strip()
                            return name_result

                        name = biblio_get_name(filtered_, years)
                        dataset[name] = ''
                        temp_dict = {}
                        for index, value in enumerate(filtered_):
                            if filtered_[index] in years:
                                temp_dict[filtered_[index]] = filtered_[index + 1]
                        dataset[name] = temp_dict
        final_dataset = {}
        for value in dataset:
            years_dict = {}
            for year in years:
                try:
                    # years_dict[year[:len(year)-1]] = float(dataset[value][year])
                    years_dict[year[:len(year) - 1]] = locale.atof(dataset[value][year])
                except Exception as ex:
                    #print '[e] exception: {0}'.format(ex)
                    years_dict[year[:len(year) - 1]] = float(0.0)
            final_dataset[value] = years_dict
        output_by_year = ''
        keyfunc = ''
        for year in years:
            sorted_ = sorted(final_dataset.keys(), key=lambda x: final_dataset[x][year[:len(year) - 1]])
            output_by_year = output_by_year + '-----------------' + year[:len(year) - 1] + '-----------------------' + '\n'
            for value in sorted_:
                output_by_year = output_by_year + value + ' ' + str(final_dataset[value]) + '\n'
        DELIMETER = ','
        output_as_csv = 'NAME' + DELIMETER + DELIMETER.join(years) + '\n'
        for row_key in final_dataset:
            tmp_row = row_key + DELIMETER
            for year in years:
                tmp_row += str(final_dataset[row_key][year[:-1]]) + DELIMETER
            output_as_csv += tmp_row[:-1] + '\n'
        # text data for debugging, comment out if not needed
        self.helper.save_file(self.work_dir + gen_new_name + '.html', prettified_html)
        self.helper.save_file(self.work_dir + gen_new_name + '.txt', text_from_html)
        self.helper.save_file(self.work_dir + os.path.sep + gen_new_name + '.csv', output_as_csv)
        self.helper.save_file(self.work_dir + os.path.sep + gen_new_name + '.output', output_by_year)

    def process_tutiempo_weather(self, tag):
        """
        (obj, str) -> None

        Processing weather data of a city from tutiempo.
        KRASNOYARSK: http://en.tutiempo.net/climate/ws-284935.html
        NOVOSIBIRSK: http://en.tutiempo.net/climate/ws-296340.html
        """
        delimeter = ';'

        def internal_parser(html):
            """
            (str) -> dict

            Internal parser that parses the given HTML according to its structure.
            """
            soup = BeautifulSoup(html)
            data = []
            table = soup.find('table', attrs={'class': 'medias mensuales'})
            rows = table.findAll('tr')
            for row in rows:
                cols = row.findAll('td')
                cols = [ele.text.strip() for ele in cols]
                data.append([ele for ele in cols if ele])  # get rid of empty values
            daily_weather = {}
            for row in data:
                if len(row) > 3:
                    try:
                        day = int(row[0])
                        if day >= 1 and day <= 31:
                            daily_weather[day] = [row[2], row[3]]
                    except:
                        pass
                        #print '[e] exception: {}'.format(str(ex))
            # sorting values to have them in order
            daily_weather = collections.OrderedDict(sorted(daily_weather.items()))
            return daily_weather

        # params for testing
        years = ['1999']
        months = ['01']
        # params for real processing
        #years = ['1999', '2000', '2001', '2002']
        #months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
        try:
            if years is None or len(years) < 1:
                raise Exception('years should be defined')
            if months is None or len(months) < 1:
                raise Exception('months should be defined')
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
            return
        result_csv = list()
        for year in years:
            for month in months:
                month_year = '{0}-{1}'.format(month, year)
                url_to_download = self.dynamic_url % (month_year)
                # printing info about the url to download
                print '[i] url to download {}'.format(url_to_download)
                month_weather = internal_parser(self.download_document(url_to_download))
                for key in month_weather:
                    tmp_time = '{}-{}-{}'.format(str(year), str(month), str(key))
                    tmp_max = str(month_weather[key][0])
                    tmp_min = str(month_weather[key][1])
                    result_csv.append([tmp_time, tmp_max, tmp_min])
                # setting the main script to sleep for some seconds
                sleep(SLEEP_TIME_IN_SECONDS)
        # generating a new file name
        _new_file_name = str(uuid.uuid1())
        # converting to CSV
        csv_output = ''
        # setting the header of the CSV
        prefix = tag[0:3]
        csv_output = prefix + 'Time' + delimeter + prefix + 'MaxTemp' + delimeter + prefix + 'MinTemp' + '\n'
        for row in result_csv:
            tmp = ''
            for value in row:
                tmp += value + delimeter
            csv_output += tmp[:-1 * len(delimeter)] + '\n'
        # saving the CSV output to a file
        csv_full_path = self.work_dir + os.path.sep + tag + '-' + _new_file_name + '.csv'
        self.helper.save_file(csv_full_path, csv_output)
        print '[i] data in csv saved to the {}'.format(csv_full_path)

    # dispatcher for extracting and parsing one single web page;
    # it actually shows statistics of the last three years starting from the given one
    dispatcher = {
        'http://diglib.bis.uni-oldenburg.de/hsb/statistik/?page=hsb_institut&jahr=2014&inst=20100': process_biblio_oldb,
    }

    # dispatcher for extracting and parsing multiple web pages
    dispatcher_dynamic = {
        'http://en.tutiempo.net/climate/%s/ws-284935.html': [process_tutiempo_weather, "KRASNOYARSK"],
        'http://en.tutiempo.net/climate/%s/ws-296340.html': [process_tutiempo_weather, "NOVOSIBIRSK"],
    }
class BSCrawler():

    UA = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'

    def __init__(self, start_url, domain):
        """
        Initial method that:
            - initiates the helper class;
            - checks that the temp directory exists.
        """
        self.helper = DirectoryHelper()
        #self.helper.prepare_working_directory()
        try:
            self.work_dir = self.helper.work_dir
        except:
            self.work_dir = '__temp__'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        if start_url is None:
            print '[e] specify start URL'
        start_url = BSCrawler.adjust_url(start_url)
        self.urls = SetQueue()
        self.urls.put(start_url)
        self.domain = domain
        self.website_graph = {}
        print '[i] files will be saved into folder "{0}"'.format(self.work_dir)

    def download_document(self, url):
        """
        (obj, str) -> (str)

        Downloading an HTML page and storing it in a string.
        """
        html = None
        try:
            req = urllib2.Request(url=url, headers={'User-Agent': self.UA})
            hdl = urllib2.urlopen(req)
            content_type = hdl.info().dict['content-type']
            if ACCEPTABLE_CONTENT_TYPE in content_type:
                html = hdl.read()
            else:
                print '[i] ignored content-type was {0}'.format(content_type)
        except Exception as ex:
            print '[e] exception: {0}, arguments: {1}'.format(ex.message, ex.args)
        return html

    @staticmethod
    def adjust_url(url):
        """Removing the trailing slash if present."""
        if url[-1:] == '/':
            url = url[:-1]
        return url

    def crawl(self):
        """
        (obj) -> None

        Taking URLs from the queue and processing them until the queue is empty.
        """
        visited = set()
        while not self.urls.empty():
            # iterating over the queue
            url = self.urls.get()
            print '[i] parsing following url {0}'.format(url)
            visited.add(url)
            html = self.download_document(url)
            if html is None:
                continue
            soup = BeautifulSoup(html)
            # extracting additional links
            for line in soup.findAll('a'):
                url_potential = line.get('href')
                if url_potential != '#' and url_potential is not None:
                    url_potential = BSCrawler.adjust_url(url_potential)
                    self.update_website_graph(from_link=url, to_link=url_potential)
                    if url_potential not in visited and self.domain in url_potential:
                        self.urls.put(url_potential)  # adding to the queue

            def proper_filename(url):
                html_file_name = url.replace('https://', '')
                html_file_name = html_file_name.replace('http://', '')
                html_file_name = html_file_name.replace(':', '')
                #html_file_name = html_file_name.replace('/', os.path.sep) + str(uuid.uuid1())
                html_file_name = html_file_name.replace('/', '-') + str(uuid.uuid1())
                return self.work_dir + os.path.sep + html_file_name + '.html'

            # saving the html to a file
            full_file_name = proper_filename(url)
            self.helper.save_file(full_file_name, html)
            print '[i] html was saved to the {}'.format(full_file_name)

    def update_website_graph(self, from_link, to_link):
        """
        (obj, str, str) -> None

        Updating the graph of a web site.
        """
        if from_link not in self.website_graph:
            self.website_graph[from_link] = []
        self.website_graph[from_link] += [to_link]
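# The crawler above depends on a SetQueue class that is not defined in this snippet.
# The following is only a sketch of the assumed behaviour: a Queue.Queue whose backing
# store is a set, so a URL already waiting in the queue is not enqueued twice
# (Python 2, hence the Queue module). The project's actual SetQueue may differ.
import Queue


class SetQueue(Queue.Queue):
    """Hypothetical stand-in: a queue that ignores duplicates of queued items."""

    def _init(self, maxsize):
        self.queue = set()

    def _put(self, item):
        self.queue.add(item)

    def _get(self):
        return self.queue.pop()


# illustrative usage of the crawler (start URL and domain are placeholders):
#   crawler = BSCrawler(start_url='http://example.com/', domain='example.com')
#   crawler.crawl()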
class WordCloudHelper():
    """Class that is responsible for data masks and fonts."""

    def __init__(self, work_dir=None):
        """
        (obj, str) -> None

        Initializing the class.
        """
        # assert work_dir is not None, \
        #     "work_dir should not be None"
        self.dir_helper = DirectoryHelper(work_dir)
        self.save_dir = self.dir_helper.work_dir
        self.dir_helper.prepare_working_directory()
        print '[i] working directory prepared'

    def load_fonts(self, selected_fonts=None):
        """
        (obj, list) -> dict

        Loading fonts as specified in the list or by iterating
        over the folder with fonts.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'fonts/'
        fonts = {}
        if selected_fonts is not None:
            for font in selected_fonts:
                fonts[font] = BASE_FOLDER + font + '.ttf'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.ttf':
                    fonts[f[:-4]] = BASE_FOLDER + f
        return fonts

    def load_masks(self, selected_masks=None):
        """
        (obj, list) -> dict

        Loading masks as specified in the list or by iterating
        over the folder with masks.
        """
        BASE_FOLDER = self.dir_helper.upper_directory() + 'masks/'
        masks = {}
        if selected_masks is not None:
            for mask in selected_masks:
                masks[mask] = BASE_FOLDER + mask + '.png'
        else:
            for f in os.listdir(BASE_FOLDER):
                if f[-4:].lower() == '.png':
                    masks[f[:-4]] = BASE_FOLDER + f
        return masks
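# A brief usage sketch tying WordCloudHelper to the wordcloud package it is named
# after. The font name, mask name, and input text are placeholders, and the
# PIL/numpy/wordcloud imports are assumptions about the surrounding project.
import numpy as np
from PIL import Image
from wordcloud import WordCloud

if __name__ == '__main__':
    wc_helper = WordCloudHelper('__temp__')
    fonts = wc_helper.load_fonts(['DejaVuSans'])   # expects ../fonts/DejaVuSans.ttf
    masks = wc_helper.load_masks(['cloud'])        # expects ../masks/cloud.png
    wordcloud = WordCloud(font_path=fonts['DejaVuSans'],
                          mask=np.array(Image.open(masks['cloud'])),
                          background_color='white')
    wordcloud.generate('sample text for the word cloud sample text')
    wordcloud.to_file(wc_helper.save_dir + os.path.sep + 'wordcloud.png')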