    def test_OpenCloseSingle(self):
        """Write 1 item, close, reopen checking if same item is there"""

        q = Queue(self.path)
        q.put('var1')
        del q
        q = Queue(self.path)
        self.assertEqual(1, q.qsize())
        self.assertEqual('var1', q.get())
        q.task_done()

    def test_PartialWrite(self):
        """Test recovery from previous crash w/ partial write"""

        q = Queue(self.path)
        for i in range(100):
            q.put('var%d' % i)
        del q
        with open(os.path.join(self.path, 'q00000'), 'ab') as f:
            pickle.dump('文字化け', f)
        q = Queue(self.path)
        self.assertEqual(100, q.qsize())
        for i in range(100):
            self.assertEqual('var%d' % i, q.get())
            q.task_done()
        with self.assertRaises(Empty):
            q.get_nowait()

    def test_ClearOldFile(self):
        """put until reaching chunksize, then get without calling task_done"""
        q = Queue(self.path, chunksize=10)
        for i in range(15):
            q.put('var1')

        for i in range(11):
            q.get()

        q = Queue(self.path, chunksize=10)
        self.assertEqual(q.qsize(), 15)

        for i in range(11):
            q.get()
            q.task_done()
        self.assertEqual(q.qsize(), 4)

    def test_RandomReadWrite(self):
        """Test random read/write"""

        q = Queue(self.path)
        n = 0
        for i in range(1000):
            if random.random() < 0.5:
                if n > 0:
                    q.get_nowait()
                    q.task_done()
                    n -= 1
                else:
                    with self.assertRaises(Empty):
                        q.get_nowait()
            else:
                q.put('var%d' % random.getrandbits(16))
                n += 1

    def test_GarbageOnHead(self):
        """Adds garbage to the queue head and let the internal integrity
        checks fix it"""

        q = Queue(self.path)
        q.put('var1')
        del q

        with open(os.path.join(self.path, 'q00001'), 'a') as fd:
            fd.write('garbage')

        q = Queue(self.path)
        q.put('var2')

        self.assertEqual(2, q.qsize())
        self.assertEqual('var1', q.get())
        q.task_done()

    def test_OpenCloseOneHundred(self):
        """Write 1000 items, close, reopen checking if all items are there"""

        q = Queue(self.path)
        for i in range(1000):
            q.put('var%d' % i)
        del q
        q = Queue(self.path)
        self.assertEqual(1000, q.qsize())
        for i in range(1000):
            data = q.get()
            self.assertEqual('var%d' % i, data)
            q.task_done()
        with self.assertRaises(Empty):
            q.get_nowait()
        # assert adding another one still works
        q.put('foobar')
        data = q.get()
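
A condensed sketch of the put/get/task_done cycle these tests exercise. It assumes the file-backed Queue under test comes from the pqueue package and that Empty is the standard library queue exception; both imports are assumptions, since the test module's imports are not shown here.

import tempfile

from queue import Empty    # Queue.Empty on Python 2; assumed import
from pqueue import Queue   # assumed source of the file-backed Queue

path = tempfile.mkdtemp()  # throwaway directory backing the queue

q = Queue(path)
q.put('item1')
del q                      # drop the handle; the item survives on disk

q = Queue(path)            # reopening the same directory restores the queue
assert q.qsize() == 1
assert q.get() == 'item1'
q.task_done()              # acknowledge the get; without it the item is
                           # replayed on the next reopen (see test_ClearOldFile)

try:
    q.get_nowait()
except Empty:
    print('queue drained')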
Example #7
class Crawler(object):
    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler
        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc
        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(initial_capacity=10000000,
                                                 error_rate=0.00001)
        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)
            # The directory was just created, so the per-site ignore file
            # cannot exist yet; create it empty so later reads and appends work.
            f = open(
                '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                'w+')
            f.close()
        else:
            if (not (os.path.exists('../ignore_filter/' + self.site.name +
                                    '_ignore_file.txt'))):
                f = open(
                    '../ignore_filter/' + self.site.name + '_ignore_file.txt',
                    'w+')
                f.close()

            with open('../ignore_filter/' + self.site.name +
                      '_ignore_file.txt',
                      'r+',
                      buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if not self.site.is_shallow:
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]
        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])

        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")

        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article
        returns the next article in the sequence
        '''

        #standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt',
                  'a') as ignore_filter_file:
            try:
                current_level = 0
                while (True):
                    if (self.limit > 0 and self.visited_count > self.limit):
                        raise StopIteration('Limit reached: {:d}'.format(
                            self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue
                    try:
                        if (self.site.is_shallow):
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(
                                current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000, error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name +
                                  '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1
                    #use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()
                    if (self.site.is_shallow):
                        if (int(current_level) > self.level):
                            continue
                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(
                                u"skipping url \"{0}\" because it matches filter"
                                .format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)

                            if (parsed_url.scheme != u"http"
                                    and parsed_url.scheme != u"https"):
                                logging.info(
                                    u"skipping url with invalid scheme: {0}".
                                    format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(
                                urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(
                                u"skipping malformed url {0}. Error: {1}".
                                format(url, str(e)))
                            continue
                        if (not parsed_url.netloc.endswith(self.domain)):
                            continue
                        # If the url has already been added to the ignore list, skip it
                        if (url in self.ignore_filter):
                            continue
                        # Ignores the subscribe links for many domains
                        if (u"subscribe" in url
                                and u"-subscribe" not in url
                                and u"subscribe-" not in url):
                            continue

                        # Append the url to to_visit queue
                        if (self.site.is_shallow):
                            self.to_visit.put(
                                (url, str(int(current_level) + 1)))
                            logging.info(
                                u"added {0} to the to_visit as well as the level {1}"
                                .format(url, str(int(current_level) + 1)))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(
                                u"added {0} to the to_visit".format(url))

                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError:
                raise
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE))
                    or (not filt.regex and filt.pattern in url)):
                return True
        return False
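
The url_in_filter helper above supports the two matching modes its docstring describes: regex search and plain substring comparison. A minimal illustration follows, using a hypothetical stand-in for the filter objects (the real ones are Django model instances from referringsitefilter_set; only the regex and pattern attributes matter here), with the function body copied to module level so the snippet runs on its own.

import re
from collections import namedtuple

# Hypothetical stand-in exposing just the attributes url_in_filter reads.
Filter = namedtuple('Filter', ['regex', 'pattern'])

filters = [
    Filter(regex=True, pattern=r'/tag/'),        # regex, matched case-insensitively
    Filter(regex=False, pattern='/sponsored/'),  # plain substring comparison
]

def url_in_filter(url, filters):
    for filt in filters:
        if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE))
                or (not filt.regex and filt.pattern in url)):
            return True
    return False

print(url_in_filter('http://example.com/TAG/news', filters))        # True, via the regex
print(url_in_filter('http://example.com/sponsored/offer', filters)) # True, via the substring
print(url_in_filter('http://example.com/article/1', filters))       # False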
Example #9
class Rotator(object):
    def __init__(self, config):
        self.config = config
        self.dateformat = config['dateformat']
        self.keep_files = int(config['rotate'])
        self.now = datetime.datetime.now()
        self.dateext = self.now.strftime(self.dateformat)
        self.mode = config['mode']
        self.compress = config['compress']
        self.user = config['user']
        self.group = config['group']
        self.sharedscripts = config['sharedscripts']
        self.destext = config['destext']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.hdfs_config = config['hdfs']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)
        self.client = None
        if self.hdfs_config:
            self.client = hdfs.InsecureClient(**self.hdfs_config)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_rotated_time(self, dest_path):
        dateext = dest_path.rsplit('-', 1)[-1]
        # remove gz ext
        dateext = dateext.split('.')[0]
        return datetime.datetime.strptime('-{}'.format(dateext),
                                          self.dateformat)

    def is_rotated_file(self, dest_path):
        try:
            t = self.get_rotated_time(dest_path)
            return bool(t)
        except ValueError:
            # suffix is not a parsable timestamp: not a rotated file
            return False

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        dest_path = os.path.join(rotated_dir,
                                 '{}{}'.format(filename, self.dateext))
        return dest_path

    def remove_old_files(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        path = os.path.join(rotated_dir, filename)
        glob_path = '{}-*'.format(path)
        files = [f for f in glob.glob(glob_path) if self.is_rotated_file(f)]
        files.sort(key=self.get_rotated_time, reverse=True)
        for f in files[self.keep_files:]:
            os.remove(f)

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0o755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)

        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)

        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0o755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]

        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.client, path, from_, to)

    def secure_copy(self):
        to_be_clean = set()
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue

                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)

                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)

                self.queue.task_done()

                if self.compress:
                    os.remove(rotated_path_before)

                to_be_clean.add(path)
            except Empty:
                break
            except Exception as e:
                print(e)

        for path in to_be_clean:
            self.remove_old_files(path)

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()

        for f in iterate_log_paths(self.config['paths']):
            if not self.sharedscripts:
                self.prerotate()

            self.rename_file(f)

            if not self.sharedscripts:
                self.postrotate()

        if self.sharedscripts:
            self.postrotate()

        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
Example #10
class Rotator(object):
    def __init__(self, config):
        self.paths = config['paths']

        self.mode = int(config['mode'], 8)
        self.user = config['user']
        self.group = config['group']

        # FIXME: Handle rotated files keeping correctly
        # self.keep_files = int(config['rotate'])
        self.compress = config['compress']

        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.hdfs_config = config['hdfs']
        self.hdfs_client = None
        if self.hdfs_config:
            self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)

        self.dateformat = config['dateformat']
        self.now = datetime.datetime.now()
        self.timestamp = self.now.strftime(self.dateformat)
        self.destext = config['destext']

        self.fnformat = config['fnformat']
        if not self.fnformat:
            raise ValueError("'fnformat' cannot be empty")

        self.sharedscripts = config['sharedscripts']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']

        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        logname = os.path.basename(path)
        dest_path = os.path.join(
            rotated_dir,
            self.fnformat.format(logname=logname,
                                 timestamp=self.timestamp,
                                 hostname=socket.gethostname()))
        return dest_path

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0o755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)

        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)

        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)

        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0o755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy2(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]

        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.hdfs_client, path, from_, to)

    def secure_copy(self):
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue

                if self.compress:
                    rotated_path = self.compress_file(rotated_path)

                if self.copy:
                    self.copy_file(rotated_path)

                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)

                if self.compress:
                    os.remove(rotated_path_before)

                self.queue.task_done()

            except Empty:
                break
            except Exception as e:
                print(e)
                raise

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()

        for f in iterate_log_paths(self.paths):
            if is_empty_file(f):
                continue

            if not self.sharedscripts:
                self.prerotate()

            self.rename_file(f)

            if not self.sharedscripts:
                self.postrotate()

        if self.sharedscripts:
            self.postrotate()

        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
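
For orientation, a hedged sketch of how a Rotator like the one above might be driven. The config keys mirror exactly what __init__ reads; every value, path, and command below is illustrative only, and the sketch assumes the class (plus the module-level helpers it calls, such as iterate_log_paths, is_empty_file, makedirs, chown, gzip and run) is importable from the surrounding module.

# Assumed configuration covering every key the constructor reads.
config = {
    'paths': ['/var/log/myapp/*.log'],      # hypothetical log locations
    'mode': '0644',                         # parsed with int(config['mode'], 8)
    'user': 'myapp',
    'group': 'myapp',
    'compress': True,                       # gzip each rotated file
    'copy': [],                             # no local copy targets
    'copytohdfs': [],                       # no HDFS targets
    'hdfs': None,                           # no hdfs.InsecureClient is created
    'dateformat': '-%Y%m%d-%H%M%S',
    'destext': 'rotated-%Y%m%d',
    'fnformat': '{logname}-{hostname}{timestamp}',  # must not be empty
    'sharedscripts': True,                  # run pre/post hooks once per rotation
    'prerotate': [],                        # shell commands run before rotation
    'postrotate': ['service myapp reload'], # shell commands run afterwards
    'queuepath': '/var/lib/myapp/rotate-queue',
}

rotator = Rotator(config)
rotator.rotate()  # rename and queue each log, then compress/copy via secure_copy()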