Exemplo n.º 1
0
    def item_found(self, response):
        """Record a matching response and return it as a ``Result`` item.

        The response URL is logged at the ``CUSTOM_LOGGING.RES_FOUND`` level,
        appended as one line to the file named by ``self.sessionFilename``,
        and stored under the ``'url'`` key of the returned item.

        Args:
            response: Object exposing the matched location as ``response.url``.

        Returns:
            A ``Result`` whose ``'url'`` field holds the response URL.
        """
        found_resource = response.url
        LOGGER.log(CUSTOM_LOGGING.RES_FOUND, found_resource)

        item = Result()
        item['url'] = found_resource

        # Append the URL to the session file. The context manager closes the
        # handle even if one of the writes raises.
        with open(self.sessionFilename, 'a+') as target:
            target.write(found_resource)
            target.write("\n")

        return item
Exemplo n.º 2
0
    def __init__(self, *args, **kwargs):
        """Configure crawl targets and link-extraction rules from keyword args.

        Recognised keyword arguments (both optional):
            where: Sub-domain of ``svn.wordpress.org`` to crawl. When absent,
                both the ``themes`` and ``plugins`` repositories are used.
            what: Path fragment to look for. When absent a warning is logged
                and ``""`` is used. It is regex-escaped and matched
                case-insensitively.

        All positional and keyword arguments are forwarded unchanged to the
        parent initializer, which is called only after ``self.rules`` is set.
        """
        import os

        where = kwargs.get('where')
        if where is not None:
            self.allowed_domains = ['%s.svn.wordpress.org' % where]
            self.start_urls = ['http://%s.svn.wordpress.org/' % where]
        else:
            # Default repository domains: THEMES and PLUGINS
            self.allowed_domains = [
                'themes.svn.wordpress.org',
                'plugins.svn.wordpress.org',
            ]

            # Default repository urls: THEMES and PLUGINS
            self.start_urls = [
                'http://themes.svn.wordpress.org/',
                'http://plugins.svn.wordpress.org/',
            ]

        what = kwargs.get('what')
        if what is None:
            LOGGER.warning("WHAT parameter not set. Default value: ''")
            self.what = ""
        else:
            self.what = what

        # Work on a copy: removing an entry from EXTENSIONS.ALL itself would
        # change the shared list for every later user of that constant.
        # @todo: Define a policy for the exclusion...
        extensions_toexclude = list(EXTENSIONS.ALL)

        # The extension of the searched resource must not be filtered out
        # during the crawl. splitext() yields '' or '.ext'; dropping the dot
        # with replace() works for both byte and unicode strings.
        what_extension = os.path.splitext(self.what)[1].replace(".", "")
        try:
            extensions_toexclude.remove(what_extension)
        except ValueError:
            pass  # extension was not in the exclusion list; nothing to do

        # Case-insensitive regex matching the literal 'what' path
        what_rgx = '(?i)' + re.escape(self.what)

        self.rules = (
            # Links that do NOT match 'what': extracted with no callback.
            Rule(SgmlLinkExtractor(
                allow=(),
                deny=(what_rgx, ),
                deny_extensions=extensions_toexclude,
            )),
            # Links that DO match 'what': handed to item_found.
            Rule(SgmlLinkExtractor(
                allow=(what_rgx, ),
                deny_extensions=extensions_toexclude,
            ), callback='item_found'),
        )

        # Log before proceeding
        LOGGER.info('Starting with parameters:')
        LOGGER.info('WHERE: ' + ' - '.join(map(str, self.allowed_domains)))
        LOGGER.info('WHAT: ' + self.what)

        # super() has to be called after the rules
        super(WPSpider, self).__init__(*args, **kwargs)
Exemplo n.º 3
0
    def item_found(self, response):
        """Record a matching response and return it as a ``Result`` item.

        The response URL is logged at the ``CUSTOM_LOGGING.RES_FOUND`` level,
        appended as one line to the file named by ``self.sessionFilename``,
        and stored under the ``'url'`` key of the returned item.

        Args:
            response: Object exposing the matched location as ``response.url``.

        Returns:
            A ``Result`` whose ``'url'`` field holds the response URL.
        """
        found_resource = response.url
        LOGGER.log(CUSTOM_LOGGING.RES_FOUND, found_resource)

        item = Result()
        item['url'] = found_resource

        # Append the URL to the session file. The context manager closes the
        # handle even if one of the writes raises.
        with open(self.sessionFilename, 'a+') as target:
            target.write(found_resource)
            target.write("\n")

        return item
Exemplo n.º 4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2019/1/14 5:26 PM
# @Author  : w8ay
# @File    : data.py
from lib.collector import Collector
from lib.log import LOGGER

# Logger instance created when this module is imported.
logger = LOGGER()


class PATHS:
    """Namespace holding the path settings as class attributes.

    Every attribute starts out as an empty string; nothing in this class
    assigns real values (presumably they are filled in elsewhere at
    startup -- TODO confirm against the callers).
    """

    ROOT_PATH = PLUGIN_PATH = OUTPUT_PATH = DATA_PATH = ''


# Collector instance created when this module is imported.
collector = Collector()
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
#
#  Copyright 2012 Silvano Wegener & Daniel Henschel

from lib.log import LOGGER

import lib.settings as settings
import lib.basic as basic
import lib.log as log
import lib.database as database
import lib.connection as connection

import threading


# Announce startup, then load the runtime configuration.
LOGGER.write(
    log.LOGTAGS[0],
    'ShareLockHomes V' + settings.VERSION,
    'starting up...'
)
configuration = basic.initiateShareLockHomes()

# One socket server per handler type, configured from the 'server' and
# 'client' entries returned by configuration.get().
server = connection.SockServer(
    connection.ServerHandler, configuration.get()['server'])
cserver = connection.SockServer(
    connection.ClientHandler, configuration.get()['client'])

# Each server runs serve_forever() on its own daemon thread, so neither
# thread keeps the process alive once the main thread finishes.
threads = {
    'server': threading.Thread(target=server.serve_forever),
    'cserver': threading.Thread(target=cserver.serve_forever),
}
threads['server'].daemon = True
threads['cserver'].daemon = True
threads['server'].start()
threads['cserver'].start()

# NOTE(review): presumably blocks until Ctrl-C and then shuts down --
# confirm against lib.basic.
basic.waitForCtrlC()
basic.quit()
Exemplo n.º 6
0
    def __init__(self, *args, **kwargs):
        """Configure crawl targets and link-extraction rules from keyword args.

        Recognised keyword arguments (both optional):
            where: Sub-domain of ``svn.wordpress.org`` to crawl. When absent,
                both the ``themes`` and ``plugins`` repositories are used.
            what: Path fragment to look for. When absent a warning is logged
                and ``""`` is used. It is regex-escaped and matched
                case-insensitively.

        All positional and keyword arguments are forwarded unchanged to the
        parent initializer, which is called only after ``self.rules`` is set.
        """
        import os

        where = kwargs.get('where')
        if where is not None:
            self.allowed_domains = ['%s.svn.wordpress.org' % where]
            self.start_urls = ['http://%s.svn.wordpress.org/' % where]
        else:
            # Default repository domains: THEMES and PLUGINS
            self.allowed_domains = [
                'themes.svn.wordpress.org',
                'plugins.svn.wordpress.org',
            ]

            # Default repository urls: THEMES and PLUGINS
            self.start_urls = [
                'http://themes.svn.wordpress.org/',
                'http://plugins.svn.wordpress.org/',
            ]

        what = kwargs.get('what')
        if what is None:
            LOGGER.warning("WHAT parameter not set. Default value: ''")
            self.what = ""
        else:
            self.what = what

        # Work on a copy: removing an entry from EXTENSIONS.ALL itself would
        # change the shared list for every later user of that constant.
        # @todo: Define a policy for the exclusion...
        extensions_toexclude = list(EXTENSIONS.ALL)

        # The extension of the searched resource must not be filtered out
        # during the crawl. splitext() yields '' or '.ext'; dropping the dot
        # with replace() works for both byte and unicode strings.
        what_extension = os.path.splitext(self.what)[1].replace(".", "")
        try:
            extensions_toexclude.remove(what_extension)
        except ValueError:
            pass  # extension was not in the exclusion list; nothing to do

        # Case-insensitive regex matching the literal 'what' path
        what_rgx = '(?i)' + re.escape(self.what)

        self.rules = (
            # Links that do NOT match 'what': extracted with no callback.
            Rule(SgmlLinkExtractor(
                allow=(),
                deny=(what_rgx, ),
                deny_extensions=extensions_toexclude,
            )),
            # Links that DO match 'what': handed to item_found.
            Rule(SgmlLinkExtractor(
                allow=(what_rgx, ),
                deny_extensions=extensions_toexclude,
            ), callback='item_found'),
        )

        # Log before proceeding
        LOGGER.info('Starting with parameters:')
        LOGGER.info('WHERE: ' + ' - '.join(map(str, self.allowed_domains)))
        LOGGER.info('WHAT: ' + self.what)

        # super() has to be called after the rules
        super(WPSpider, self).__init__(*args, **kwargs)