Пример #1
0
    def process(self,item,file_name=None):
        """ Download the resource referenced by the item value to a local file
            - item should contain a downloadable url as value
            - file_name is optional : by default the last '/' separated
              segment of the url is used
            - the file is written to config[FILE_FOLDER] + item subfolder
              (the folder is created when missing)
            Returns a one element list :
            - the same item, with the file handler stored as metadata, when
              config[FILE_ADD_AS_METADATA] is set
            - otherwise a new item, child of the received one, whose value is
              the file handler
            The file handler is already closed when it is returned.
        """
        downloadFileHandler = urllib2.urlopen(item.getValue())
        try:
            if file_name is None:
                file_name = item.getValue().split('/')[-1]
            directory = self.__config[FILE_FOLDER] + self.__getFileSubfolder(item)

            if not os.path.exists(directory):
                logger.info("Creating folder "+directory)
                os.makedirs(directory)

            # TODO use FILE_ADD_DATE to add date YYYYMMDDhhmm
            logger.info("Downloading filehandler to "+directory+file_name)

            # Binary mode : the downloaded bytes must be stored untouched
            # (text mode translates line endings on some platforms).
            # 'with' guarantees the file is closed even if the write fails.
            with open(directory + file_name,'wb') as filehandler:
                filehandler.write(downloadFileHandler.read())
        finally:
            # Always release the connection, also on errors
            downloadFileHandler.close()

        # Return result
        if self.__config[FILE_ADD_AS_METADATA]:
            item.setMetadataValue(FILE_ADD_AS_METADATA_FIELD,filehandler)
            return [item]
        else:
            newFileItem = BaseItem(None)
            newFileItem.setParent(item)
            newFileItem.setValue(filehandler)
            return [newFileItem]
Пример #2
0
    def process(self,item):
        """ Process a HTML item , retrieving whatever is in config
            - item should contain a valid web url, or the html text itself
              when FROM_TEXT is enabled in config
            - the parsed html is wrapped in a new item (child of the received
              one) and handed to a BSProcessor built with the same config
            - returns the BSProcessor result, or [] if the request or the
              processing fails
        """
        try:
            if FROM_TEXT  in self.config and self.config[FROM_TEXT]:
                logger.debug("Using text to obtain BS object")
                htmlBS = BeautifulSoup(item.getValue())
            else:
                logger.debug("Request to "+ str(item.getValue()))
                try:
                    headers = self.__getHeaders()
                    if headers is not None:
                        response = requests.get(item.getValue(),headers=headers)
                    else:
                        response = requests.get(item.getValue())
                    htmlBS = BeautifulSoup(response.text)
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit are not swallowed
                except Exception:
                    logger.error("Some errors requesting item value.Returning [] ")
                    traceback.print_exc()
                    return []
            bsHtmlItem = BaseItem({"parent":item})
            bsHtmlItem.setValue(htmlBS)

            bsProcessor = BSProcessor(self.config)

            return bsProcessor.process(bsHtmlItem)

        # Best effort : any processing error is logged and ignored, but
        # interpreter exit / interrupt requests still propagate
        except Exception:
            logger.error("Errors during html processing : ignoring")
            traceback.print_exc()
            return []
Пример #3
0
    def process(self, item):
        """ Process a HTML item , retrieving whatever is in config
            - item should contain a valid web url, or the html text itself
              when FROM_TEXT is enabled in config
            - the parsed html is wrapped in a new item (child of the received
              one) and handed to a BSProcessor built with the same config
            - returns the BSProcessor result, or [] if the request or the
              processing fails
        """
        try:
            if FROM_TEXT in self.config and self.config[FROM_TEXT]:
                logger.debug("Using text to obtain BS object")
                htmlBS = BeautifulSoup(item.getValue())
            else:
                logger.debug("Request to " + str(item.getValue()))
                try:
                    headers = self.__getHeaders()
                    if headers is not None:
                        response = requests.get(item.getValue(),
                                                headers=headers)
                    else:
                        response = requests.get(item.getValue())
                    htmlBS = BeautifulSoup(response.text)
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit are not swallowed
                except Exception:
                    logger.error(
                        "Some errors requesting item value.Returning [] ")
                    traceback.print_exc()
                    return []
            bsHtmlItem = BaseItem({"parent": item})
            bsHtmlItem.setValue(htmlBS)

            bsProcessor = BSProcessor(self.config)

            return bsProcessor.process(bsHtmlItem)

        # Best effort : any processing error is logged and ignored, but
        # interpreter exit / interrupt requests still propagate
        except Exception:
            logger.error("Errors during html processing : ignoring")
            traceback.print_exc()
            return []
Пример #4
0
    def all(self):
        """ Return every entity found in the collection as a list of BaseItem
            - item metadata is taken from the entity "metadata" field
            - item value is taken from the entity "value" field
        """
        items = []
        for entity in self.__collection.find():
            wrapped = BaseItem(entity["metadata"])
            wrapped.setValue(entity["value"])
            items.append(wrapped)
        return items
Пример #5
0
def parse_html_from_text_test():
    """ Html given as text with a single <p class='pclass'> must produce exactly one item """
    source_item = BaseItem(None)
    source_item.setValue("<html><head><title>title</title></head><body><p class='pclass'> my text </p></html>")

    # Look for <p> elements with class 'pclass' directly in the item text
    find_config = {"p":{"class":"pclass"}}
    html_processor = HtmlProcessor({FROM_TEXT:True,"find":find_config})

    found = html_processor.process(source_item)
    print(found)
    assert len(found) == 1
Пример #6
0
    def all(self):
        """ Return every entity found in the collection as a list of BaseItem
            - item metadata is taken from the entity "metadata" field
            - item value is taken from the entity "value" field
        """
        collected = []
        for stored in self.__collection.find():
            stored_item = BaseItem(stored["metadata"])
            stored_item.setValue(stored["value"])
            collected.append(stored_item)
        return collected
Пример #7
0
def parse_html_from_text_test():
    """ Html given as text with a single <p class='pclass'> must produce exactly one item """
    source_item = BaseItem(None)
    source_item.setValue(
        "<html><head><title>title</title></head><body><p class='pclass'> my text </p></html>"
    )

    # Look for <p> elements with class 'pclass' directly in the item text
    find_config = {"p": {"class": "pclass"}}
    html_processor = HtmlProcessor({FROM_TEXT: True, "find": find_config})

    found = html_processor.process(source_item)
    print(found)
    assert len(found) == 1
Пример #8
0
    def process(self, item):
        """ Process a single item delegating the real work on internalProcess
            - returns None when the received item is None
            - returns a list with the items found by internalProcess
            - when internalProcess returns None, returns a list with a single
              new item whose parent and value are the received item
        """
        if item is None:
            logger.info("None item : returning None")
            return None

        result = []
        foundItems = self.internalProcess(item.getValue(), item)
        if foundItems is not None:
            logger.info("Found " + str(len(foundItems)) + " item(s)")
            result.extend(foundItems)
        else:
            # warning() instead of the deprecated warn() alias
            logger.warning("No items found")
            resultItem = BaseItem({"parent": item})
            resultItem.setValue(item)
            result.append(resultItem)

        return result
Пример #9
0
    def process(self,item):
        """ Store the item in the configured file or retrieve items from it
            - FILE_OP_STORE : writes the item value plus a newline, adds the
              file name as metadata and returns [item]
            - FILE_OP_RETRIEVE : returns one new item per line of the file
              (stripped), every one of them child of the received item
            - any other operation raises ValueError
        """
        operation = self.__config[FILE_OP]

        if operation==FILE_OP_STORE:
            # Write item value and newline, adds metadata filename and returns item
            self.__filehandler.write(item.getValue()+"\n")
            item.setMetadataValue(FILE_METADATA_FILENAME,self.__config[FILE_NAME])
            return [item]

        if operation==FILE_OP_RETRIEVE:
            # Open the file (closing it when done), strip lines and generate new items
            with open(self.__config[FILE_NAME],"r") as source:
                lines = [line.strip() for line in source]

            result = []
            for line in lines:
                # Metadata must be a dict ({"parent",item} was a set literal) and
                # the received item must not be rebound, so that every line
                # item keeps it as parent
                lineItem = BaseItem({"parent":item})
                lineItem.setValue(line)
                result.append(lineItem)
            return result

        # String exceptions are not valid : raise a real exception
        raise ValueError("Unknown operation")
Пример #10
0
    def process(self,item):
        """ Process a single item delegating the real work on internalProcess
            - returns None when the received item is None
            - returns a list with the items found by internalProcess
            - when internalProcess returns None, returns a list with a single
              new item whose parent and value are the received item
        """
        if item is None:
            logger.info("None item : returning None")
            return None

        result = []
        foundItems = self.internalProcess(item.getValue(),item)
        if foundItems is not None:
            logger.info("Found "+ str(len(foundItems))+" item(s)")
            result.extend(foundItems)
        else:
            # warning() instead of the deprecated warn() alias
            logger.warning("No items found")
            resultItem = BaseItem({"parent":item})
            resultItem.setValue(item)
            result.append(resultItem)

        return result
Пример #11
0
    def all(self):
        """ Return one BaseItem per key known by self._r
            - item metadata is {"origin": "redis"}
            - item value is whatever self._r.get returns for the key
        """
        # Obtain all keys, then wrap the value behind each one
        return [
            BaseItem({"origin": "redis"}, self._r.get(key))
            for key in self._r.keys()
        ]
Пример #12
0
    def process(self, item, file_name=None):
        """ Download the resource referenced by the item value to a local file
            - item should contain a downloadable url as value
            - file_name is optional : by default the last '/' separated
              segment of the url is used
            - the file is written to config[FILE_FOLDER] + item subfolder
              (the folder is created when missing)
            Returns a one element list :
            - the same item, with the file handler stored as metadata, when
              config[FILE_ADD_AS_METADATA] is set
            - otherwise a new item, child of the received one, whose value is
              the file handler
            The file handler is already closed when it is returned.
        """
        downloadFileHandler = urllib2.urlopen(item.getValue())
        try:
            if file_name is None:
                file_name = item.getValue().split('/')[-1]
            directory = (self.__config[FILE_FOLDER] +
                         self.__getFileSubfolder(item))

            if not os.path.exists(directory):
                logger.info("Creating folder " + directory)
                os.makedirs(directory)

            # TODO use FILE_ADD_DATE to add date YYYYMMDDhhmm
            logger.info("Downloading filehandler to " + directory + file_name)

            # Binary mode : the downloaded bytes must be stored untouched
            # (text mode translates line endings on some platforms).
            # 'with' guarantees the file is closed even if the write fails.
            with open(directory + file_name, 'wb') as filehandler:
                filehandler.write(downloadFileHandler.read())
        finally:
            # Always release the connection, also on errors
            downloadFileHandler.close()

        # Return result
        if self.__config[FILE_ADD_AS_METADATA]:
            item.setMetadataValue(FILE_ADD_AS_METADATA_FIELD, filehandler)
            return [item]
        else:
            newFileItem = BaseItem(None)
            newFileItem.setParent(item)
            newFileItem.setValue(filehandler)
            return [newFileItem]
Пример #13
0
    def internalProcess(self, bsObject, item):
        """ Apply the configured operation to a BeautifulSoup object
            Only the first matching config entry is used, in this order :
            - "find" : dict of element name -> attributes filter (or None) ;
              returns one item per element found
            - "get"  : iterable of attribute names ; returns one item per
              name with the result of bsObject.get(name)
            - "text" : returns one item with bsObject.text
            - none of them : returns the received item untouched
            New items have the received item as parent.
            Raises ValueError if a "get" entry is None.
        """
        result = []

        if "find" in self.__config and self.__config["find"] is not None:
            logger.debug("Processing 'find' config " +
                         str(self.__config["find"]))
            findConfigDict = self.__config["find"]
            # Process all find config definitions
            for findKey in findConfigDict:
                findAttrs = findConfigDict[findKey]

                # find_all used in both branches (findAll is only its legacy alias)
                if findAttrs is not None:
                    foundElements = bsObject.find_all(findKey, findAttrs)
                else:
                    foundElements = bsObject.find_all(findKey)

                for foundElement in foundElements:
                    foundItem = BaseItem({"parent": item})
                    foundItem.setValue(foundElement)
                    result.append(foundItem)

        elif "get" in self.__config and self.__config["get"] is not None:

            logger.debug("Processing 'get' config '" +
                         str(self.__config["get"]) + "'")
            getConfigDict = self.__config["get"]
            for getKey in getConfigDict:
                # Validate before logging : concatenating a None key would
                # fail before reaching the intended error. String exceptions
                # are not valid, so a real exception is raised.
                if getKey is None:
                    raise ValueError("Error : should provide something to GET")

                logger.debug("Processing GET '" + getKey + "'")
                getItem = BaseItem({"parent": item})
                getItem.setValue(bsObject.get(getKey))
                result.append(getItem)

        elif "text" in self.__config:
            result.append(BaseItem({"parent": item}, bsObject.text))
        else:
            logger.warning(
                "Nothing to process internally. Returning same item")
            result = [item]

        return result
Пример #14
0
    def process(self, item):
        """ Store the item in the configured file or retrieve items from it
            - FILE_OP_STORE : writes the item value plus a newline, adds the
              file name as metadata and returns [item]
            - FILE_OP_RETRIEVE : returns one new item per line of the file
              (stripped), every one of them child of the received item
            - any other operation raises ValueError
        """
        operation = self.__config[FILE_OP]

        if operation == FILE_OP_STORE:
            # Write item value and newline, adds metadata filename and returns item
            self.__filehandler.write(item.getValue() + "\n")
            item.setMetadataValue(FILE_METADATA_FILENAME,
                                  self.__config[FILE_NAME])
            return [item]

        if operation == FILE_OP_RETRIEVE:
            # Open the file (closing it when done), strip lines and generate new items
            with open(self.__config[FILE_NAME], "r") as source:
                lines = [line.strip() for line in source]

            result = []
            for line in lines:
                # Metadata must be a dict ({"parent", item} was a set literal)
                # and the received item must not be rebound, so that every
                # line item keeps it as parent
                lineItem = BaseItem({"parent": item})
                lineItem.setValue(line)
                result.append(lineItem)
            return result

        # String exceptions are not valid : raise a real exception
        raise ValueError("Unknown operation")
Пример #15
0
    def getHeaders(self):
        """ Build a headers dict with a randomly chosen User-Agent
            - candidates are the items a FileProcessor retrieves from the
              file named by config[RANDOM_USER_AGENT_FILE], or from
              "./useragents.txt" when that entry is not configured
        """
        from random import choice

        #1. Decide which file holds the user agents
        if self.config is not None and RANDOM_USER_AGENT_FILE in self.config:
            agentsFile = self.config[RANDOM_USER_AGENT_FILE]
        else:
            agentsFile = "./useragents.txt"

        #2. Retrieve its content as items
        reader = FileProcessor({
            FILE_NAME: agentsFile,
            FILE_OP: FILE_OP_RETRIEVE
        })
        candidates = reader.process(BaseItem(None, agentsFile))

        #3. Obtain one from the resulting list
        chosen = choice(candidates)
        return {"User-Agent": chosen.getValue()}
Пример #16
0
def parse_html_error_test():
    """ Badly formed html text without any <p class='pclass'> must produce an empty result """
    broken_item = BaseItem(None, "<bad format></test>")

    # Look for <p> elements with class 'pclass' directly in the item text
    find_config = {"p": {"class": "pclass"}}
    html_processor = HtmlProcessor({FROM_TEXT: True, "find": find_config})

    assert html_processor.process(broken_item) == []
Пример #17
0
    def process(self, item):
        """ Change the status of the torrents of a transmission client
            - item value should be a TransmissionClientConfig ; a client is
              created with its host, port, user and password
            - every torrent returned by the client is passed to
              __changeTorrentStatus and returned as a new item with
              metadata {"torrent-transmission": True}
            - any other item value type is logged and [] is returned
        """
        result = []
        clientConfig = item.getValue()
        logger.debug(" Item value type : " + str(type(clientConfig)))

        # isinstance instead of comparing against the type of a throwaway
        # instance : no dummy object is built and subclasses are accepted
        if isinstance(clientConfig, TransmissionClientConfig):
            tclient = transmissionrpc.Client(clientConfig.host,
                                             clientConfig.port,
                                             clientConfig.user,
                                             clientConfig.password)

            for t in tclient.get_torrents():
                self.__changeTorrentStatus(tclient, t)
                result.append(BaseItem({"torrent-transmission": True}, t))

        else:
            logger.warning("Unknown item value type " +
                           str(type(clientConfig)))

        return result
Пример #18
0
    def internalProcess(self,bsObject,item):
        """ Apply the configured operation to a BeautifulSoup object
            Only the first matching config entry is used, in this order :
            - "find" : dict of element name -> attributes filter (or None) ;
              returns one item per element found
            - "get"  : iterable of attribute names ; returns one item per
              name with the result of bsObject.get(name)
            - "text" : returns one item with bsObject.text
            - none of them : returns the received item untouched
            New items have the received item as parent.
            Raises ValueError if a "get" entry is None.
        """
        result = []

        if "find" in self.__config and self.__config["find"] is not None:
            logger.debug("Processing 'find' config " + str(self.__config["find"]))
            findConfigDict = self.__config["find"]
            # Process all find config definitions
            for findKey in findConfigDict:
                findAttrs = findConfigDict[findKey]

                # find_all used in both branches (findAll is only its legacy alias)
                if findAttrs is not None:
                    foundElements = bsObject.find_all(findKey, findAttrs)
                else:
                    foundElements = bsObject.find_all(findKey)

                for foundElement in foundElements:
                    foundItem = BaseItem({"parent":item})
                    foundItem.setValue(foundElement)
                    result.append(foundItem)

        elif "get" in self.__config and self.__config["get"] is not None:

            logger.debug("Processing 'get' config '" + str(self.__config["get"]) + "'")
            getConfigDict = self.__config["get"]
            for getKey in getConfigDict:
                # Validate before logging : concatenating a None key would
                # fail before reaching the intended error. String exceptions
                # are not valid, so a real exception is raised.
                if getKey is None:
                    raise ValueError("Error : should provide something to GET")

                logger.debug("Processing GET '"+getKey+"'")
                getItem = BaseItem({"parent":item})
                getItem.setValue(bsObject.get(getKey))
                result.append(getItem)

        elif "text" in self.__config:
            result.append(BaseItem({"parent":item},bsObject.text))
        else:
            logger.warning("Nothing to process internally. Returning same item")
            result = [item]

        return result
Пример #19
0
# Example : http://bandaancha.eu/

import logging
from pype.html import HtmlProcessor, BSProcessor
from pype.model import BaseItem


logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Processor applied to the page : keeps every <h2 class="title">
htmlprocessor = HtmlProcessor({"find":{"h2":{"class":"title"}}})
# Processor applied to each h2 : keeps every <a itemprop="url">
h2processor = BSProcessor({"find":{"a":{"itemprop":"url"}}})

base = BaseItem(None)
base.setValue("http://bandaancha.eu/")
result = htmlprocessor.process(base)

for e in result:
    if e is None:
        print("NONE")
        continue

    # Show the element found together with the item it comes from
    print(str(e.getValue()) +" <<<<< " +  str(e.getMetadataValue("parent")))
    aResult = h2processor.process(e)
    for a in aResult:
        print("Found link ----- " + str(a.getValue()))
        print(str(a.getValue().get("href")))
Пример #20
0
def evaluate_false_AlreadyProcessedCondition():
    """ A condition built with an empty config must not evaluate to true for an arbitrary item """
    emptyConfigCondition = AlreadyProcessedCondition({})
    anyItem = BaseItem(None,"any value")

    assert not emptyConfigCondition.evaluate(anyItem)
Пример #21
0
from pype.model import BaseItem
from pype.storage import FILE_NAME, FILE_OP, FILE_OP_STORE, FILE_OP_RETRIEVE,\
    FileProcessor
from pype.extra_processor import LogItemsProcessor

# Build the items to store : values "item1" .. "item19"
items = []
for i in range(1, 20):
    items.append(BaseItem(None, "item" + str(i)))

filename = "./test_fileprocessor.txt"

# Store every item in the file
store_config = {FILE_NAME: filename, FILE_OP: FILE_OP_STORE}
writeprocessor = FileProcessor(store_config)

for i in range(1, 2):
    writeprocessor.processList(items)

# Read the items back from the same file
retrieve_config = {FILE_NAME: filename, FILE_OP: FILE_OP_RETRIEVE}
readprocessor = FileProcessor(retrieve_config)

result = readprocessor.processList([BaseItem(None, filename)])

#output results
print("Showin readed items -as strings- from file")
loggerprocessor = LogItemsProcessor(None)
loggerprocessor.processList(result)

print("File " + filename + " will be removed now")
#Delete file
import os
os.remove(filename)

# Checking #Issue_44
filename = "file.sh"
Пример #22
0
from pype.model import BaseItem

# REDIS Processor Sample
# Requires a valid redis server running

# Datasource config values
# Connection values of the datasource
redis_connection = {
    REDIS_DATASOURCE_CONFIG_HOST: "192.168.10.10",
    REDIS_DATASOURCE_CONFIG_PORT: "6379",
    REDIS_DATASOURCE_CONFIG_DB: "test"
}
redisds = RedisDataSource({REDIS_DATASOURCE_CONFIG: redis_connection})

store_processor = RedisStoreProcessor({REDIS_DATASOURCE: redisds})

# Store 10 items : values "item0" .. "item9"
items = []
for i in range(10):
    items += [BaseItem({}, "item" + str(i))]

result = store_processor.processList(items)

# Obtains values through a get processor sharing the same datasource
get_processor = RedisGetProcessor({REDIS_DATASOURCE: redisds})

getresult = get_processor.process(BaseItem({}, "ignore"))

for e in getresult:
    print(str(e))
Пример #23
0
# Example : http://bandaancha.eu/

import logging
from pype.html import HtmlProcessor, BSProcessor
from pype.model import BaseItem

logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Processor applied to the page : keeps every <h2 class="title">
htmlprocessor = HtmlProcessor({"find": {"h2": {"class": "title"}}})
# Processor applied to each h2 : keeps every <a itemprop="url">
h2processor = BSProcessor({"find": {"a": {"itemprop": "url"}}})

base = BaseItem(None)
base.setValue("http://bandaancha.eu/")
result = htmlprocessor.process(base)

for e in result:
    if e is None:
        print("NONE")
        continue

    # Show the element found together with the item it comes from
    print(str(e.getValue()) + " <<<<< " + str(e.getMetadataValue("parent")))
    aResult = h2processor.process(e)
    for a in aResult:
        print("Found link ----- " + str(a.getValue()))
        print(str(a.getValue().get("href")))
Пример #24
0
from pype.model import BaseItem
from pype.extra_processor import AddItemsProcessor, ADDITEMS_PREPEND,\
    ADDITEMS_POSTPEND

# AddItemsProcessor sample
# item1 is configured as the prepended item and item2 as the postpended one
item1 = BaseItem(None, "item1")
item2 = BaseItem(None, "item2")
processor = AddItemsProcessor({
    ADDITEMS_PREPEND: [item1],
    # item2 was created but never used : item1 was given here too, which
    # looks like a copy-paste slip
    ADDITEMS_POSTPEND: [item2]
})

# Process an empty list and show whatever the processor returns
result = processor.processList([])

for e in result:
    print(str(e))
from pype.torrent import TransmissionClientConfig, \
    TransmissionChangeStatusProcessor, \
    TRANSMISSION_TORRENT_OPERATION, \
    TRANSMISSION_TORRENT_CURRENT_STATUS, TRANSMISSION_TORRENT_OPERATION_START
from pype.model import BaseItem

# logging
import logging

logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Transmission torrent change status sample
# Starts all stopped torrents

# Positional arguments : host, port, user, password
transmission_config = TransmissionClientConfig(
    "host", "port", "user", "password")

# Torrents whose current status is 'stopped' receive the start operation
status_change_config = {
    TRANSMISSION_TORRENT_CURRENT_STATUS: 'stopped',
    TRANSMISSION_TORRENT_OPERATION: TRANSMISSION_TORRENT_OPERATION_START
}
processor = TransmissionChangeStatusProcessor(status_change_config)

# The client config travels as the item value
item = BaseItem(None, transmission_config)

result = processor.process(item)
Пример #26
0
    REDIS_DATASOURCE_CONFIG_HOST, REDIS_DATASOURCE_CONFIG_PORT,\
    REDIS_DATASOURCE_CONFIG_DB
from pype.model import BaseItem, HASH_ONCE

#logging
import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')


# DRY processor using REDIS

# Connection values of the datasource
redis_connection = {
    REDIS_DATASOURCE_CONFIG_HOST:"192.168.10.10",
    REDIS_DATASOURCE_CONFIG_PORT:"6379",
    REDIS_DATASOURCE_CONFIG_DB:"test"
}
redisds = RedisDataSource({REDIS_DATASOURCE_CONFIG:redis_connection})

# Ask for an item that has not been stored by this script
nonexisting = redisds.get(BaseItem(None,"test"))
print("Non existing result " + str(nonexisting))

# Store an item and read it back
storedItem = BaseItem({HASH_ONCE:True},"stored value into redis")
redisds.store(storedItem)

existing = redisds.get(storedItem)
print("Existing value :[" + str(existing) + "] stored")

# Delete it and ask for it again
redisds.delete(storedItem)
print("The value has been deleted : should not be found ->" + str(redisds.get(storedItem)))

# .all sample
for i in range(10):
    redisds.store(BaseItem(None,"value"+str(i)))
Пример #27
0
def evaluate_false_ContainsTextCondition_test():
    """ A condition configured with value 'falsetest' must not accept an item whose text does not include it """
    condition = ContainsTextCondition({"value":"falsetest"})
    unrelatedItem = BaseItem(None,"This is a test value that evaluates to true")

    assert not condition.evaluate(unrelatedItem)
Пример #28
0
from pype.model import BaseItem
from pype.html import HtmlProcessor

#logging
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Item whose value is the url to request
item = BaseItem(None,"https://github.com")

# Empty config : no find / get / text operation is configured
htmlprocessor = HtmlProcessor({})

result = htmlprocessor.process(item)

# HtmlProcessor.process returns [] when the request or the parsing fails,
# so the first element can only be shown when there is one
if result:
    print(str(result[0]))
else:
    print("No result obtained for " + str(item.getValue()))
Пример #29
0
# Url with all user agents (HTML format)
url = "http://www.useragentstring.com/pages/All/"

# Steps of pype chain processor
# 1. Get HTML from URL
# 2. Get div with id=liste in the resulting html
# 3. Get all <li> elements within that div and all <a> elements within them
# 4. Get all <a> values (text) from step 3
# 5. Store in a file
# This file is used in the RandomUserAgentHeadersProvider

#logging
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

item = BaseItem(None, url)

chain = ChainProcessor({
    PROCESSORS_LIST: [
        HtmlProcessor(None),
        BSProcessor({"find": {
            "div": {
                "id": "liste"
            }
        }}),
        BSProcessor({"find": {
            "li": None
        }}),
        BSProcessor({"find": {
            "a": None
        }}),