def process(self, item, file_name=None):
    """Download the URL held in ``item`` and save it below the configured folder.

    The target directory is the ``FILE_FOLDER`` configuration value followed
    by the sub-folder computed for ``item``; it is created when missing.

    :param item: item whose value is the URL to download.
    :param file_name: name of the local file; defaults to the last
        ``/``-separated segment of the URL.
    :return: ``[item]`` with the (already closed) file object stored as
        metadata when ``FILE_ADD_AS_METADATA`` is set in the configuration,
        otherwise a one-element list with a new item whose parent is
        ``item`` and whose value is that file object.
    """
    url = item.getValue()
    downloadFileHandler = urllib2.urlopen(url)
    try:
        if file_name is None:
            file_name = url.split('/')[-1]
        directory = self.__config[FILE_FOLDER] + self.__getFileSubfolder(item)
        if not os.path.exists(directory):
            logger.info("Creating folder " + directory)
            os.makedirs(directory)
        # TODO use FILE_ADD_DATE to add date YYYYMMDDhhmm
        logger.info("Downloading filehandler to " + directory + file_name)
        # Binary mode: the payload is raw bytes and must not go through
        # newline translation. ``with`` closes the file even if the write fails.
        with open(directory + file_name, 'wb') as filehandler:
            filehandler.write(downloadFileHandler.read())
    finally:
        # Always release the network connection
        downloadFileHandler.close()
    # Return result
    if self.__config[FILE_ADD_AS_METADATA]:
        item.setMetadataValue(FILE_ADD_AS_METADATA_FIELD, filehandler)
        return [item]
    newFileItem = BaseItem(None)
    newFileItem.setParent(item)
    newFileItem.setValue(filehandler)
    return [newFileItem]
def process(self, item):
    """Process a HTML item, retrieving whatever is in config.

    When ``FROM_TEXT`` is set in the configuration the item value is parsed
    directly as HTML text. Otherwise the item value is requested as a URL
    (using the headers from ``__getHeaders`` when it returns any) and the
    response text is parsed. The parsed document is wrapped in a new item
    and handed to a ``BSProcessor`` built with the same configuration.

    :param item: item containing HTML text or a valid web url.
    :return: whatever ``BSProcessor.process`` returns, or ``[]`` when the
        request or the processing fails.
    """
    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit still propagate and the pipeline can be stopped.
    try:
        if FROM_TEXT in self.config and self.config[FROM_TEXT]:
            logger.debug("Using text to obtain BS object")
            htmlBS = BeautifulSoup(item.getValue())
        else:
            logger.debug("Request to " + str(item.getValue()))
            try:
                headers = self.__getHeaders()
                if headers is not None:
                    response = requests.get(item.getValue(), headers=headers)
                else:
                    response = requests.get(item.getValue())
                htmlBS = BeautifulSoup(response.text)
            except Exception:
                logger.error("Some errors requesting item value.Returning [] ")
                traceback.print_exc()
                return []
        bsHtmlItem = BaseItem({"parent": item})
        bsHtmlItem.setValue(htmlBS)
        bsProcessor = BSProcessor(self.config)
        return bsProcessor.process(bsHtmlItem)
    except Exception:
        logger.error("Errors during html processing : ignoring")
        traceback.print_exc()
        return []
def process(self, item):
    """Process a HTML item, retrieving whatever is in config.

    With ``FROM_TEXT`` enabled in the configuration the item value is parsed
    as HTML text; otherwise it is fetched as a URL (with the headers given
    by ``__getHeaders`` when that returns something) and the response text
    is parsed. The resulting document is stored in a new item whose parent
    is ``item`` and processed by a ``BSProcessor`` sharing this configuration.

    :param item: item containing HTML text or a valid web url.
    :return: the result of ``BSProcessor.process``; ``[]`` on any request
        or processing error.
    """
    # Only ``Exception`` subclasses are swallowed: a bare ``except`` would
    # also trap KeyboardInterrupt/SystemExit and make the run unstoppable.
    try:
        if FROM_TEXT in self.config and self.config[FROM_TEXT]:
            logger.debug("Using text to obtain BS object")
            htmlBS = BeautifulSoup(item.getValue())
        else:
            logger.debug("Request to " + str(item.getValue()))
            try:
                headers = self.__getHeaders()
                if headers is not None:
                    response = requests.get(item.getValue(), headers=headers)
                else:
                    response = requests.get(item.getValue())
                htmlBS = BeautifulSoup(response.text)
            except Exception:
                logger.error("Some errors requesting item value.Returning [] ")
                traceback.print_exc()
                return []
        bsHtmlItem = BaseItem({"parent": item})
        bsHtmlItem.setValue(htmlBS)
        bsProcessor = BSProcessor(self.config)
        return bsProcessor.process(bsHtmlItem)
    except Exception:
        logger.error("Errors during html processing : ignoring")
        traceback.print_exc()
        return []
def all(self):
    """Return every record of the collection wrapped in a ``BaseItem``.

    Each record returned by the collection's ``find()`` supplies the item
    metadata (key ``"metadata"``) and the item value (key ``"value"``).

    :return: list of items, in the order ``find()`` yields the records.
    """
    items = []
    for record in self.__collection.find():
        entry = BaseItem(record["metadata"])
        entry.setValue(record["value"])
        items.append(entry)
    return items
def parse_html_from_text_test():
    """HtmlProcessor in FROM_TEXT mode finds the single ``<p class='pclass'>``."""
    source = BaseItem(None)
    source.setValue(
        "<html><head><title>title</title></head>"
        "<body><p class='pclass'> my text </p></html>")
    # Parse the item value itself (no HTTP request) and keep matching <p> tags
    paragraph_filter = {"p": {"class": "pclass"}}
    processor = HtmlProcessor({FROM_TEXT: True, "find": paragraph_filter})
    result = processor.process(source)
    print(result)
    assert len(result) == 1
def all(self):
    """Wrap every record found in the collection in a ``BaseItem``.

    The record's ``"metadata"`` entry becomes the item metadata and its
    ``"value"`` entry becomes the item value.

    :return: list with one item per record returned by ``find()``.
    """
    collected = []
    for document in self.__collection.find():
        wrapped = BaseItem(document["metadata"])
        wrapped.setValue(document["value"])
        collected.append(wrapped)
    return collected
def parse_html_from_text_test():
    """Parsing HTML given as text yields one item for the one matching ``<p>``."""
    markup = ("<html><head><title>title</title></head>"
              "<body><p class='pclass'> my text </p></html>")
    text_item = BaseItem(None)
    text_item.setValue(markup)
    # FROM_TEXT makes the processor parse the item value instead of fetching it
    settings = {FROM_TEXT: True, "find": {"p": {"class": "pclass"}}}
    result = HtmlProcessor(settings).process(text_item)
    print(result)
    assert len(result) == 1
def process(self, item):
    """Run ``internalProcess`` on ``item`` and collect the resulting items.

    :param item: item to process; its value and the item itself are passed
        to ``internalProcess``.
    :return: ``None`` when ``item`` is ``None``. Otherwise a list holding
        the items found or, when ``internalProcess`` returns ``None``, a
        single new item whose parent and value are both ``item``.
    """
    if item is None:
        logger.info("None item : returning None")
        return None
    result = []
    foundItems = self.internalProcess(item.getValue(), item)
    if foundItems is not None:
        logger.info("Found " + str(len(foundItems)) + " item(s)")
        result.extend(foundItems)
    else:
        # ``warning`` is the supported spelling; ``warn`` is a deprecated alias
        logger.warning("No items found")
        resultItem = BaseItem({"parent": item})
        resultItem.setValue(item)
        result.append(resultItem)
    return result
def process(self, item):
    """Store the item value in the file or read the file back as items.

    The behaviour depends on the ``FILE_OP`` configuration value:

    * ``FILE_OP_STORE``: writes the item value plus a newline to the open
      file handler and records the file name in the item metadata.
    * ``FILE_OP_RETRIEVE``: reads the file named by ``FILE_NAME`` and
      creates one new item per line (stripped), each with ``item`` as parent.

    :param item: item to store, or parent of the retrieved items.
    :return: ``[item]`` when storing, the list of line items when retrieving.
    :raises ValueError: when ``FILE_OP`` is neither of the known operations.
    """
    operation = self.__config[FILE_OP]
    if operation == FILE_OP_STORE:
        # Write item value and newline, adds metadata filename and returns item
        self.__filehandler.write(item.getValue() + "\n")
        item.setMetadataValue(FILE_METADATA_FILENAME, self.__config[FILE_NAME])
        return [item]
    if operation == FILE_OP_RETRIEVE:
        # Open the file, strip lines and generate new items; ``with`` makes
        # sure the file is closed once it has been read.
        with open(self.__config[FILE_NAME], "r") as source:
            lines = [line.strip() for line in source]
        result = []
        for line in lines:
            # Metadata must be a dict keyed by "parent" (the original used a
            # set literal) and every line item hangs from the input item.
            lineItem = BaseItem({"parent": item})
            lineItem.setValue(line)
            result.append(lineItem)
        return result
    # String exceptions are not valid; raise a real exception instead
    raise ValueError("Unknown operation")
def process(self, item):
    """Delegate to ``internalProcess`` and return the items it finds.

    :param item: item to process; ``internalProcess`` receives its value and
        the item itself.
    :return: ``None`` for a ``None`` item. Otherwise a list with the found
        items, or with one new item (parent and value set to ``item``) when
        ``internalProcess`` returns ``None``.
    """
    if item is None:
        logger.info("None item : returning None")
        return None
    result = []
    foundItems = self.internalProcess(item.getValue(), item)
    if foundItems is not None:
        logger.info("Found " + str(len(foundItems)) + " item(s)")
        result.extend(foundItems)
    else:
        # Use ``warning``: ``warn`` is only kept as a deprecated alias
        logger.warning("No items found")
        resultItem = BaseItem({"parent": item})
        resultItem.setValue(item)
        result.append(resultItem)
    return result
def all(self):
    """Return one ``BaseItem`` per key held by ``self._r``.

    Every key returned by ``keys()`` is looked up with ``get`` and the value
    is wrapped in an item whose metadata is ``{"origin": "redis"}``.

    :return: list of items, in the order the keys are returned.
    """
    store = self._r
    return [
        BaseItem({"origin": "redis"}, store.get(key))
        for key in store.keys()
    ]
def process(self, item, file_name=None):
    """Download the URL stored in ``item`` into the configured folder.

    The file is written to ``FILE_FOLDER`` (from the configuration) plus the
    sub-folder computed for ``item``; the directory is created if needed.

    :param item: item whose value is the URL to download.
    :param file_name: local file name; when omitted, the last
        ``/``-separated segment of the URL is used.
    :return: ``[item]`` carrying the (already closed) file object as
        metadata when ``FILE_ADD_AS_METADATA`` is set, otherwise a list with
        one new item whose parent is ``item`` and whose value is that file
        object.
    """
    url = item.getValue()
    downloadFileHandler = urllib2.urlopen(url)
    try:
        if file_name is None:
            file_name = url.split('/')[-1]
        directory = self.__config[FILE_FOLDER] + self.__getFileSubfolder(item)
        if not os.path.exists(directory):
            logger.info("Creating folder " + directory)
            os.makedirs(directory)
        # TODO use FILE_ADD_DATE to add date YYYYMMDDhhmm
        logger.info("Downloading filehandler to " + directory + file_name)
        # Downloaded content is raw bytes, so write in binary mode; the
        # context manager closes the file even when writing fails.
        with open(directory + file_name, 'wb') as filehandler:
            filehandler.write(downloadFileHandler.read())
    finally:
        # The HTTP response must be closed whatever happens
        downloadFileHandler.close()
    # Return result
    if self.__config[FILE_ADD_AS_METADATA]:
        item.setMetadataValue(FILE_ADD_AS_METADATA_FIELD, filehandler)
        return [item]
    newFileItem = BaseItem(None)
    newFileItem.setParent(item)
    newFileItem.setValue(filehandler)
    return [newFileItem]
def internalProcess(self, bsObject, item):
    """Extract items from ``bsObject`` according to the configuration.

    Only the first applicable configuration entry is honoured:

    * ``"find"``: mapping of tag name to attribute filter (``None`` means
      no filter); every matching element becomes a new item.
    * ``"get"``: iterable of names; ``bsObject.get(name)`` becomes a new
      item for each of them.
    * ``"text"``: a single item holding ``bsObject.text``.

    When none of them is present the input item is returned as is.

    :param bsObject: object queried through ``find_all``, ``get`` or ``text``.
    :param item: item recorded as ``"parent"`` of every created item.
    :return: list of items.
    :raises ValueError: when the ``"get"`` configuration contains ``None``.
    """
    result = []
    if "find" in self.__config and self.__config["find"] is not None:
        logger.debug("Processing 'find' config " + str(self.__config["find"]))
        findConfigDict = self.__config["find"]
        # Process all find config definitions
        for findKey in findConfigDict:
            if findConfigDict[findKey] is not None:
                foundElements = bsObject.find_all(findKey,
                                                  findConfigDict[findKey])
            else:
                # Same API as the filtered search (``findAll`` is its alias)
                foundElements = bsObject.find_all(findKey)
            for foundElement in foundElements:
                foundItem = BaseItem({"parent": item})
                foundItem.setValue(foundElement)
                result.append(foundItem)
    elif "get" in self.__config and self.__config["get"] is not None:
        logger.debug("Processing 'get' config '" + str(self.__config["get"]) +
                     "'")
        getConfigDict = self.__config["get"]
        for getKey in getConfigDict:
            # Validate before logging: concatenating None would fail first.
            # String exceptions are invalid, so raise a real exception.
            if getKey is None:
                raise ValueError("Error : should provide something to GET")
            logger.debug("Processing GET '" + getKey + "'")
            getItem = BaseItem({"parent": item})
            getItem.setValue(bsObject.get(getKey))
            result.append(getItem)
    elif "text" in self.__config:
        result.append(BaseItem({"parent": item}, bsObject.text))
    else:
        logger.warning("Nothing to process internally. Returning same item")
        result = [item]
    return result
def process(self, item):
    """Write the item value to the file, or load the file lines as items.

    ``FILE_OP`` in the configuration selects the operation:

    * ``FILE_OP_STORE``: the item value and a newline are written to the
      open file handler and the file name is added to the item metadata.
    * ``FILE_OP_RETRIEVE``: the file named by ``FILE_NAME`` is read and a
      new item is created per stripped line, with ``item`` as its parent.

    :param item: item to store, or parent of the retrieved items.
    :return: ``[item]`` when storing, the list of line items when retrieving.
    :raises ValueError: when ``FILE_OP`` is not a known operation.
    """
    operation = self.__config[FILE_OP]
    if operation == FILE_OP_STORE:
        # Write item value and newline, adds metadata filename and returns item
        self.__filehandler.write(item.getValue() + "\n")
        item.setMetadataValue(FILE_METADATA_FILENAME, self.__config[FILE_NAME])
        return [item]
    if operation == FILE_OP_RETRIEVE:
        # Read and strip every line; the context manager closes the file
        with open(self.__config[FILE_NAME], "r") as source:
            lines = [line.strip() for line in source]
        result = []
        for line in lines:
            # The metadata is a dict with a "parent" key (a set literal was
            # used before) and the parent is always the input item, which is
            # why ``item`` is no longer rebound inside the loop.
            lineItem = BaseItem({"parent": item})
            lineItem.setValue(line)
            result.append(lineItem)
        return result
    # Raising a plain string is not valid; use a proper exception
    raise ValueError("Unknown operation")
def getHeaders(self):
    """Build request headers with a user agent chosen at random.

    The candidates are the items a ``FileProcessor`` retrieves from the
    user-agents file: ``./useragents.txt`` unless the configuration provides
    another path under ``RANDOM_USER_AGENT_FILE``.

    :return: dict with a single ``"User-Agent"`` entry.
    """
    from random import choice

    # 1. Resolve the file to read: configured path or the default one
    if self.config is not None and RANDOM_USER_AGENT_FILE in self.config:
        path = self.config[RANDOM_USER_AGENT_FILE]
    else:
        path = "./useragents.txt"
    reader = FileProcessor({FILE_NAME: path, FILE_OP: FILE_OP_RETRIEVE})
    candidates = reader.process(BaseItem(None, path))
    # 2. Pick one of the retrieved items and expose its value
    picked = choice(candidates)
    return {"User-Agent": picked.getValue()}
def parse_html_error_test():
    """Badly formed markup without any matching ``<p>`` yields an empty list."""
    broken_item = BaseItem(None, "<bad format></test>")
    settings = {FROM_TEXT: True, "find": {"p": {"class": "pclass"}}}
    outcome = HtmlProcessor(settings).process(broken_item)
    assert outcome == []
def process(self, item):
    """Run ``__changeTorrentStatus`` on every torrent of a Transmission client.

    The item value must be a ``TransmissionClientConfig``; its host, port,
    user and password are used to open a ``transmissionrpc`` client.

    :param item: item whose value holds the client configuration.
    :return: one item per torrent returned by the client (metadata
        ``{"torrent-transmission": True}``), or ``[]`` when the item value
        is not a client configuration.
    """
    result = []
    clientConfig = item.getValue()
    logger.debug(" Item value type : " + str(type(clientConfig)))
    # isinstance avoids building a throwaway configuration only to compare
    # types, and also accepts subclasses of the configuration class.
    if isinstance(clientConfig, TransmissionClientConfig):
        tclient = transmissionrpc.Client(clientConfig.host, clientConfig.port,
                                         clientConfig.user,
                                         clientConfig.password)
        for t in tclient.get_torrents():
            self.__changeTorrentStatus(tclient, t)
            result.append(BaseItem({"torrent-transmission": True}, t))
    else:
        logger.warning("Unknown item value type " + str(type(clientConfig)))
    return result
def internalProcess(self, bsObject, item):
    """Build items from ``bsObject`` following the configuration.

    The first of these configuration entries that applies is used:

    * ``"find"``: mapping of tag name to attribute filter (``None`` for no
      filter); each matching element is wrapped in a new item.
    * ``"get"``: iterable of names; each ``bsObject.get(name)`` is wrapped
      in a new item.
    * ``"text"``: one item holding ``bsObject.text``.

    If none is configured the input item is returned unchanged.

    :param bsObject: object queried through ``find_all``, ``get`` or ``text``.
    :param item: item stored as ``"parent"`` in the metadata of new items.
    :return: list of items.
    :raises ValueError: when the ``"get"`` configuration contains ``None``.
    """
    result = []
    if "find" in self.__config and self.__config["find"] is not None:
        logger.debug("Processing 'find' config " + str(self.__config["find"]))
        findConfigDict = self.__config["find"]
        # Process all find config definitions
        for findKey in findConfigDict:
            if findConfigDict[findKey] is not None:
                foundElements = bsObject.find_all(findKey,
                                                  findConfigDict[findKey])
            else:
                # Use the same method name as above (``findAll`` is an alias)
                foundElements = bsObject.find_all(findKey)
            for foundElement in foundElements:
                foundItem = BaseItem({"parent": item})
                foundItem.setValue(foundElement)
                result.append(foundItem)
    elif "get" in self.__config and self.__config["get"] is not None:
        logger.debug("Processing 'get' config '" + str(self.__config["get"]) +
                     "'")
        getConfigDict = self.__config["get"]
        for getKey in getConfigDict:
            # Check first: the log line below cannot concatenate None, and a
            # plain string cannot be raised, so use a real exception.
            if getKey is None:
                raise ValueError("Error : should provide something to GET")
            logger.debug("Processing GET '" + getKey + "'")
            getItem = BaseItem({"parent": item})
            getItem.setValue(bsObject.get(getKey))
            result.append(getItem)
    elif "text" in self.__config:
        result.append(BaseItem({"parent": item}, bsObject.text))
    else:
        logger.warning("Nothing to process internally. Returning same item")
        result = [item]
    return result
# Example : http://bandaancha.eu/
import logging

from pype.html import HtmlProcessor, BSProcessor
from pype.model import BaseItem

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# First stage: request the page and keep the <h2 class="title"> elements
htmlprocessor = HtmlProcessor({"find": {"h2": {"class": "title"}}})
# Second stage: within each heading keep the <a itemprop="url"> elements
h2processor = BSProcessor({"find": {"a": {"itemprop": "url"}}})

base = BaseItem(None)
base.setValue("http://bandaancha.eu/")
result = htmlprocessor.process(base)

for e in result:
    if e is None:
        print("NONE")
        continue
    # Show the heading together with the item it was extracted from
    print(str(e.getValue()) + " <<<<< " + str(e.getMetadataValue("parent")))
    aResult = h2processor.process(e)
    for a in aResult:
        print("Found link ----- " + str(a.getValue()))
        print(str(a.getValue().get("href")))
def evaluate_false_AlreadyProcessedCondition():
    """A condition built with an empty config evaluates to false for a new item."""
    candidate = BaseItem(None, "any value")
    outcome = AlreadyProcessedCondition({}).evaluate(candidate)
    assert not outcome
from pype.model import BaseItem
from pype.storage import FILE_NAME, FILE_OP, FILE_OP_STORE, FILE_OP_RETRIEVE,\
    FileProcessor
from pype.extra_processor import LogItemsProcessor

# Sample items "item1" .. "item19"
items = [BaseItem(None, "item" + str(i)) for i in range(1, 20)]

filename = "./test_fileprocessor.txt"

# Store the items into the file (range(1, 2) gives a single pass)
writeprocessor = FileProcessor({FILE_NAME: filename, FILE_OP: FILE_OP_STORE})
for i in range(1, 2):
    writeprocessor.processList(items)

# Read the file back, one item per line
readprocessor = FileProcessor({FILE_NAME: filename, FILE_OP: FILE_OP_RETRIEVE})
result = readprocessor.processList([BaseItem(None, filename)])

# Output results
print("Showin readed items -as strings- from file")
loggerprocessor = LogItemsProcessor(None)
loggerprocessor.processList(result)

print("File " + filename + " will be removed now")
# Delete file
import os
os.remove(filename)

# Checking #Issue_44
filename = "file.sh"
from pype.model import BaseItem

# REDIS Processor Sample
# Requires a valid redis server running

# Datasource config values
redisds = RedisDataSource({
    REDIS_DATASOURCE_CONFIG: {
        REDIS_DATASOURCE_CONFIG_HOST: "192.168.10.10",
        REDIS_DATASOURCE_CONFIG_PORT: "6379",
        REDIS_DATASOURCE_CONFIG_DB: "test",
    },
})
store_processor = RedisStoreProcessor({REDIS_DATASOURCE: redisds})

# Store 10 items ("item0" .. "item9")
items = [BaseItem({}, "item" + str(i)) for i in range(10)]
result = store_processor.processList(items)

# Obtains values; the item handed to the get processor carries the value "ignore"
get_processor = RedisGetProcessor({REDIS_DATASOURCE: redisds})
getresult = get_processor.process(BaseItem({}, "ignore"))
for e in getresult:
    print(str(e))
# Example : http://bandaancha.eu/
import logging

from pype.html import HtmlProcessor, BSProcessor
from pype.model import BaseItem

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Page level: keep every <h2 class="title"> of the requested page
htmlprocessor = HtmlProcessor({"find": {"h2": {"class": "title"}}})
# Heading level: keep every <a itemprop="url"> found inside a heading
h2processor = BSProcessor({"find": {"a": {"itemprop": "url"}}})

base = BaseItem(None)
base.setValue("http://bandaancha.eu/")
result = htmlprocessor.process(base)

for e in result:
    if e is None:
        print("NONE")
        continue
    # Print the heading and the item it comes from
    print(str(e.getValue()) + " <<<<< " + str(e.getMetadataValue("parent")))
    aResult = h2processor.process(e)
    for a in aResult:
        print("Found link ----- " + str(a.getValue()))
        print(str(a.getValue().get("href")))
from pype.model import BaseItem
from pype.extra_processor import AddItemsProcessor, ADDITEMS_PREPEND,\
    ADDITEMS_POSTPEND

# AddItemsProcessor sample: one item is configured to be prepended and a
# different one to be postpended to the processed list.
item1 = BaseItem(None, "item1")
item2 = BaseItem(None, "item2")
processor = AddItemsProcessor({
    ADDITEMS_PREPEND: [item1],
    # item2 was created but never used and item1 appeared in both lists;
    # the postpended item is item2 so that both ends can be told apart.
    ADDITEMS_POSTPEND: [item2],
})

# Process an empty list and show the resulting items
result = processor.processList([])
for e in result:
    print(str(e))
from pype.torrent import (
    TransmissionClientConfig,
    TransmissionChangeStatusProcessor,
    TRANSMISSION_TORRENT_OPERATION,
    TRANSMISSION_TORRENT_CURRENT_STATUS,
    TRANSMISSION_TORRENT_OPERATION_START,
)
from pype.model import BaseItem

# logging
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Transmission torrent change status sample
# Starts all stopped torrents

# Placeholder connection data: replace with the real client values
transmission_config = TransmissionClientConfig("host", "port", "user",
                                               "password")

# Torrents whose current status is 'stopped' get the START operation
processor = TransmissionChangeStatusProcessor({
    TRANSMISSION_TORRENT_CURRENT_STATUS: 'stopped',
    TRANSMISSION_TORRENT_OPERATION: TRANSMISSION_TORRENT_OPERATION_START,
})

# The client configuration travels as the value of the processed item
item = BaseItem(None, transmission_config)
result = processor.process(item)
REDIS_DATASOURCE_CONFIG_HOST, REDIS_DATASOURCE_CONFIG_PORT,\ REDIS_DATASOURCE_CONFIG_DB from pype.model import BaseItem, HASH_ONCE #logging import logging logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) # DRY processor using REDIS redisds = RedisDataSource({REDIS_DATASOURCE_CONFIG:{REDIS_DATASOURCE_CONFIG_HOST:"192.168.10.10", REDIS_DATASOURCE_CONFIG_PORT:"6379", REDIS_DATASOURCE_CONFIG_DB:"test"}}) nonexisting = redisds.get(BaseItem(None,"test")) print "Non existing result " + str(nonexisting) storedItem = BaseItem({HASH_ONCE:True},"stored value into redis") redisds.store(storedItem) existing = redisds.get(storedItem) print "Existing value :[" + str(existing) + "] stored" redisds.delete(storedItem) print "The value has been deleted : should not be found ->" + str(redisds.get(storedItem)) # .all sample for i in range(10): redisds.store(BaseItem(None,"value"+str(i)))
def evaluate_false_ContainsTextCondition_test():
    """The condition is false when the item value lacks the configured text."""
    # "falsetest" does not occur in the item value below
    containsTextCondition = ContainsTextCondition({"value": "falsetest"})
    sample = BaseItem(None, "This is a test value that evaluates to true")
    verdict = containsTextCondition.evaluate(sample)
    assert not verdict
from pype.model import BaseItem
from pype.html import HtmlProcessor

# logging
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Process a URL item with an HtmlProcessor that has an empty configuration
# and print the first resulting item
item = BaseItem(None, "https://github.com")
htmlprocessor = HtmlProcessor({})
result = htmlprocessor.process(item)
first = result[0]
print(str(first))
# Url with all user agents (HTML format) url = "http://www.useragentstring.com/pages/All/" # Steps of pype chain processor # 1. Get HTML from URL and filter by li elements # 2. Get div with id=liste in the resulting html # 3. Get all <a elements within the li elements # 4. Get all a values (text) from step 3 # 5. Store in a file # This file is used in the RandomUserAgentHeadersProvider #logging import logging logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) item = BaseItem(None, url) chain = ChainProcessor({ PROCESSORS_LIST: [ HtmlProcessor(None), BSProcessor({"find": { "div": { "id": "liste" } }}), BSProcessor({"find": { "li": None }}), BSProcessor({"find": { "a": None }}),