Пример #1
0
	def parse(self, url):
		"""
		Search for the deliverables page starting at C{url} and process it.

		New handler objects are created on every call, the stored list of
		links is cleared and the actual work is delegated to C{self.main()}.

		@type  url: string
		@param url: String defining initial url for deliverables search.

		@return: Whatever C{self.main()} returns. The value used to be
			discarded, which left callers unable to inspect the outcome.
		"""
		# URL of the project
		self.opt_url = url

		# initialize main html handler and parser
		self.htmlhandler = GetHTMLAndParse()

		# searching deliverable page
		self.pagesearch = GetDelivPage(self.opt_url,
			verbose=self.opt['verbose'],
			debug=self.opt['debug'])

		# extracting information from page
		self.recordhandler = GetDelivRecords(debug=self.opt['debug'])

		# Start without any known links
		# NOTE(review): presumably main() then searches for the page itself
		# -- main() is defined outside this excerpt, confirm there.
		self.links = None
		return self.main()
Пример #2
0
    def __init__(self, options=opt, url=None):
        """
        Store the options and create the helper objects used for extraction.

        @param options: Object exposing the settings as attributes. This
            method reads C{verbose}, C{debug} and C{regexp} from it, and
            C{url} when no C{url} argument is given.

        @type  url: string
        @param url: Start URL. When None, C{options.url} is used instead.
        """
        # get options
        self.opt = options

        # An explicitly passed URL takes precedence over the one in the
        # options; None is tested by identity, not by equality.
        if url is not None:
            self.opt_url = url
        else:
            self.opt_url = self.opt.url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt.verbose,
                                       debug=self.opt.debug,
                                       addkeyw=self.opt.regexp)

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt.debug)
Пример #3
0
		    def __init__(self, options=opt, url=None):
		        """
		        Store the options and create the helper objects used for extraction.

		        @param options: Object exposing the settings as attributes. This
		            method reads C{verbose}, C{debug} and C{regexp} from it, and
		            C{url} when no C{url} argument is given.

		        @type  url: string
		        @param url: Start URL. When None, C{options.url} is used instead.
		        """
		        # get options
		        self.opt = options

		        # An explicitly passed URL takes precedence over the one in the
		        # options; None is tested by identity, not by equality.
		        if url is not None:
		            self.opt_url = url
		        else:
		            self.opt_url = self.opt.url

		        # initialize main html handler and parser
		        self.htmlhandler = GetHTMLAndParse()

		        # searching deliverable page
		        self.pagesearch = GetDelivPage(self.opt_url,
		                                       verbose=self.opt.verbose,
		                                       debug=self.opt.debug,
		                                       addkeyw=self.opt.regexp)

		        # extracting information from page
		        self.recordhandler = GetDelivRecords(debug=self.opt.debug)
Пример #4
0
class Deliverables:
    """
    Front end of the deliverables extractor.

    Depending on the options it either searches for the deliverables page or
    takes the given URL as that page, lets the record handler process the
    page(s), attaches the records to an RRSProject, converts the project to
    XML and finally prints the XML or stores it in a file.
    """

    # static options
    opt = {
        'debug': False,
        'verbose': False,
        'regexp': None,
        'quiet': False,
        'page': False,
        'file': None,
        'storefile': False
    }

    class _Options(object):
        """Attribute-style view of an options dictionary."""

        def __init__(self, values):
            self.__dict__.update(values)

    def __init__(self, options=opt, url=None):
        """
        Store the options and create the helper objects used by main().

        @param options: Settings, either an object exposing them as
            attributes or a dictionary. A dictionary is completed with the
            class defaults and wrapped so that attribute access works.

        @type  url: string
        @param url: Start URL. When None, C{options.url} is used instead;
            the class defaults contain no C{url}, so an AttributeError is
            raised when neither source provides one.
        """
        # Every setting is read as an attribute (self.opt.debug, ...), which
        # a plain dictionary -- including the default one -- cannot offer.
        if isinstance(options, dict):
            merged = dict(Deliverables.opt)
            merged.update(options)
            options = Deliverables._Options(merged)
        self.opt = options

        if url is not None:
            self.opt_url = url
        else:
            self.opt_url = self.opt.url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt.verbose,
                                       debug=self.opt.debug,
                                       addkeyw=self.opt.regexp)

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt.debug)

    def __debug(self, msg):
        """Print C{msg} on STDOUT when the debug option is set."""
        if self.opt.debug:
            print("Debug message:    " + str(msg))

    def main(self):
        """
        Main method handling all objects.

        @return: The RRSProject when the record handler delivered a list of
            records; the result of the page search when its first item is
            -1; otherwise the non-list value delivered by the record handler.
        """
        # Searching deliverable page
        if self.opt.page:
            self.links = [self.opt_url]
        else:
            self.links = self.pagesearch.get_deliverable_page()

            # NOTE(review): -1 as first item looks like the error form of
            # the search result -- confirm against GetDelivPage.
            if self.links[0] == -1:
                return self.links

            if self.opt.verbose:
                # Single-argument calls print the same text under the
                # Python 2 print statement and the print function.
                print("*" * 80)
                print("Deliverable page:  " + " ".join(self.links))
                print("*" * 80)

        pr = RRSProject()

        # Project - Url relationship
        if not self.opt.page:
            pr_url = RRSUrl(link=self.opt_url)
            pr_url_rel = RRSRelationshipProjectUrl()
            pr_url_rel.set_entity(pr_url)
            pr['url'] = pr_url_rel

        self.recordhandler.process_pages(self.links)
        records = self.recordhandler.get_deliverables()

        # Anything but a list is handed back to the caller untouched.
        if not isinstance(records, list):
            return records

        # create relationship Project Publication
        for record in records:
            rel = RRSRelationshipPublicationProject()
            rel.set_entity(record)
            pr['publication'] = rel

        # create XML from RRSProject
        output = StringIO.StringIO()
        try:
            Model2XMLConverter(stream=output).convert(pr)
            out = output.getvalue()
        finally:
            output.close()

        # Either store the XML into a file or print it
        if self.opt.storefile:
            status = self._storeToFile(self.opt_url, out)
            # test if store ok
            if status[0] != 1:
                print(status[1])
        else:
            print(out.encode('UTF-8'))
        return pr

    def _storeToFile(self, url, res):
        """
        Generate a file name from C{url} and save C{res} into that file in
        the current working directory.

        @type  url: string
        @param url: URL the file name is derived from: ':' becomes '.',
            the characters '/', '?' and '#' are dropped, '.xml' is appended.

        @type  res: string
        @param res: Text to store; it is written UTF-8 encoded.

        @return: (1, 'OK') on success, (-1, message) when the file cannot be
            created, (-2, message) when the data cannot be written.
        """
        name = url.replace(':', '.')
        for char in "/?#":
            name = name.replace(char, "")
        filepath = os.path.join(os.getcwd(), name + ".xml")

        # Binary mode: the data is already encoded and must reach the disk
        # byte for byte.
        try:
            fw = open(filepath, "wb")
        except (IOError, OSError):
            return (-1, 'Cannot make output file.')

        try:
            fw.write(res.encode('UTF-8'))
            fw.flush()
        except (IOError, OSError, UnicodeError):
            return (-2, 'Cannot write data to output file.')
        finally:
            # Close on every path so a failed write leaks no file handle.
            fw.close()

        return (1, 'OK')
Пример #5
0
class Deliverables:
    """
    Front end of the deliverables extractor.

    Depending on the options it either searches for the deliverables page or
    takes the given URL as that page, lets the record handler process the
    page(s), attaches the records to an RRSProject, converts the project to
    XML and finally prints the XML or stores it in a file.
    """

    # static options
    opt = {
        'debug': False,
        'verbose': False,
        'regexp': None,
        'quiet': False,
        'page': False,
        'file': None,
        'storefile': False
    }

    class _Options(object):
        """Attribute-style view of an options dictionary."""

        def __init__(self, values):
            self.__dict__.update(values)

    def __init__(self, options=opt, url=None):
        """
        Store the options and create the helper objects used by main().

        @param options: Settings, either an object exposing them as
            attributes or a dictionary. A dictionary is completed with the
            class defaults and wrapped so that attribute access works.

        @type  url: string
        @param url: Start URL. When None, C{options.url} is used instead;
            the class defaults contain no C{url}, so an AttributeError is
            raised when neither source provides one.
        """
        # Every setting is read as an attribute (self.opt.debug, ...), which
        # a plain dictionary -- including the default one -- cannot offer.
        if isinstance(options, dict):
            merged = dict(Deliverables.opt)
            merged.update(options)
            options = Deliverables._Options(merged)
        self.opt = options

        if url is not None:
            self.opt_url = url
        else:
            self.opt_url = self.opt.url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt.verbose,
                                       debug=self.opt.debug,
                                       addkeyw=self.opt.regexp)

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt.debug)

    def __debug(self, msg):
        """Print C{msg} on STDOUT when the debug option is set."""
        if self.opt.debug:
            print("Debug message:    " + str(msg))

    def main(self):
        """
        Main method handling all objects.

        @return: The RRSProject when the record handler delivered a list of
            records; the result of the page search when its first item is
            -1; otherwise the non-list value delivered by the record handler.
        """
        # Searching deliverable page
        if self.opt.page:
            self.links = [self.opt_url]
        else:
            self.links = self.pagesearch.get_deliverable_page()

            # NOTE(review): -1 as first item looks like the error form of
            # the search result -- confirm against GetDelivPage.
            if self.links[0] == -1:
                return self.links

            if self.opt.verbose:
                # Single-argument calls print the same text under the
                # Python 2 print statement and the print function.
                print("*" * 80)
                print("Deliverable page:  " + " ".join(self.links))
                print("*" * 80)

        pr = RRSProject()

        # Project - Url relationship
        if not self.opt.page:
            pr_url = RRSUrl(link=self.opt_url)
            pr_url_rel = RRSRelationshipProjectUrl()
            pr_url_rel.set_entity(pr_url)
            pr['url'] = pr_url_rel

        self.recordhandler.process_pages(self.links)
        records = self.recordhandler.get_deliverables()

        # Anything but a list is handed back to the caller untouched.
        if not isinstance(records, list):
            return records

        # create relationship Project Publication
        for record in records:
            rel = RRSRelationshipPublicationProject()
            rel.set_entity(record)
            pr['publication'] = rel

        # create XML from RRSProject
        output = StringIO.StringIO()
        try:
            Model2XMLConverter(stream=output).convert(pr)
            out = output.getvalue()
        finally:
            output.close()

        # Either store the XML into a file or print it
        if self.opt.storefile:
            status = self._storeToFile(self.opt_url, out)
            # test if store ok
            if status[0] != 1:
                print(status[1])
        else:
            print(out.encode('UTF-8'))
        return pr

    def _storeToFile(self, url, res):
        """
        Generate a file name from C{url} and save C{res} into that file in
        the current working directory.

        @type  url: string
        @param url: URL the file name is derived from: ':' becomes '.',
            the characters '/', '?' and '#' are dropped, '.xml' is appended.

        @type  res: string
        @param res: Text to store; it is written UTF-8 encoded.

        @return: (1, 'OK') on success, (-1, message) when the file cannot be
            created, (-2, message) when the data cannot be written.
        """
        name = url.replace(':', '.')
        for char in "/?#":
            name = name.replace(char, "")
        filepath = os.path.join(os.getcwd(), name + ".xml")

        # Binary mode: the data is already encoded and must reach the disk
        # byte for byte.
        try:
            fw = open(filepath, "wb")
        except (IOError, OSError):
            return (-1, 'Cannot make output file.')

        try:
            fw.write(res.encode('UTF-8'))
            fw.flush()
        except (IOError, OSError, UnicodeError):
            return (-2, 'Cannot write data to output file.')
        finally:
            # Close on every path so a failed write leaks no file handle.
            fw.close()

        return (1, 'OK')
Пример #6
0
class Deliverables:
    """
    Class implementing interface for purpose of using this module in other projects
    """

    # RRSProject built by the last successful run of main(), otherwise None
    pr = None

    # UTF-8 encoded RRS XML produced by the last successful run of main()
    deliverables_rrs_xml = ""

    # Class-wide default patterns; every instance works on its own copy
    regexps = []

    def __init__(self, debug=False, verbose=False, quiet=True):
        """
        Constructor of the class. Initialize deliverables extractor interface

        @type  debug: boolean
        @param debug: Prints debugging additional information

        @type  quiet: boolean
        @param quiet: No function will output anything on STDOUT when True.

        @type  verbose: boolean
        @param verbose: Prints additional information about parsing on STDOUT when True.

        """
        self.opt = {
            # honour the argument; it used to be hard-coded to False
            'debug': debug,
            'verbose': verbose,
            'regexp': None,
            'quiet': quiet,
            # We actually do not permit selecting single page without search
            # in this version of interface
            'page': False,
            'file': None,
            # Mechanism of storing file has been overloaded
            # No file is stored. Output RRS-XML is stored in attribute instead
            'storefile': True}

        # Private copy, so add_regexp()/remove_regexp() on one instance do
        # not leak into every other instance through the class-level list.
        self.regexps = list(self.regexps)

        self.links = None
        self.records = []

    def parse(self, url):
        """
        Finds deliverables page and parse data

        @type  url: string
        @param url: String defining initial url for deliverables search.

        @return: The value returned by L{main}.
        """
        # URL of the project
        self.opt_url = url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt['verbose'],
                                       debug=self.opt['debug'])

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt['debug'])

        # No links known yet, main() has to search for the page
        self.links = None
        return self.main()

    def parse_page(self, deliverables_url):
        """
        Parse data from the given deliverables page without searching for it

        @type  deliverables_url: string
        @param deliverables_url: String defining url for deliverables extraction.

        @return: The value returned by L{main}.
        """
        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # extracting information from page
        self.recordhandler = GetDelivRecords(debug=self.opt['debug'])

        # URL of the project
        self.opt_url = deliverables_url

        # The page is already known, so main() skips the search
        self.links = [deliverables_url]
        return self.main()

    def main(self):
        """
        Method implementing actions chosen by parameters in constructor.

        @return: The RRSProject when the record handler delivered a list of
            records; the list of links when its first item is -1; otherwise
            the non-list value delivered by the record handler.
        """
        # Forget the outcome of an earlier run, so that a failing run is not
        # reported with the data of the previous project.
        self.pr = None
        self.records = []
        self.deliverables_rrs_xml = ""

        # Searching deliverable page
        if not self.links:
            self.pagesearch._sigwords.extend(self.regexps)
            self.links = self.pagesearch.get_deliverable_page()

        # NOTE(review): -1 as first item looks like the error form of the
        # search result -- confirm against GetDelivPage.
        if self.links[0] == -1:
            return self.links

        if self.opt['verbose']:
            # Single-argument calls print the same text under the Python 2
            # print statement and the print function.
            print("*" * 80)
            print("Deliverable page:  " + " ".join(self.links))
            print("*" * 80)

        project = RRSProject()

        # Project - Url relationship
        if not self.opt['page']:
            pr_url = RRSUrl(link=self.opt_url)
            pr_url_rel = RRSRelationshipProjectUrl()
            pr_url_rel.set_entity(pr_url)
            project['url'] = pr_url_rel

        self.recordhandler.process_pages(self.links)
        records = self.recordhandler.get_deliverables()

        # Anything but a list is handed back to the caller untouched.
        if not isinstance(records, list):
            return records

        # create relationship Project Publication
        for record in records:
            rel = RRSRelationshipPublicationProject()
            rel.set_entity(record)
            project['publication'] = rel

        # Publish the results only now that every record has been attached.
        self.records = records
        self.pr = project

        # Create XML from RRSProject once, after the loop; converting and
        # returning inside the loop dropped every record but the first.
        output = StringIO.StringIO()
        try:
            Model2XMLConverter(stream=output).convert(project)
            out = output.getvalue()
        finally:
            output.close()

        # Either store the XML (see _storeToFile) or print it
        if self.opt['storefile']:
            status = self._storeToFile(self.opt_url, out)
            # test if store ok
            if status[0] != 1:
                print(status[1])
        else:
            print(out.encode('UTF-8'))
        return self.pr

    def _storeToFile(self, url, res):
        """
        Overrides method from original Deliverables class. This method just saves
        the RRS XML string to object attribute.

        @type  res: string
        @param res: Output RRS XML string for writing into object attribute.

        @type  url: string
        @param url: For compatibility with Deliverables class method only. It is not used.

        @return:  (1, 'OK')
        """
        self.deliverables_rrs_xml = res.encode('UTF-8')

        return (1, 'OK')

    def get_deliverables(self):
        """
        Access method to object of project with references to all parsed
        deliverables. No parsing is started by this method.

        @return:  RRSProject instance of the last successful run, else None
        """
        return self.pr

    def get_rrs_xml(self):
        """
        Access method to the RRS XML of the last successful run. No parsing
        is started by this method.

        @return:  String with RRS XML, empty when there is no result
        """
        return self.deliverables_rrs_xml

    def get_json(self):
        """
        Access method to data in form of JSON string.

        @return:  String in JSON
        """
        return xml2json(self.get_rrs_xml())

    def get_list(self):
        """
        Access method to the records of the last successful run.

        @return:  List of the parsed records, empty when there is no result
        """
        return self.records

    def __debug(self, msg):
        """
        Prints debug message.

        @type  msg: string
        @param msg: String for printing on STDOUT
        """
        # self.opt is a dictionary, attribute access would raise here
        if self.opt['debug']:
            print("Debug message: " + str(msg))

    def add_regexp(self, regexp):
        """
        Adds a pattern to the list handed to the deliverables page search.

        @type  regexp: string
        @param regexp: Regular expression pattern for adding to deliverables page ranking regexp list
        """
        self.regexps.append(regexp)

    def remove_regexp(self, regexp):
        """
        Removes a pattern from the list handed to the deliverables page search.

        @type  regexp: string
        @param regexp: Regular expression pattern for remove from deliverables page ranking regexp list

        @raise ValueError: When the pattern is not in the list.
        """
        self.regexps.remove(regexp)