예제 #1
0
 def discard_job(self, cmdline):
     """Discard a job after asking for a reason and a confirmation.

     Should only be used if the source is extremely difficult or contains
     unwanted content.

     The email and sourceListId identify the source to discard. The reason
     (cause) is read interactively and sent to the server together with
     them once the user has typed 'discard' to confirm.

     Usage: discardjob [email] [sourceListId]
     """
     # Parse the arguments once and require exactly two tokens; unpacking a
     # longer list into two names would raise ValueError.
     tokens = mu.get_second_arg(cmdline).split()
     if len(tokens) != 2:
         print("Invalid command, Usage: discardjob [email] [sourceListId]")
         return
     email, sourcename = tokens

     print("Please enter the reason for discarding this source:")
     cause = str(raw_input("> "))

     # A cause shorter than two characters (including an empty one) is
     # rejected.
     if len(cause) < 2:
         print("Invalid cause given, please rerun the command with a proper cause.")
         return

     print("---")
     print("Discarding {1} assigned to: {0}".format(email, sourcename))
     print("Cause: '{0}'".format(cause))
     confirm = str(raw_input("To confirm type: 'discard' without quotes: "))

     if confirm != "discard":
         print("Confirmation failed - aborting.")
         return

     # Forget the locally remembered job id before talking to the server.
     self._sourcelist_id = -1

     r, p = self._proxy.discard_job(email, cause, sourcename)
     self._print_result(r, p)
예제 #2
0
    def get_url_and_encoding(self, cmdline):
        """Split the command arguments into a url and an encoding.

        The argument string of ``cmdline`` is obtained with
        ``mu.get_second_arg``; applying ``mu.get_second_arg`` to that string
        again yields the encoding part, which is then removed from the url.

        Returns a dict with the keys "url" and "encoding" (both stripped).
        """
        url = mu.get_second_arg(cmdline).strip()
        encoding = mu.get_second_arg(url).strip()

        if encoding and url.endswith(encoding):
            # Cut the encoding off the end of the string. Removing the first
            # occurrence instead would mangle urls that happen to contain
            # the encoding text themselves (e.g. "http://gbk.example gbk").
            url = url[:len(url) - len(encoding)].strip()
        else:
            # NOTE(review): assumes the encoding is normally the tail of the
            # argument string; otherwise fall back to removing its first
            # occurrence.
            url = url.replace(encoding, "", 1).strip()

        return {"url": url, "encoding": encoding}
예제 #3
0
 def assign_job(self, cmdline):
     """Assign the previously popped job to the given email.

     A job must have been popped first (a positive id stored in
     self._sourcelist_id). After a successful assignment the server is
     asked whether the email is prioritized and a banner is printed if so.

     Usage: assignjob [email]
     """
     email = mu.get_second_arg(cmdline)

     popped_id = self._sourcelist_id
     if not popped_id or not (popped_id > 0):
         print("You must pop a job before assigning one.")
         return

     assigned, assign_msg = self._proxy.assign_job(popped_id, email)
     if not assigned:
         # Assignment failed: report it and stop here.
         self._print_result(assigned, assign_msg)
         return

     prioritized, prio_msg = self._proxy.is_prioritized(email)
     if not prioritized:
         self._print_result(prioritized, prio_msg)
     else:
         banner = "#" * 30
         print(banner)
         print("# This source is marked as high-priority.")
         print("# And the finished transformation should therefore be sent")
         print("# to '*****@*****.**' upon completion.")
         print(banner)

     # The assignment result itself is always reported last.
     self._print_result(assigned, assign_msg)
0
    def add_comment(self, cmdline):
        """Append a comment tagged with the supplied email.

        The source id and the comment text are read interactively. The id
        is passed on exactly as typed; the prompt tells the user that an
        empty id targets the currently assigned job. Nothing is sent unless
        the user confirms with 'yes' or 'y'.

        Usage: addcomment [@email]
        """
        email = mu.get_second_arg(cmdline)
        if len(email) < 3:
            print("You must provide a valid email to comment from.")
            print("see 'man addcomment' for more info.")
            return

        print("Enter id of the source you want to add comment to. Use 'jobinfo' to get id of sources")
        print("Press 'enter' without typing to add comment to your currently assigned job")
        sourceid = str(raw_input())

        print("Type your comment, and submit by pressing 'Enter'")
        print("-- start comment --")
        comment = str(raw_input("> "))
        print("-- end comment --")

        print("Do you want to add the above comment to source with id={0}?".format(sourceid))
        print("Type 'yes' or 'y' to accept, any other key to cancel.")
        reply = raw_input("> ").lower()
        if reply not in ("yes", "y"):
            print("Add comment canceled by user.")
            return

        ok, message = self._proxy.add_comment(email, sourceid, comment)
        self._print_result(ok, message)
예제 #5
0
    def visualise(self, cmdline):
        """Visualise a sample of the pages in transformed.log.

        Converts the source's transformed.log into a temporary
        transformed.xml, selects at most tm.RANDOM_PAGE_AMOUNT of its
        top-level elements through mu.get_randoms, writes each selected
        element to randomPage.xml/randomPage-<i>.xml and converts it to
        randomPage-<i>.html. The temporary transformed.xml is removed at
        the end.

        Usage:visualise [sourcename]
        """
        sourcename = mu.get_second_arg(cmdline).strip()
        source_space = os.path.join(tm.TMONKEY_DIR, 'sources', sourcename)
        # check_files_in_dir yields a truthy value when a file is missing.
        if mu.check_files_in_dir(('transformed.log', ), source_space):
            print("transformed.log does not exists.")
            return

        transformed_log = os.path.join(source_space, 'transformed.log')
        transformed_xml = os.path.join(source_space, 'transformed.xml')
        print("Converting transformed.log to transformed.xml ...")
        mu.convert_log_to_xml(transformed_log, transformed_xml)

        transformed_doc = xml.etree.ElementTree.parse(transformed_xml)
        # list(element) is the supported replacement for the deprecated
        # Element.getchildren().
        transformed_elm = list(transformed_doc.getroot())
        total = len(transformed_elm)
        # Never ask for more pages than the document contains.
        amount = min(tm.RANDOM_PAGE_AMOUNT, total)
        # presumably indices into transformed_elm -- they are used as such
        pages = mu.get_randoms(amount, total)

        randomPage_xml_dir = os.path.join(source_space, 'randomPage.xml')
        if not os.path.exists(randomPage_xml_dir):
            os.mkdir(randomPage_xml_dir)

        for i in range(amount):
            print("creating randomPage-%d.xml ..." % i)
            randomPage_xml = os.path.join(randomPage_xml_dir, 'randomPage-%d.xml' % i)
            randomPage_html = os.path.join(source_space, 'randomPage-%d.html' % i)
            with codecs.open(randomPage_xml, 'w', 'utf-8') as randomPage_file:
                randomPage_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                randomPage_file.write(xml.etree.ElementTree.tostring(transformed_elm[pages[i]]))
            # NOTE(review): the xml/xsl arguments are relative paths, unlike
            # the absolute output path -- confirm what convert_xml_to_html
            # resolves them against.
            mu.convert_xml_to_html('randomPage.xml/randomPage-%d.xml' % i, 'randomPage.xml/visualise.xsl', 'file:/%s' % tm.VISUALISE_CSS_PATH, randomPage_html)

        shutil.copy(tm.VISUALISE_XSL_PATH, os.path.join(randomPage_xml_dir, 'visualise.xsl'))
        os.remove(transformed_xml)
예제 #6
0
 def test_job(self, cmdline):
     """Enter the test phase, which allows new sources to be assigned.

     Usage: testjob [@email]
     """
     email = mu.get_second_arg(cmdline)
     ok, message = self._proxy.test_job(email)
     self._print_result(ok, message)
예제 #7
0
 def job_info(self, cmdline):
     """Print name and id of the current job of the provided email.

     Usage:
     jobinfo [@email]
     """
     ok, message = self._proxy.job_info(mu.get_second_arg(cmdline))
     self._print_result(ok, message)
예제 #8
0
 def parse_cmdline(self, cmdline):
     """Turn a command line into an options dict.

     Returns one of:
       * flag 'local' with the default config file when the whole command
         line is exactly 'sc',
       * flag 'local' with the argument as config_file when the argument
         is an existing path,
       * flag 'db' with the argument as source otherwise.
     """
     target = tu.get_second_arg(cmdline)

     if cmdline == 'sc':
         return {'flag': 'local',
                 'config_file': ts.SUBSOURCE_CRAWLER_CONFIG_FILE_PATH}
     if os.path.exists(target):
         return {'flag': 'local', 'config_file': target}
     return {'flag': 'db', 'source': target}
예제 #9
0
    def get_comment(self, cmdline):
        """Print the comment stored for the given sourceListId.

        The id must consist of digits only (surrounding whitespace is
        tolerated); it is passed to the server exactly as typed.

        Usage: getcomment [sourceListId]
        """
        sourcelist_id = mu.get_second_arg(cmdline)

        # Raw strings keep the backslashes without relying on invalid
        # escape sequences; the resulting text is unchanged.
        if re.match(r"^\s*\d+\s*$", sourcelist_id) is None:
            print(r"The sourceid is not a number according to ^\s*\d+\s*$")
            return

        r, p = self._proxy.get_comment(sourcelist_id)
        self._print_result(r, p)
예제 #10
0
 def free_job(self, cmdline):
     """Release a job, making it available for others.

     Usage: freejob [@email]
            or
            freejob [sourcelistId]
     """
     target = mu.get_second_arg(cmdline)
     ok, message = self._proxy.free_job(target)
     self._print_result(ok, message)

     # Reset the remembered id so that a new job has to be popped first.
     self._sourcelist_id = -1
예제 #11
0
 def get_feedback(self, cmdline):
     """Retrieve feedback of the source from Integrasco Chengdu.

     The argument must be an integer source id; anything else is rejected
     with a message before the server is contacted.
     """
     source_id = mu.get_second_arg(cmdline)

     # Raw string: "\d" is not a valid escape sequence in a normal literal.
     if not re.match(r"^\d+$", source_id):
         print("source id must be integer")
         return

     r, p = self._proxy.get_feedback(source_id)
     self._print_result(r, p)
     
     
예제 #12
0
    def pop_job(self, cmdline):
        """Pop the most prioritized job, optionally limited to one type.

        When a type is given it must be exactly "forum" or "blog"; an empty
        argument is passed to the server unchanged. The id returned by the
        server is stored in self._sourcelist_id, and the returned message
        and comment are printed.

        Usage: popjob [forum|blog]
        """
        # Named job_type so the builtin type() is not shadowed.
        job_type = mu.get_second_arg(cmdline)

        if job_type and job_type not in ("forum", "blog"):
            print("You did not type forum or blog correctly")
            return

        self._sourcelist_id, message, comment = self._proxy.pop_job(job_type)
        print(message)
        print("comment******************")
        print(comment)
        print("*************************")
0
    def test(self, cmdline):
        """Test a source in tmonkey.

        Stores sourcename, max_thread and page_limit on the instance, looks
        up the source directory and source type, then runs test_source with
        a BlogProcess or ForumProcess and the matching config file names.

        Usage:test [sourcename] ([maxThread] ([pageLimit]))
        maxThread and pageLimit are for forum sources and default to 5 and 2.
        """
        if tm.UPDATE_BEFORE_TEST:
            print("Updating directory of source ...")
            mu.update_dir(tm.SOURCE_DIR)

        args = mu.get_second_arg(cmdline).strip().split()
        # Reject both missing and surplus arguments; surplus ones would
        # otherwise leave the attributes below stale or unset.
        if not 1 <= len(args) <= 3:
            print("Invalid command, test [sourcename] ([maxThread] ([pageLimit]))")
            return

        # Fill the optional arguments that were not given with defaults.
        defaults = ['5', '2']
        self.sourcename, self.max_thread, self.page_limit = (
            args + defaults[len(args) - 1:])

        print("Searching directory of %s ..." % self.sourcename)
        self.sourcedir = mu.search_for_source(self.sourcename)
        if not self.sourcedir:
            print("Directory of %s doesn't exist.\n" % self.sourcename)
            return

        self.sourcetype = self.get_source_type()
        if self.sourcetype == 'blog':
            process = BlogProcess(self.sourcename, self.sourcedir)
            config_files = ('%s.xq' % self.sourcename, 'config.xml', 'globalConfig.xml', 'subSourceConfig.xml')
        elif self.sourcetype == 'forum':
            try:
                # int() replaces the deprecated string.atoi().
                max_thread = int(self.max_thread)
            except ValueError:
                print("Invalid maxThread '%s', it must be an integer." % self.max_thread)
                return
            # page_limit is handed over as the string that was typed.
            process = ForumProcess(self.sourcename, self.sourcedir, max_thread, self.page_limit)
            config_files = ('%s-url.xq' % self.sourcename, '%s-thread.xq' % self.sourcename, 'finished.xml', 'webForumConfiguration.xml')
        else:
            # Without this branch `process` would be unbound below.
            print("Unknown source type '%s' for %s." % (self.sourcetype, self.sourcename))
            return

        self.test_source(process, self.sourcedir, config_files)
예제 #14
0
    def validate(self, cmdline):
        """Validate transformed.log and build a report.

        Converts the source's transformed.log into a temporary
        transformed.xml, runs check_item for every <check> entry of the
        document at tm.VALIDATE_XML_PATH, writes the results into
        validate.xml/validate.xml wrapped in a <report> element, converts
        that report to validate.html and finally removes the temporary
        transformed.xml.

        Usage:validate [sourcename]
        """
        sourcename = mu.get_second_arg(cmdline).strip()
        workspace = os.path.join(tm.TMONKEY_DIR, 'sources', sourcename)
        # A truthy result means transformed.log is missing.
        if mu.check_files_in_dir(('transformed.log', ), workspace):
            print("transformed.log does not exists.")
            return

        log_path = os.path.join(workspace, 'transformed.log')
        xml_path = os.path.join(workspace, 'transformed.xml')
        report_dir = os.path.join(workspace, 'validate.xml')
        report_xml = os.path.join(report_dir, 'validate.xml')
        report_html = os.path.join(workspace, 'validate.html')

        print("Converting transformed.log to transformed.xml ...")
        mu.convert_log_to_xml(log_path, xml_path)

        print("Validating transformed.xml ...")
        transformed_doc = xml.etree.ElementTree.parse(xml_path)
        rules_doc = xml.etree.ElementTree.parse(tm.VALIDATE_XML_PATH)

        if not os.path.exists(report_dir):
            os.mkdir(report_dir)

        with codecs.open(report_xml, 'w', 'UTF-8') as report:
            report.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            report.write('<report>')
            for check in rules_doc.findall('check'):
                field = check.find('field')
                print('checking %s ...' % field.text)
                check_item(transformed_doc, field, check.findall('item'), report)
            report.write('</report>')

        mu.convert_xml_to_html(
            'validate.xml/validate.xml',
            'validate.xml/validate.xsl',
            'file:/%s' % tm.VALIDATE_CSS_PATH,
            report_html,
        )
        # The stylesheet is placed next to the generated report.
        shutil.copy(tm.VALIDATE_XSL_PATH, os.path.join(report_dir, 'validate.xsl'))
        os.remove(xml_path)
예제 #15
0
 def commit_job(self, cmdline):
     """Commit a finished job: upload its package, then commit it.

     Usage: commitjob [email] [sourceListId] [package]

     note: package should be archived with zip, with the following structure
     sourcename.zip
         --sourcename.xml
         --sourcename.png                                           //this is logo file
         --webForumConfiguration.xml(forum)
         --sourcename-url.xq & sourcename-thread.xq(forum)
         --finished.xml(forum)
         --sourcename.xq(blog)
         --config.xml(blog)
         --comment.txt

         --sourcename.xml sample:
             <meta>
                 <name>istofsports.co.uk</name>
                 <link>http://istofsports.co.uk</link>
                 <gmt>3</gmt>
             </meta>

     Returns False when the command line is invalid or the upload fails,
     otherwise None.
     """
     args = cmdline.strip().split()

     if len(args) != 4:
         print("Command not valid, see 'man commitjob' for more info.")
         return False

     # args[0] is the command word itself.
     email, source_id, pack_path = args[1:]
     package_name = os.path.basename(pack_path)

     print("Be patient, I hate to wait either!")
     try:
         # The with-block guarantees the package file is closed again.
         with open(pack_path, "rb") as package:
             payload = xmlrpclib.Binary(package.read())
         r, p = self._proxy.upload_file(package_name, payload)
     except Exception:
         # Deliberately broad: file and transport errors are all reported
         # the same way, but KeyboardInterrupt/SystemExit still propagate.
         print("Failed upload package to server, Check whether package is correct?")
         print("Needs commit again")
         return False
     self._print_result(r, p)

     # Ask about the email only, not the whole argument string, in line
     # with how assign_job queries is_prioritized.
     is_prioritized, p2 = self._proxy.is_prioritized(email)
     r, p = self._proxy.commit_job(email, source_id, package_name)

     if not r:
         self._print_result(r, p)
         return

     # Reset so assignjob will give the correct message.
     self._sourcelist_id = -1

     if is_prioritized:
         print("#" * 30)
         print("# The source you just committed is flagged as a high-priority.")
         print("# Please send the transformation to:")
         print("# \t'*****@*****.**'")
         print("# as SOON as possible.")
         print("#" * 30)

     self._print_result(r, p)
예제 #16
0
    def transform(self, cmdline):
        """Transform a single url with Ingentia.

        Copies the source's <sourcename>-thread.xq and
        webForumConfiguration.xml into the Ingentia conf tree, generates a
        5.xq that lists only the given url plus a stub subSourceConfig.xml,
        then changes the working directory to tm.INGENTIA_PATH and runs the
        Ingentia jar with "-c legacythread".

        When no sourcename is given it is derived from the url with
        get_sourcename_from_url.

        Usage:transform [url] ([sourcename])
        """
        args = mu.get_second_arg(cmdline).strip().split()
        if len(args) == 1:
            url = args[0]
            sourcename = self.get_sourcename_from_url(url)
        elif len(args) == 2:
            url, sourcename = args
        else:
            # No arguments or too many: with surplus arguments url and
            # sourcename would otherwise be unbound below.
            print("Invalid command, transform [url] ([sourcename])")
            return

        sourcedir = mu.search_for_source(sourcename)
        if sourcedir is None:
            print("Directory of %s doesn't exist." % sourcename)
            return

        thread_xq_name = '%s-thread.xq' % sourcename
        # check_files_in_dir yields the name of a missing file, if any.
        missing = mu.check_files_in_dir(
            (thread_xq_name, 'webForumConfiguration.xml'), sourcedir)
        if missing:
            print("File [%s] does not exists." % missing)
            return

        source_thread_xq = os.path.join(sourcedir, thread_xq_name)
        source_webForumConfiguration_xml = os.path.join(
            sourcedir, 'webForumConfiguration.xml')

        conf_dir = os.path.join(tm.INGENTIA_PATH, 'conf')
        ingentia_url_xq = os.path.join(conf_dir, 'transformation', '5.xq')
        ingentia_thread_xq = os.path.join(conf_dir, 'transformation', '6.xq')
        ingentia_subSourceConfig_xml = os.path.join(
            conf_dir, 'configuration', 'subSourceConfig.xml')
        ingentia_webForumConfiguration_xml = os.path.join(
            conf_dir, 'configuration', 'webForumConfiguration.xml')

        shutil.copy(source_thread_xq, ingentia_thread_xq)
        shutil.copy(source_webForumConfiguration_xml,
                    ingentia_webForumConfiguration_xml)

        # NOTE(review): the url is written verbatim; characters such as '&'
        # or '{' are not escaped -- confirm whether that is expected.
        with codecs.open(ingentia_url_xq, 'w', 'utf-8') as url_file:
            url_file.write('xquery version "1.0";\n')
            url_file.write('<forum>\n')
            url_file.write('	<threads>\n')
            url_file.write('		<thread>%s</thread>\n' % url)
            url_file.write('	</threads>\n')
            url_file.write('	<pages/>\n')
            url_file.write('</forum>\n')

        with codecs.open(ingentia_subSourceConfig_xml, 'w',
                         'utf-8') as subSourceConfig_file:
            subSourceConfig_file.write(
                '<?xml version="1.0" encoding="UTF-8"?>\n')
            subSourceConfig_file.write(
                '<no.integrasco.immensus.storage.domain.source.SubSource>\n')
            subSourceConfig_file.write('	<subsourceid>1</subsourceid>\n')
            subSourceConfig_file.write('	<name>transform</name>\n')
            subSourceConfig_file.write('	<uri>http://www.google.com/</uri>\n')
            subSourceConfig_file.write(
                '</no.integrasco.immensus.storage.domain.source.SubSource>\n')

        # The working directory change is kept as it was (it persists after
        # this call).
        os.chdir(tm.INGENTIA_PATH)
        # With shell=True the command is passed as one string; a one-element
        # list only works by accident on POSIX.
        subprocess.call("java -jar %s -c legacythread" % tm.INGENTIA_JAR,
                        shell=True)