def discard_job(self, cmdline):
    """ Will discard your current job. Should only be used if the source is
    extremely difficult or contains unwanted content like for example p**n.
    The @email is for finding the source that should be discarded, the
    @cause is the reason for discarding the source (asked for interactively).
    Usage: discardjob [email] [sourceListId]"""
    # Parse once. Exactly two arguments are required: the old "> 1" check let
    # three or more through and then crashed on the two-name unpack below.
    parts = mu.get_second_arg(cmdline).split()
    if len(parts) != 2:
        print("Invalid command, Usage: discardjob [email] [sourceListId]")
        return
    email, sourcename = parts

    print("Please enter the reason for discarding this source:")
    cause = str(raw_input("> "))
    # An empty or single-character reason is rejected.
    if len(cause) < 2:
        print("Invalid cause given, please rerun the command with a proper cause.")
        return

    print("---")
    print("Discarding {1} assigned to: {0}".format(email, sourcename))
    print("Cause: '{0}'".format(cause))
    confirm = str(raw_input("To confirm type: 'discard' without quotes: "))
    if confirm != "discard":
        print("Confirmation failed - aborting.")
        return

    # Forget the locally popped job id before asking the server to discard.
    self._sourcelist_id = -1
    r, p = self._proxy.discard_job(email, cause, sourcename)
    self._print_result(r, p)
def get_url_and_encoding(self, cmdline):
    """Split the command arguments into a url and an optional encoding.

    Returns a dict with the keys "url" and "encoding"; "encoding" is
    whatever mu.get_second_arg yields for the argument string itself.
    """
    # NOTE(review): presumably get_second_arg returns everything after the
    # first whitespace-separated token - confirm against mu.
    url = mu.get_second_arg(cmdline).strip()
    encoding = mu.get_second_arg(url).strip()
    if encoding and url.endswith(encoding):
        # Cut the encoding off the tail. Replacing the first occurrence
        # would damage urls that contain the encoding name themselves
        # (e.g. "http://utf-8.org utf-8").
        url = url[:-len(encoding)].strip()
    else:
        # Fallback keeps the previous behaviour for unexpected helper output.
        url = url.replace(encoding, "", 1).strip()
    return {"url": url, "encoding": encoding}
def assign_job(self, cmdline):
    """Assigns the job you just popped
    Usage: assignjob [email]"""
    email = mu.get_second_arg(cmdline)

    # A job id must have been stored on this session before assigning.
    if not (self._sourcelist_id and self._sourcelist_id > 0):
        print("You must pop a job before assigning one.")
        return

    assigned, assign_msg = self._proxy.assign_job(self._sourcelist_id, email)
    if not assigned:
        self._print_result(assigned, assign_msg)
        return

    prioritized, priority_msg = self._proxy.is_prioritized(email)
    if not prioritized:
        self._print_result(prioritized, priority_msg)
    else:
        banner = (
            "#" * 30,
            "# This source is marked as high-priority.",
            "# And the finished transformation should therefore be sent",
            "# to '*****@*****.**' upon completion.",
            "#" * 30,
        )
        for banner_line in banner:
            print(banner_line)

    # The assignment result is reported last, after the priority notice.
    self._print_result(assigned, assign_msg)
def add_comment(self, cmdline):
    """ Appends a comment tagged with supplied email and current time.
    Can append comment to your currently assigned job, or a source you
    identify with id.
    Usage: addcomment [@email] """
    email = mu.get_second_arg(cmdline)
    # Only a minimal length check is done on the email.
    if len(email) < 3:
        print("You must provide a valid email to comment from.")
        print("see 'man addcomment' for more info.")
        return

    print("Enter id of the source you want to add comment to. Use 'jobinfo' to get id of sources")
    print("Press 'enter' without typing to add comment to your currently assigned job")
    sourceid = str(raw_input())

    print("Type your comment, and submit by pressing 'Enter'")
    print("-- start comment --")
    comment = str(raw_input("> "))
    print("-- end comment --")

    print("Do you want to add the above comment to source with id={0}?".format(sourceid))
    print("Type 'yes' or 'y' to accept, any other key to cancel.")
    answer = raw_input("> ")
    if answer.lower() not in ("yes", "y"):
        print("Add comment canceled by user.")
        return

    ok, payload = self._proxy.add_comment(email, sourceid, comment)
    self._print_result(ok, payload)
def visualise(self, cmdline):
    """Visualise transformed.log
    Usage:visualise [sourcename]"""
    sourcename = mu.get_second_arg(cmdline).strip()
    source_space = os.path.join(tm.TMONKEY_DIR, 'sources', sourcename)
    # NOTE(review): check_files_in_dir is treated as truthy when a listed
    # file is missing (see the message below) - confirm against mu.
    if mu.check_files_in_dir(('transformed.log', ), source_space):
        print("transformed.log does not exists.")
        return

    transformed_log = os.path.join(source_space, 'transformed.log')
    transformed_xml = os.path.join(source_space, 'transformed.xml')
    randomPage_xml_dir = os.path.join(source_space, 'randomPage.xml')
    try:
        print("Converting transformed.log to transformed.xml ...")
        mu.convert_log_to_xml(transformed_log, transformed_xml)
        transformed_doc = xml.etree.ElementTree.parse(transformed_xml)
        # list(element) replaces the deprecated Element.getchildren().
        transformed_elm = list(transformed_doc.getroot())

        total = len(transformed_elm)
        # Never ask for more pages than the log contains.
        amount = min(total, tm.RANDOM_PAGE_AMOUNT)
        pages = mu.get_randoms(amount, total)

        if not os.path.exists(randomPage_xml_dir):
            os.mkdir(randomPage_xml_dir)

        for i in range(amount):
            print("creating randomPage-%d.xml ..." % i)
            randomPage_xml = os.path.join(randomPage_xml_dir,
                                          'randomPage-%d.xml' % i)
            randomPage_html = os.path.join(source_space,
                                           'randomPage-%d.html' % i)
            with codecs.open(randomPage_xml, 'w', 'utf-8') as randomPage_file:
                randomPage_file.write(
                    '<?xml version="1.0" encoding="UTF-8"?>\n')
                randomPage_file.write(
                    xml.etree.ElementTree.tostring(transformed_elm[pages[i]]))
            # Convert only after the page file has been closed and flushed.
            mu.convert_xml_to_html('randomPage.xml/randomPage-%d.xml' % i,
                                   'randomPage.xml/visualise.xsl',
                                   'file:/%s' % tm.VISUALISE_CSS_PATH,
                                   randomPage_html)

        shutil.copy(tm.VISUALISE_XSL_PATH,
                    os.path.join(randomPage_xml_dir, 'visualise.xsl'))
    finally:
        # transformed.xml is only an intermediate file; remove it even when
        # one of the steps above fails.
        if os.path.exists(transformed_xml):
            os.remove(transformed_xml)
def test_job(self, cmdline):
    """Will enter test phase, allow assign new sources
    Usage: testjob [@email]"""
    # The argument is forwarded to the server as-is and the reply printed.
    address = mu.get_second_arg(cmdline)
    ok, payload = self._proxy.test_job(address)
    self._print_result(ok, payload)
def job_info(self, cmdline):
    """ Print name and id of the provided email's current job
    Usage: jobinfo [@email] """
    # No local validation: the server reply is printed whatever it is.
    address = mu.get_second_arg(cmdline)
    ok, payload = self._proxy.job_info(address)
    self._print_result(ok, payload)
def parse_cmdline(self, cmdline):
    """Decide where the configuration for a run comes from.

    Returns a dict whose 'flag' is 'local' (with a 'config_file' path) or
    'db' (with the 'source' argument).
    """
    target = tu.get_second_arg(cmdline)

    # The bare 'sc' command uses the default sub-source crawler config.
    if cmdline == 'sc':
        return {'flag': 'local',
                'config_file': ts.SUBSOURCE_CRAWLER_CONFIG_FILE_PATH}

    # An argument naming an existing path is used as a local config file.
    if os.path.exists(target):
        return {'flag': 'local', 'config_file': target}

    # Anything else is handed on as a source for the 'db' flag.
    return {'flag': 'db', 'source': target}
def get_comment(self, cmdline):
    """Will return the comment for the given sourcelistId
    Usage: getcomment [sourceListId]"""
    sourcelist_id = mu.get_second_arg(cmdline)
    # Raw strings keep the backslashes literal (same runtime text as before)
    # and the None test uses identity, not equality.
    if re.match(r"^\s*\d+\s*$", sourcelist_id) is None:
        print(r"The sourceid is not a number according to ^\s*\d+\s*$")
        return
    r, p = self._proxy.get_comment(sourcelist_id)
    self._print_result(r, p)
def free_job(self, cmdline):
    """ This command will release your current job, making it available
    for others.
    Usage: freejob [@email] or freejob [sourcelistId] """
    target = mu.get_second_arg(cmdline)
    ok, payload = self._proxy.free_job(target)
    self._print_result(ok, payload)
    # Drop the remembered job id so a new job has to be popped first.
    self._sourcelist_id = -1
def get_feedback(self, cmdline):
    """Retrieve feedback of the source from Integrasco Chengdu """
    source_id = mu.get_second_arg(cmdline)
    # Only a plain run of digits is accepted as an id.
    id_is_numeric = re.match(r"^\d+$", source_id) is not None
    if not id_is_numeric:
        print("source id must be integer")
        return
    ok, payload = self._proxy.get_feedback(source_id)
    self._print_result(ok, payload)
def pop_job(self, cmdline):
    """pops the most prioritized job
    Usage: popjob """
    # An optional argument may restrict the pop to "forum" or "blog";
    # an empty argument is passed through unchanged.
    job_type = mu.get_second_arg(cmdline)
    if len(job_type) > 0 and job_type not in ("forum", "blog"):
        print("You did not type forum or blog correctly")
        return

    job_id, message, comment = self._proxy.pop_job(job_type)
    # Remember the popped id on this session.
    self._sourcelist_id = job_id
    print(message)
    print("comment******************")
    print(comment)
    print("*************************")
def test(self, cmdline):
    """test source in tmonkey
    Usage:test [sourcename] ([maxThread] ([pageLimit]))
    maxThread and pageLimit for forum"""
    if tm.UPDATE_BEFORE_TEST:
        print("Updating directory of source ...")
        mu.update_dir(tm.SOURCE_DIR)

    args = mu.get_second_arg(cmdline).strip().split()
    # One to three arguments are valid. More than three used to fall through
    # every branch and run with stale or missing attributes.
    if not 1 <= len(args) <= 3:
        print("Invalid command, test [sourcename] ([maxThread] ([pageLimit]))")
        return
    self.sourcename = args[0]
    self.max_thread = args[1] if len(args) > 1 else '5'
    self.page_limit = args[2] if len(args) > 2 else '2'

    print("Searching directory of %s ..." % self.sourcename)
    self.sourcedir = mu.search_for_source(self.sourcename)
    if not self.sourcedir:
        print("Directory of %s doesn't exist.\n" % self.sourcename)
        return

    self.sourcetype = self.get_source_type()
    if self.sourcetype == 'blog':
        process = BlogProcess(self.sourcename, self.sourcedir)
        config_files = ('%s.xq' % self.sourcename,
                        'config.xml',
                        'globalConfig.xml',
                        'subSourceConfig.xml')
    elif self.sourcetype == 'forum':
        # int() replaces the deprecated string.atoi; a non-numeric value is
        # reported instead of raising ValueError.
        try:
            max_thread = int(self.max_thread)
        except ValueError:
            print("Invalid maxThread '%s', it must be an integer." % self.max_thread)
            return
        # page_limit is handed over as the string it was given as.
        process = ForumProcess(self.sourcename, self.sourcedir,
                               max_thread, self.page_limit)
        config_files = ('%s-url.xq' % self.sourcename,
                        '%s-thread.xq' % self.sourcename,
                        'finished.xml',
                        'webForumConfiguration.xml')
    else:
        # Previously an unexpected type left "process" unbound (NameError).
        print("Unknown source type '%s' for %s." % (self.sourcetype, self.sourcename))
        return

    self.test_source(process, self.sourcedir, config_files)
def validate(self, cmdline):
    """Validate transformed.log and build report
    Usage:validate [sourcename]"""
    sourcename = mu.get_second_arg(cmdline).strip()
    source_space = os.path.join(tm.TMONKEY_DIR, 'sources', sourcename)
    # NOTE(review): check_files_in_dir is treated as truthy when a listed
    # file is missing (see the message below) - confirm against mu.
    if mu.check_files_in_dir(('transformed.log', ), source_space):
        print("transformed.log does not exists.")
        return

    transformed_log = os.path.join(source_space, 'transformed.log')
    transformed_xml = os.path.join(source_space, 'transformed.xml')
    validate_xml_dir = os.path.join(source_space, 'validate.xml')
    validate_xml = os.path.join(validate_xml_dir, 'validate.xml')
    validate_html = os.path.join(source_space, 'validate.html')

    try:
        print("Converting transformed.log to transformed.xml ...")
        mu.convert_log_to_xml(transformed_log, transformed_xml)

        print("Validating transformed.xml ...")
        transformed_doc = xml.etree.ElementTree.parse(transformed_xml)
        validate_doc = xml.etree.ElementTree.parse(tm.VALIDATE_XML_PATH)

        if not os.path.exists(validate_xml_dir):
            os.mkdir(validate_xml_dir)

        # The report is one <report> element; check_item writes into it for
        # every <check> entry of the validation definition.
        with codecs.open(validate_xml, 'w', 'UTF-8') as validate_file:
            validate_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            validate_file.write('<report>')
            for c in validate_doc.findall('check'):
                print('checking %s ...' % c.find('field').text)
                check_item(transformed_doc, c.find('field'),
                           c.findall('item'), validate_file)
            validate_file.write('</report>')

        # Convert only after the report file has been closed.
        mu.convert_xml_to_html('validate.xml/validate.xml',
                               'validate.xml/validate.xsl',
                               'file:/%s' % tm.VALIDATE_CSS_PATH,
                               validate_html)
        shutil.copy(tm.VALIDATE_XSL_PATH,
                    os.path.join(validate_xml_dir, 'validate.xsl'))
    finally:
        # transformed.xml is only an intermediate file; remove it even when
        # one of the steps above fails.
        if os.path.exists(transformed_xml):
            os.remove(transformed_xml)
def commit_job(self, cmdline):
    """When finished with a job
    Usage: commitjob [email] [sourceListId] [package]

    note: package should archive with zip, with the following structure
    sourcename.zip
      --sourcename.xml
      --sourcename.png //this is logo file
      --webForumConfiguration.xml(forum)
      --sourcename-url.xq & sourcename-thread.xq(forum))
      --finished.xml(forum)
      --sourcename.xq(blog)
      --config.xml(blog)
      --comment.txt
    --sourcename.xml sample:
    <meta>
      <name>istofsports.co.uk</name>
      <link>http://istofsports.co.uk</link>
      <gmt>3</gmt>
    </meta>
    """
    # Returns False when the arguments are invalid or the upload fails;
    # every other path returns None, as before.
    args = cmdline.strip().split()
    if len(args) != 4:
        print("Command not valid, see 'man commitjob' for more info.")
        return False
    email, source_id, pack_path = args[1:]
    package_name = os.path.basename(pack_path)

    print("Be patient, I hate to wait either!")
    try:
        # "with" closes the package file; it used to stay open.
        with open(pack_path, "rb") as package:
            payload = xmlrpclib.Binary(package.read())
        r, p = self._proxy.upload_file(package_name, payload)
        self._print_result(r, p)
    except Exception:
        # Any failure is still reported the same way, but Ctrl-C and
        # SystemExit are no longer swallowed as a bare "except:" did.
        print("Failed upload package to server, Check whether package is correct?")
        print("Needs commit again")
        return False

    # Ask with the email only, as assign_job does. The whole argument string
    # ("email id package") used to be sent here.
    is_prioritized, _ = self._proxy.is_prioritized(email)
    r, p = self._proxy.commit_job(email, source_id, package_name)
    if not r:
        self._print_result(r, p)
        return

    # reset so assignjob will give the correct message
    self._sourcelist_id = -1
    if is_prioritized:
        print("#" * 30)
        print("# The source you just committed is flagged as a high-priority.")
        print("# Please send the transformation to:")
        print("# \t'*****@*****.**'")
        print("# as SOON as possible.")
        print("#" * 30)
    self._print_result(r, p)
def transform(self, cmdline):
    """transform url and build output.xml
    Usage:transform [url] ([sourcename])"""
    args = mu.get_second_arg(cmdline).strip().split()
    # One or two arguments are valid. More than two used to leave url and
    # sourcename unbound and crash with a NameError further down.
    if len(args) == 1:
        url, = args
        sourcename = self.get_sourcename_from_url(url)
    elif len(args) == 2:
        url, sourcename = args
    else:
        print("Invalid command, transform [url] ([sourcename])")
        return

    sourcedir = mu.search_for_source(sourcename)
    if sourcedir is None:
        print("Directory of %s doesn't exist." % sourcename)
        return

    source_thread_xq = os.path.join(sourcedir, '%s-thread.xq' % sourcename)
    source_webForumConfiguration_xml = os.path.join(
        sourcedir, 'webForumConfiguration.xml')
    # The helper's result is printed as the name of a missing file.
    missing = mu.check_files_in_dir(
        ('%s-thread.xq' % sourcename, 'webForumConfiguration.xml'),
        sourcedir)
    if missing:
        print("File [%s] does not exists." % missing)
        return

    ingentia_url_xq = os.path.join(
        tm.INGENTIA_PATH, 'conf', 'transformation', '5.xq')
    ingentia_thread_xq = os.path.join(
        tm.INGENTIA_PATH, 'conf', 'transformation', '6.xq')
    ingentia_subSourceConfig_xml = os.path.join(
        tm.INGENTIA_PATH, 'conf', 'configuration', 'subSourceConfig.xml')
    ingentia_webForumConfiguration_xml = os.path.join(
        tm.INGENTIA_PATH, 'conf', 'configuration',
        'webForumConfiguration.xml')

    # Install the source's thread query and forum configuration.
    shutil.copy(source_thread_xq, ingentia_thread_xq)
    shutil.copy(source_webForumConfiguration_xml,
                ingentia_webForumConfiguration_xml)

    # The url query is generated: a forum with the requested url as its
    # only thread.
    # NOTE(review): url is inserted unescaped; characters such as '&' or
    # '{' may need escaping here - confirm with real thread urls.
    with codecs.open(ingentia_url_xq, 'w', 'utf-8') as url_file:
        url_file.writelines((
            'xquery version "1.0";\n',
            '<forum>\n',
            ' <threads>\n',
            ' <thread>%s</thread>\n' % url,
            ' </threads>\n',
            ' <pages/>\n',
            '</forum>\n',
        ))

    # Fixed placeholder sub-source configuration.
    with codecs.open(ingentia_subSourceConfig_xml, 'w',
                     'utf-8') as subSourceConfig_file:
        subSourceConfig_file.writelines((
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<no.integrasco.immensus.storage.domain.source.SubSource>\n',
            ' <subsourceid>1</subsourceid>\n',
            ' <name>transform</name>\n',
            ' <uri>http://www.google.com/</uri>\n',
            '</no.integrasco.immensus.storage.domain.source.SubSource>\n',
        ))

    # The working directory is changed for the whole process, as before,
    # and the jar is started from there.
    os.chdir(tm.INGENTIA_PATH)
    subprocess.call(["java -jar %s -c legacythread" % tm.INGENTIA_JAR],
                    shell=True)