def train(cls, model_filename, featuresets, classifier='naivebayes', options=[], quiet=True):
    """
    Train a Weka model on ``featuresets`` and return a classifier for it.

    The training data is written to ``train.arff`` in a fresh temporary
    directory, the chosen Weka class is run through ``java`` with
    ``-d model_filename -t <train file>`` followed by ``options``, and the
    temporary directory is emptied and removed before returning.

    :param model_filename: Value passed to Weka's ``-d`` flag and to the
        returned ``WekaClassifier``.
    :param featuresets: Training data handed to ``ARFF_Formatter``.
    :param classifier: A key of ``cls._CLASSIFIER_CLASS``, or one of its
        values (then used verbatim as the java class name).
    :param options: Extra command-line arguments appended to the command.
        The list is copied, never mutated.
    :param quiet: If true, the subprocess stdout is piped rather than
        inherited.
    :return: ``WekaClassifier(formatter, model_filename)``.
    :raise ValueError: If ``classifier`` is neither a known key nor value.
    """
    # Make sure we can find java & weka.
    config_weka()

    formatter = ARFF_Formatter.from_train(featuresets)

    temp_dir = tempfile.mkdtemp()
    try:
        train_filename = os.path.join(temp_dir, 'train.arff')
        formatter.write(train_filename, featuresets)

        # Resolve the java class: short name first, then full class name.
        known_classes = cls._CLASSIFIER_CLASS
        if classifier in known_classes:
            javaclass = known_classes[classifier]
        elif classifier in known_classes.values():
            javaclass = classifier
        else:
            raise ValueError('Unknown classifier %s' % classifier)

        cmd = [javaclass, '-d', model_filename, '-t', train_filename] + list(options)
        java(cmd, classpath=_weka_classpath,
             stdout=subprocess.PIPE if quiet else None)

        return WekaClassifier(formatter, model_filename)
    finally:
        # Remove every file written to the scratch directory, then the
        # directory itself.
        for leftover in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, leftover))
        os.rmdir(temp_dir)
def train(model_filename, featuresets, quiet=True):
    """
    Train a Weka NaiveBayes model on ``featuresets``.

    The data is written to ``train.arff`` in a temporary directory,
    ``weka.classifiers.bayes.NaiveBayes`` is run through ``java`` with
    ``-d model_filename -t <train file>``, and the temporary directory is
    removed before returning.

    :param model_filename: Value passed to Weka's ``-d`` flag and to the
        returned ``WekaClassifier``.
    :param featuresets: Training data handed to ``ARFF_Formatter``.
    :param quiet: If true, the subprocess stdout is piped rather than
        inherited.
    :return: ``WekaClassifier(formatter, model_filename)``.
    """
    # Make sure we can find java & weka.
    config_weka()

    formatter = ARFF_Formatter.from_train(featuresets)

    scratch_dir = tempfile.mkdtemp()
    try:
        train_filename = os.path.join(scratch_dir, 'train.arff')
        formatter.write(train_filename, featuresets)

        cmd = ['weka.classifiers.bayes.NaiveBayes',
               '-d', model_filename,
               '-t', train_filename]
        java(cmd, classpath=_weka_classpath,
             stdout=subprocess.PIPE if quiet else None)

        return WekaClassifier(formatter, model_filename)
    finally:
        for entry in os.listdir(scratch_dir):
            os.remove(os.path.join(scratch_dir, entry))
        os.rmdir(scratch_dir)
def _batch_classify(self, featuresets, options):
    """
    Classify ``featuresets`` with the stored Weka model.

    The feature sets are written to ``test.arff`` in a temporary
    directory, Weka is run through ``java`` with
    ``-l self._model -T <test file>`` plus ``options``, and its standard
    output is split on newlines and handed to ``parse_weka_output``.
    The temporary directory is removed before returning.

    :param featuresets: Data written by ``self._formatter``.
    :param options: List of extra command-line arguments (concatenated to
        the command list).
    :return: Whatever ``self.parse_weka_output`` returns.
    :raise ValueError: If Weka wrote to stderr and produced no stdout.
    """
    # Make sure we can find java & weka.
    config_weka()

    temp_dir = tempfile.mkdtemp()
    try:
        test_filename = os.path.join(temp_dir, 'test.arff')
        self._formatter.write(test_filename, featuresets)

        cmd = ['weka.classifiers.bayes.NaiveBayes',
               '-l', self._model,
               '-T', test_filename] + options
        stdout, stderr = java(cmd, classpath=_weka_classpath,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

        # Only treat stderr as fatal when there is nothing to parse.
        if stderr and not stdout:
            if 'Illegal options: -distribution' in stderr:
                # Message typo fixed ("verison" -> "version").
                raise ValueError('The installed version of weka does '
                                 'not support probability distribution '
                                 'output.')
            raise ValueError('Weka failed to generate output:\n%s' % stderr)

        return self.parse_weka_output(stdout.split('\n'))
    finally:
        for f in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, f))
        os.rmdir(temp_dir)
def _batch_classify(self, featuresets, options):
    """
    Classify ``featuresets`` with the stored Weka model.

    Writes the feature sets to ``test.arff`` in a temporary directory,
    runs Weka NaiveBayes with ``-l self._model -T <test file>`` plus
    ``options``, and parses the newline-split standard output with
    ``parse_weka_output``.  The temporary directory is always removed.

    :param featuresets: Data written by ``self._formatter``.
    :param options: List of extra command-line arguments.
    :return: Whatever ``self.parse_weka_output`` returns.
    """
    # Make sure we can find java & weka.
    config_weka()

    work_dir = tempfile.mkdtemp()
    try:
        test_filename = os.path.join(work_dir, 'test.arff')
        self._formatter.write(test_filename, featuresets)

        base_cmd = ['weka.classifiers.bayes.NaiveBayes',
                    '-l', self._model,
                    '-T', test_filename]
        # The second element of java()'s result is not used here.
        weka_out, _ = java(base_cmd + options,
                           classpath=_weka_classpath,
                           stdout=subprocess.PIPE)

        return self.parse_weka_output(weka_out.split('\n'))
    finally:
        for name in os.listdir(work_dir):
            os.remove(os.path.join(work_dir, name))
        os.rmdir(work_dir)
def weka_classify(arff_file, model_file):
    """
    Run a saved Weka RandomForest model over ``arff_file``.

    The value passed to Weka's ``-p`` flag is the index of the first line
    of ``arff_file`` containing ``"@attribute class"`` (1 if none does).
    Everything after the ``"prediction ()"`` marker in Weka's output is
    parsed line by line into a dictionary.

    :param arff_file: Path of the ARFF file (read as UTF-8).
    :param model_file: Path of the model; an empty string short-circuits.
    :return: ``None`` if ``model_file`` is ``""`` or the marker is missing
        from the output; otherwise a dict mapping the first column of each
        output line to ``[label, float(last column)]`` where ``label`` is
        the part after ``":"`` in the third column.  The first occurrence
        of a key wins.
    :raise OSError: If java wrote anything to stderr.
    """
    if model_file == "":
        return None

    with open(arff_file, mode="r", encoding="utf-8") as arff:
        arff_lines = arff.readlines()
    # Index of the first line declaring the class attribute; default 1.
    class_index = next(
        (idx for idx, text in enumerate(arff_lines) if "@attribute class" in text),
        1,
    )

    cmd = [
        "weka.classifiers.trees.RandomForest",
        "-p", str(class_index),
        "-l", str(model_file),
        "-T", str(arff_file),
    ]
    stdout, stderr = java(cmd, classpath=weka_class_path,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    err_msg = stderr.decode("GBK")
    if err_msg != "":
        raise OSError('Java command failed : ' + str(err_msg))

    report = stdout.decode(stdin.encoding)
    marker = "prediction ()"
    if marker not in report:
        return None

    # Keep only the text following the first marker occurrence.
    table = report.partition(marker)[2].strip()

    final_result = {}
    for row in table.split("\n"):
        columns = re.split(" +", row.strip())
        instance_id = columns[0]
        if instance_id in final_result:
            continue
        predicted_label = columns[2].split(":")[1]
        final_result[instance_id] = [predicted_label, float(columns[-1])]
    return final_result
def tag(self, text, options=['-mx2g']):
    """
    Tag ``text`` with the Stanford MaxentTagger and return the parsed rows.

    ``text`` is written (UTF-8) to a temporary file that is passed to the
    tagger via ``-textFile``.  The tagger is asked for lemmatized XML
    output; every output line fully matching ``self._xml_regex``
    contributes one tuple.

    The temporary file is created with ``delete=False`` and was previously
    never removed; it is now unlinked once java has finished (or failed).

    :param text: Text to tag.
    :param options: Extra command-line arguments appended to the command.
        The default list is not mutated.
    :return: List of ``(group 3, group 2, group 1)`` tuples of the regex
        matches, in output order.
    """
    import os  # local import: needed for the temp-file cleanup below

    command = ['edu.stanford.nlp.tagger.maxent.MaxentTagger']
    command.extend(['-model', self._model])
    command.extend(['-outputFormat', 'xml'])
    command.extend(['-outputFormatOptions', 'lemmatize'])
    command.extend(options)

    # Leaving the ``with`` block closes (and thereby flushes) the file
    # before java reads it.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as text_file:
        text_file.write(text.encode('utf-8'))
    try:
        command.extend(['-textFile', text_file.name])
        stderr = subprocess.DEVNULL if not self._verbose else None
        stdout, _ = java(command, classpath=self._libs,
                         stderr=stderr, stdout=subprocess.PIPE)
    finally:
        os.unlink(text_file.name)

    output = stdout.decode('utf-8')
    tagged = []
    for line in output.splitlines():
        match = self._xml_regex.fullmatch(line)
        if match:
            tagged.append((match.group(3), match.group(2), match.group(1)))
    return tagged
def _execute(self, cmd, input_, verbose=False):
    """
    Run ``cmd`` through java on ``input_`` and return the decoded stdout.

    ``input_`` is written to a temporary file whose name is appended to
    ``cmd``.  Note that ``cmd`` is extended in place (``-encoding`` and the
    file name), exactly as before.

    The global java options are switched to ``self.java_options`` for the
    call.  They are now restored, and the temporary file removed, even
    when ``java`` raises; previously both leaked on failure.

    :param cmd: Command list; mutated in place.
    :param input_: Text (encoded with ``self._encoding``) or bytes.
    :param verbose: Forwarded to ``config_java``.
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])

    # Remember the current global options so they can be put back.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=verbose)
    try:
        # Windows is incompatible with NamedTemporaryFile() without
        # passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            with input_file:
                if isinstance(input_, compat.text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                cmd.append(input_file.name)
                stdout, _stderr = java(
                    cmd,
                    classpath=(self._stanford_jar, self._model_jar),
                    stdout=PIPE, stderr=PIPE)
        finally:
            # The file is closed by now, so it can be removed everywhere.
            os.unlink(input_file.name)
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)
def batch_tag(self, sentences):
    """
    Tag ``sentences`` with the Stanford tagger and return the parsed output.

    The sentences (one per line, tokens joined by spaces) are written to a
    temporary file whose path is stored in ``self._input_file_path``, the
    tagger is run through ``java``, and its output is passed to
    ``parse_output``.

    Fixes relative to the previous version:

    * ``-encoding`` is added to a *copy* of ``self._cmd``.  The old code
      extended ``self._cmd`` in place: with a stored list the flag piled
      up on every call; if ``_cmd`` is computed on access (not visible
      here) the flag never reached java at all.
    * The temporary file is removed and the global java options are
      restored even when ``java`` raises.

    :param sentences: Iterable of token sequences.
    :return: Whatever ``self.parse_output`` returns.
    """
    encoding = self._encoding
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=False)
    try:
        # Create a temporary input file.
        _input_fd, self._input_file_path = tempfile.mkstemp(text=True)
        try:
            # Snapshot the command only after the input path is set.
            cmd = list(self._cmd)
            if encoding:
                cmd.extend(['-encoding', encoding])

            # NOTE(review): text mode 'w' is kept from the original; under
            # Python 3 writing the encoded bytes would need 'wb' - confirm
            # which interpreter this variant targets.
            with os.fdopen(_input_fd, 'w') as _input_fh:
                _input = '\n'.join(' '.join(x) for x in sentences)
                if isinstance(_input, compat.text_type) and encoding:
                    _input = _input.encode(encoding)
                _input_fh.write(_input)

            # Run the tagger and get the output.
            stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                           stdout=PIPE, stderr=PIPE)
        finally:
            # Delete the temporary file.
            os.unlink(self._input_file_path)

        if encoding:
            stanpos_output = stanpos_output.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output)
def batch_tag(self, sentences):
    """
    Tag ``sentences`` with the Stanford tagger and return the parsed output.

    The sentences (one per line, tokens joined by spaces) are written to a
    temporary file whose path is stored in ``self._input_file_path``, the
    tagger is run through ``java``, and its output is passed to
    ``parse_output``.

    Fixes relative to the previous version:

    * ``-encoding`` is added to a *copy* of ``self._cmd``.  The old code
      extended ``self._cmd`` in place: with a stored list the flag piled
      up on every call; if ``_cmd`` is computed on access (not visible
      here) the flag never reached java at all.
    * The temporary file is removed and the global java options are
      restored even when ``java`` raises.

    This variant relies on the Python 2 ``unicode`` builtin.

    :param sentences: Iterable of token sequences.
    :return: Whatever ``self.parse_output`` returns.
    """
    encoding = self._encoding
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=False)
    try:
        # Create a temporary input file.
        _input_fd, self._input_file_path = tempfile.mkstemp(text=True)
        try:
            # Snapshot the command only after the input path is set.
            cmd = list(self._cmd)
            if encoding:
                cmd.extend(['-encoding', encoding])

            # Write the actual sentences to the temporary input file.
            with os.fdopen(_input_fd, 'w') as _input_fh:
                _input = '\n'.join(' '.join(x) for x in sentences)
                if isinstance(_input, unicode) and encoding:
                    _input = _input.encode(encoding)
                _input_fh.write(_input)

            # Run the tagger and get the output.
            stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                           stdout=PIPE, stderr=PIPE)
        finally:
            # Delete the temporary file.
            os.unlink(self._input_file_path)

        if encoding:
            stanpos_output = stanpos_output.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output)
def tag_sents(self, sentences):
    """
    Tag ``sentences`` with the Stanford tagger and return the parsed output.

    The sentences (one per line, tokens joined by spaces) are written to a
    temporary file whose path is stored in ``self._input_file_path``; a
    copy of ``self._cmd`` extended with ``-encoding`` is run through
    ``java`` and the decoded output is passed to ``parse_output`` together
    with ``sentences``.

    The temporary file is now removed and the global java options restored
    even when ``java`` (or the write) raises; previously both leaked on
    failure.

    :param sentences: Iterable of token sequences.
    :return: Whatever ``self.parse_output`` returns.
    """
    encoding = self._encoding
    default_options = " ".join(_java_options)
    config_java(options=self.java_options, verbose=False)
    try:
        # Create a temporary input file.
        _input_fd, self._input_file_path = tempfile.mkstemp(text=True)
        try:
            # Copy the command only after the input path is set.
            cmd = list(self._cmd)
            cmd.extend(["-encoding", encoding])

            # Write the actual sentences to the temporary input file.
            with os.fdopen(_input_fd, "wb") as _input_fh:
                _input = "\n".join(" ".join(x) for x in sentences)
                if isinstance(_input, str) and encoding:
                    _input = _input.encode(encoding)
                _input_fh.write(_input)

            # Run the tagger and get the output.
            stanpos_output, _stderr = java(
                cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
        finally:
            # Delete the temporary file.
            os.unlink(self._input_file_path)

        stanpos_output = stanpos_output.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output, sentences)
def _batch_classify(self, featuresets, options):
    """
    Classify ``featuresets`` with the stored Weka model.

    The feature sets are written to ``test.arff`` in a temporary
    directory, Weka NaiveBayes is run through ``java`` with
    ``-l self._model -T <test file>`` plus ``options``, and the decoded,
    newline-split standard output is handed to ``parse_weka_output``.
    The temporary directory is always removed.

    Fix: stdout is decoded below, so java's output is bytes.  The old
    error check tested a ``str`` for membership in the bytes stderr, which
    raises ``TypeError`` instead of the intended ``ValueError``.  stderr
    is now decoded first.

    :param featuresets: Data written by ``self._formatter``.
    :param options: List of extra command-line arguments.
    :return: Whatever ``self.parse_weka_output`` returns.
    :raise ValueError: If Weka wrote to stderr and produced no stdout.
    """
    # Make sure we can find java & weka.
    config_weka()

    temp_dir = tempfile.mkdtemp()
    try:
        test_filename = os.path.join(temp_dir, "test.arff")
        self._formatter.write(test_filename, featuresets)

        cmd = ["weka.classifiers.bayes.NaiveBayes",
               "-l", self._model,
               "-T", test_filename] + options
        stdout, stderr = java(cmd, classpath=_weka_classpath,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

        # Only treat stderr as fatal when there is nothing to parse.
        if stderr and not stdout:
            if isinstance(stderr, bytes):
                # Same codec as used for stdout below.
                stderr = stderr.decode(stdin.encoding, errors="replace")
            if "Illegal options: -distribution" in stderr:
                raise ValueError(
                    "The installed version of weka does "
                    "not support probability distribution "
                    "output."
                )
            raise ValueError("Weka failed to generate output:\n%s" % stderr)

        return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))
    finally:
        for f in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, f))
        os.rmdir(temp_dir)
def _execute(self, cmd, input_, verbose=False):
    """
    Run ``cmd`` through java on ``input_`` and return the decoded stdout.

    ``cmd`` is extended in place with ``-charset``, optionally
    ``-options self._options_cmd``, and the name of a temporary file that
    holds ``input_``.

    The global java options are switched to ``self.java_options`` for the
    call.  They are now restored, and the temporary file removed, even
    when ``java`` raises; previously both leaked on failure.

    :param cmd: Command list; mutated in place.
    :param input_: Text (encoded with ``self._encoding``) or bytes.
    :param verbose: Forwarded to ``config_java``.
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    cmd.extend(['-charset', encoding])
    _options_cmd = self._options_cmd
    if _options_cmd:
        cmd.extend(['-options', self._options_cmd])

    # Remember the current global options so they can be put back.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=verbose)
    try:
        # Windows is incompatible with NamedTemporaryFile() without
        # passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            with input_file:
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                cmd.append(input_file.name)
                stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                                       stdout=PIPE, stderr=PIPE)
        finally:
            # The file is closed by now, so it can be removed everywhere.
            os.unlink(input_file.name)
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)
def _execute(self, cmd, input_, verbose=False):
    """
    Run ``cmd`` through java on ``input_`` and return the decoded stdout.

    ``input_`` is written to a temporary file.  When ``self._USE_STDIN``
    is true the file is rewound and passed as the process's stdin;
    otherwise its name is appended to ``cmd``.  ``cmd`` is extended in
    place (``-encoding``, ``self.corenlp_options`` if set, and possibly
    the file name).

    Two byte sequences (``b'\\xc2\\xa0'`` and ``b'\\x00\\xa0'``) are replaced
    by a plain space before decoding.

    The global java options are now restored, and the temporary file
    removed, even when ``java`` raises; previously both leaked on failure.

    :param cmd: Command list; mutated in place.
    :param input_: Text (encoded with ``self._encoding``) or bytes.
    :param verbose: Forwarded to ``config_java``.
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])
    if self.corenlp_options:
        cmd.append(self.corenlp_options)

    # Remember the current global options so they can be put back.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=verbose)
    try:
        # Windows is incompatible with NamedTemporaryFile() without
        # passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            with input_file:
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                # Run the tagger and get the output.
                if self._USE_STDIN:
                    input_file.seek(0)
                    stdout, _stderr = java(
                        cmd,
                        classpath=self._classpath,
                        stdin=input_file,
                        stdout=PIPE,
                        stderr=PIPE,
                    )
                else:
                    cmd.append(input_file.name)
                    stdout, _stderr = java(cmd, classpath=self._classpath,
                                           stdout=PIPE, stderr=PIPE)
        finally:
            # The file is closed by now, so it can be removed everywhere.
            os.unlink(input_file.name)

        stdout = stdout.replace(b'\xc2\xa0', b' ')
        stdout = stdout.replace(b'\x00\xa0', b' ')
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)
def start(self):
    """
    Launch the StanfordCoreNLPServer process and wait until it answers.

    The server is started non-blocking through ``java`` with stdout and
    stderr set to ``'pipe'``, and the handle is stored in ``self.popen``.
    The global java options are switched to ``self.java_options`` for the
    launch and restored afterwards.  The ``live`` endpoint is then polled
    up to 30 times and the ``ready`` endpoint up to 60 times, sleeping one
    second after each refused connection.

    :raise CoreNLPServerError: If the process has already exited, or an
        endpoint never returned an OK response.
    """
    cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    def _await_endpoint(endpoint, attempts, failure_message):
        # Poll ``endpoint`` until it answers OK or attempts run out.
        for _ in range(attempts):
            try:
                response = requests.get(
                    requests.compat.urljoin(self.url, endpoint))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    return
        raise CoreNLPServerError(failure_message)

    # Configure java.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)
    try:
        # TODO: it's probably a bad idea to pipe stdout, as it will
        # accumulate when lots of text is being parsed.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout='pipe',
            stderr='pipe',
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        message = ('Could not start the server. '
                   'The error was: {}'.format(stderrdata.decode('ascii')))
        raise CoreNLPServerError(returncode, message)

    _await_endpoint('live', 30, 'Could not connect to the server.')
    _await_endpoint('ready', 60, 'The server is not ready.')
def start(self, stdout="devnull", stderr="devnull"):
    """
    Starts the CoreNLP server and waits until it answers.

    The server is started non-blocking through ``java`` and the handle is
    stored in ``self.popen``.  The global java options are switched to
    ``self.java_options`` for the launch and restored afterwards.  The
    ``live`` endpoint is then polled up to 30 times and the ``ready``
    endpoint up to 60 times, sleeping one second after each refused
    connection.

    Fix: when stderr is not piped, ``communicate()`` yields no stderr data
    (``None``).  The old code then failed with ``AttributeError`` on
    ``.decode`` instead of raising ``CoreNLPServerError``.  Undecodable
    bytes no longer mask the error either.

    :param stdout, stderr: Specifies where CoreNLP output is redirected.
        Valid values are 'devnull', 'stdout', 'pipe'
    :raise CoreNLPServerError: If the process has already exited, or an
        endpoint never returned an OK response.
    """
    import requests

    cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    def _await_endpoint(endpoint, attempts, failure_message):
        # Poll ``endpoint`` until it answers OK or attempts run out.
        for _ in range(attempts):
            try:
                response = requests.get(
                    requests.compat.urljoin(self.url, endpoint))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    return
        raise CoreNLPServerError(failure_message)

    # Configure java.
    default_options = " ".join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)
    try:
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout=stdout,
            stderr=stderr,
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        if stderrdata is None:
            detail = "(stderr was not captured)"
        else:
            detail = stderrdata.decode("ascii", "replace")
        raise CoreNLPServerError(
            returncode,
            "Could not start the server. "
            "The error was: {}".format(detail),
        )

    _await_endpoint("live", 30, "Could not connect to the server.")
    _await_endpoint("ready", 60, "The server is not ready.")
def _execute(self, cmd, input_, verbose=False):
    """
    Run ``cmd`` through java on ``input_`` and return the decoded stdout.

    ``input_`` is written to a temporary file.  When ``self._USE_STDIN``
    is true the file is rewound and passed as the process's stdin;
    otherwise its name is appended to ``cmd``.  ``cmd`` is extended in
    place (``-encoding``, ``self.corenlp_options`` if set, and possibly
    the file name).

    Two byte sequences (``b'\\xc2\\xa0'`` and ``b'\\x00\\xa0'``) are replaced
    by a plain space before decoding.

    The global java options are now restored, and the temporary file
    removed, even when ``java`` raises; previously both leaked on failure.

    :param cmd: Command list; mutated in place.
    :param input_: Text (encoded with ``self._encoding``) or bytes.
    :param verbose: Forwarded to ``config_java``.
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    cmd.extend(['-encoding', encoding])
    if self.corenlp_options:
        cmd.append(self.corenlp_options)

    # Remember the current global options so they can be put back.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=verbose)
    try:
        # Windows is incompatible with NamedTemporaryFile() without
        # passing in delete=False.
        input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        try:
            with input_file:
                if isinstance(input_, text_type) and encoding:
                    input_ = input_.encode(encoding)
                input_file.write(input_)
                input_file.flush()

                # Run the tagger and get the output.
                java_kwargs = {
                    'classpath': self._classpath,
                    'stdout': PIPE,
                    'stderr': PIPE,
                }
                if self._USE_STDIN:
                    input_file.seek(0)
                    java_kwargs['stdin'] = input_file
                else:
                    cmd.append(input_file.name)
                stdout, _stderr = java(cmd, **java_kwargs)
        finally:
            # The file is closed by now, so it can be removed everywhere.
            os.unlink(input_file.name)

        stdout = stdout.replace(b'\xc2\xa0', b' ')
        stdout = stdout.replace(b'\x00\xa0', b' ')
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)
def train(self, file=''):
    """
    Run the training command through java and echo its stderr.

    The command is ``self.command_line`` followed by
    ``-prop self.prop_file``, split with ``shlex``.

    The Python 2 only ``print`` statements were replaced by the call form
    with a single argument, which prints the same text on Python 2 and is
    valid Python 3.

    :param file: Falls back to ``self.train_file`` when empty.
        NOTE(review): the value is not used in the command built here -
        confirm whether the properties file is expected to name it.
    :return: Tuple ``(sout, serr)`` as produced by ``java``.
    """
    if not file:
        file = self.train_file
    command = self.command_line + ' -prop ' + self.prop_file
    cmd = shlex.split(command)
    sout, serr = java(cmd, classpath=self.path_to_jar, stdout=PIPE, stderr=PIPE)
    print('--------------------TRAIN------------------------')
    print(serr)
    return sout, serr
def verify(self, file=''):
    """
    Run the stored classifier over a test file and echo java's stderr.

    The command is ``self.command_line`` followed by
    ``-loadClassifier self.model_filename -testFile <file>``, split with
    ``shlex``.

    The Python 2 only ``print`` statements were replaced by the call form
    with a single argument, which prints the same text on Python 2 and is
    valid Python 3.

    :param file: Test file path; falls back to ``self.test_file`` when
        empty.
    :return: Tuple ``(sout, serr)`` as produced by ``java``.
    """
    if not file:
        file = self.test_file
    command = (self.command_line
               + ' -loadClassifier ' + self.model_filename
               + ' -testFile ' + file)
    cmd = shlex.split(command)
    sout, serr = java(cmd, classpath=self.path_to_jar, stdout=PIPE, stderr=PIPE)
    print('--------------------TEST------------------------')
    print(serr)
    return sout, serr
def train(
    cls,
    model_filename,
    featuresets,
    classifier='naivebayes',
    options=[],
    quiet=True,
):
    """
    Train a Weka model on ``featuresets`` and return a classifier for it.

    The data is written to ``train.arff`` in a temporary directory, the
    selected Weka class is run through ``java`` with
    ``-d model_filename -t <train file>`` followed by ``options``, and the
    temporary directory is removed before returning.

    :param model_filename: Value passed to Weka's ``-d`` flag and to the
        returned ``WekaClassifier``.
    :param featuresets: Training data handed to ``ARFF_Formatter``.
    :param classifier: A key of ``cls._CLASSIFIER_CLASS``, or one of its
        values (then used verbatim as the java class name).
    :param options: Extra command-line arguments; copied, never mutated.
    :param quiet: If true, the subprocess stdout is piped rather than
        inherited.
    :return: ``WekaClassifier(formatter, model_filename)``.
    :raise ValueError: If ``classifier`` is neither a known key nor value.
    """
    # Make sure we can find java & weka.
    config_weka()

    formatter = ARFF_Formatter.from_train(featuresets)

    scratch = tempfile.mkdtemp()
    try:
        train_filename = os.path.join(scratch, 'train.arff')
        formatter.write(train_filename, featuresets)

        # Short names take precedence over full java class names.
        if classifier in cls._CLASSIFIER_CLASS:
            javaclass = cls._CLASSIFIER_CLASS[classifier]
        elif classifier in cls._CLASSIFIER_CLASS.values():
            javaclass = classifier
        else:
            raise ValueError('Unknown classifier %s' % classifier)

        cmd = [javaclass, '-d', model_filename, '-t', train_filename]
        cmd.extend(list(options))
        out_target = subprocess.PIPE if quiet else None
        java(cmd, classpath=_weka_classpath, stdout=out_target)

        return WekaClassifier(formatter, model_filename)
    finally:
        for item in os.listdir(scratch):
            os.remove(os.path.join(scratch, item))
        os.rmdir(scratch)
def detokenize(self, text, options=['-mx2g']):
    """
    Pipe ``text`` through ``PTBTokenizer -untok`` and return the output.

    The process is started non-blocking through ``java``; ``text`` is sent
    UTF-8 encoded on its stdin and its stdout is decoded as UTF-8.

    :param text: Text to send to the process.
    :param options: Extra command-line arguments; the default list is not
        mutated.
    :return: The decoded standard output of the process.
    """
    command = ['edu.stanford.nlp.process.PTBTokenizer', '-untok'] + list(options)

    # stderr is discarded unless verbose output was requested.
    if self._verbose:
        err_target = None
    else:
        err_target = subprocess.DEVNULL

    process = java(
        command,
        classpath=self._libs,
        blocking=False,
        stderr=err_target,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
    )
    raw_output, _ = process.communicate(text.encode('utf-8'))
    return raw_output.decode('utf-8')
def call_mxpost(classpath=None, stdin=None, stdout=None, stderr=None, blocking=False):
    """
    Run the MXPOST ``tagger.TestTagger`` class through ``java``.

    When no classpath is given, ``config_mxpost()`` is called and the
    module-level ``_mxpost_classpath`` is used.  When a classpath is given
    that does not mention ``mxpost.jar``, ``_mxpost_classpath`` is appended
    to it.

    Fix: the separator used when appending was a hard-coded ``':'``; it is
    now ``os.pathsep``.  This is identical on POSIX and yields a valid
    java classpath on Windows.

    NOTE(review): when a classpath is supplied, ``config_mxpost()`` is not
    called here, so ``_mxpost_classpath`` must already be configured -
    confirm against callers.

    :param classpath: Optional classpath string.
    :param stdin, stdout, stderr, blocking: Forwarded to ``java``.
    :return: Whatever ``java`` returns.
    """
    import os  # local import: only needed for the path separator

    if not classpath:
        config_mxpost()
        classpath = _mxpost_classpath
    elif 'mxpost.jar' not in classpath:
        classpath += '%s%s' % (os.pathsep, _mxpost_classpath)

    cmd = ['tagger.TestTagger', '%s/%s' % (_mxpost_home, 'wsj-02-21.mxpost')]
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
def _execute(self, java_class, infile, outfile):
    """
    Run ``java_class`` from inside ``self._mate_root``.

    The command is ``java_class -model self._model -test infile -out
    outfile``; the working directory is changed to ``self._mate_root`` for
    the call and restored afterwards.

    Fixes: the directory change was issued twice, which fails (or lands in
    a nested directory) when ``self._mate_root`` is a relative path.  The
    previous working directory is now restored even when ``java`` raises.

    :param java_class: Fully qualified java class to run.
    :param infile: Value for the ``-test`` flag.
    :param outfile: Value for the ``-out`` flag.
    :return: Tuple ``(stdout, stderr)`` as produced by ``java``.
    """
    cwd = os.getcwd()
    os.chdir(self._mate_root)
    try:
        cmd = [
            java_class,
            "-model", self._model,
            "-test", infile,
            "-out", outfile,
        ]
        stdout, stderr = java(cmd, classpath=self._classpath,
                              stdout=PIPE, stderr=PIPE)
    finally:
        os.chdir(cwd)
    return stdout, stderr
def _execute(self, cmd, verbose=False):
    """
    Run ``cmd`` through java and return its decoded standard output.

    ``cmd`` is extended in place with ``-options self._options_cmd`` when
    that attribute is set.  The ``-inputEncoding`` flag is not passed; it
    was commented out in the original.

    The global java options are switched to ``self.java_options`` for the
    call and are now restored even when ``java`` or the decode raises.

    :param cmd: Command list; may be mutated in place.
    :param verbose: Forwarded to ``config_java`` (both when setting and
        when restoring the options).
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    _options_cmd = self._options_cmd
    if _options_cmd:
        cmd.extend(['-options', self._options_cmd])

    default_options = ' '.join(_java_options)

    # Configure java.
    config_java(options=self.java_options, verbose=verbose)
    try:
        stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                               stdout=PIPE, stderr=PIPE)
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=verbose)
def tag(self, infile, outfile):
    """
    Run ``is2.tag.Tagger`` from inside ``self._mate_root``.

    The command is ``is2.tag.Tagger -model self._model -test infile -out
    outfile``; the working directory is changed to ``self._mate_root`` for
    the call and restored afterwards.

    Fixes: the directory change was issued twice, which fails (or lands in
    a nested directory) when ``self._mate_root`` is a relative path.  The
    previous working directory is now restored even when ``java`` raises.

    :param infile: Value for the ``-test`` flag.
    :param outfile: Value for the ``-out`` flag.
    :return: Tuple ``(stdout, stderr)`` as produced by ``java``.
    """
    cwd = os.getcwd()
    os.chdir(self._mate_root)
    try:
        cmd = [
            "is2.tag.Tagger",
            "-model", self._model,
            "-test", infile,
            "-out", outfile,
        ]
        stdout, stderr = java(cmd, classpath=self._classpath,
                              stdout=PIPE, stderr=PIPE)
    finally:
        os.chdir(cwd)
    return stdout, stderr
def start(self, stdout='devnull', stderr='devnull'):
    """
    Starts the server process and waits until its port accepts connections.

    The class launched here is
    ``edu.stanford.nlp.parser.server.LexicalizedParserServer``.  It is
    started non-blocking through ``java`` and the handle is stored in
    ``self.popen``.  After the launch the global java options are reset to
    an empty string.  A TCP connection to ``(self.host, self.port)`` is
    attempted up to five times, sleeping one second after each refusal.

    Fix: when stderr is not piped, ``communicate()`` yields no stderr data
    (``None``).  The old code then failed with ``AttributeError`` on
    ``.decode`` instead of raising ``CoreNLPServerError``.  Undecodable
    bytes no longer mask the error either.

    :param stdout, stderr: Specifies where the server output is
        redirected.  Valid values are 'devnull', 'stdout', 'pipe'
    :raise CoreNLPServerError: If the process has already exited or the
        port never accepted a connection.
    """
    cmd = ['edu.stanford.nlp.parser.server.LexicalizedParserServer']
    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.  The previous options are deliberately not captured
    # here; the original resets to the empty string.
    default_options = ''
    config_java(options=self.java_options, verbose=self.verbose)
    try:
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout=stdout,
            stderr=stderr,
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        if stderrdata is None:
            detail = '(stderr was not captured)'
        else:
            detail = stderrdata.decode('ascii', 'replace')
        raise CoreNLPServerError(
            returncode,
            'Could not start the server. '
            'The error was: {}'.format(detail),
        )

    for _ in range(5):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect((self.host, self.port))
        except ConnectionRefusedError:
            time.sleep(1)
        else:
            break
    else:
        raise CoreNLPServerError('Could not connect to the server.')
def call_mallet(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
    """
    Run ``cmd`` through `nltk.internals.java` with Mallet on the classpath.

    If the module-level ``_mallet_classpath`` has not been set yet,
    ``config_mallet()`` is called first.  A caller-supplied ``classpath``
    is extended with ``_mallet_classpath`` (joined by ``os.path.pathsep``);
    without one, ``_mallet_classpath`` is used alone.

    See `nltk.internals.java` for parameter and return value descriptions.
    """
    if _mallet_classpath is None:
        config_mallet()

    # Set up the classpath.
    if classpath is not None:
        classpath += os.path.pathsep + _mallet_classpath
    else:
        classpath = _mallet_classpath

    # Delegate to java().
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
def _execute(self, cmd, verbose=False):
    """
    Run ``cmd`` through java and return its decoded standard output.

    ``cmd`` is extended in place with ``-inputEncoding`` and, when
    ``self._options_cmd`` is set, ``-options self._options_cmd``.

    The global java options are switched to ``self.java_options`` for the
    call and are now restored even when ``java`` or the decode raises.

    :param cmd: Command list; mutated in place.
    :param verbose: Forwarded to ``config_java`` when setting the options.
    :return: Standard output decoded with ``self._encoding``.
    """
    encoding = self._encoding
    cmd.extend(['-inputEncoding', encoding])
    _options_cmd = self._options_cmd
    if _options_cmd:
        cmd.extend(['-options', self._options_cmd])

    default_options = ' '.join(_java_options)

    # Configure java.
    config_java(options=self.java_options, verbose=verbose)
    try:
        stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                               stdout=PIPE, stderr=PIPE)
        return stdout.decode(encoding)
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)
def _classify_many(self, featuresets, options):
    """
    Classify ``featuresets`` with the stored Weka model.

    The feature sets are written to ``test.arff`` in a temporary
    directory, Weka NaiveBayes is run through ``java`` with
    ``-l self._model -T <test file>`` plus ``options``, and the decoded,
    newline-split standard output is handed to ``parse_weka_output``.
    The temporary directory is always removed.

    Fix: stdout is decoded below, so java's output is bytes.  The old
    error check tested a ``str`` for membership in the bytes stderr, which
    raises ``TypeError`` instead of the intended ``ValueError``.  stderr
    is now decoded first.

    :param featuresets: Data written by ``self._formatter``.
    :param options: List of extra command-line arguments.
    :return: Whatever ``self.parse_weka_output`` returns.
    :raise ValueError: If Weka wrote to stderr and produced no stdout.
    """
    # Make sure we can find java & weka.
    config_weka()

    temp_dir = tempfile.mkdtemp()
    try:
        test_filename = os.path.join(temp_dir, "test.arff")
        self._formatter.write(test_filename, featuresets)

        cmd = [
            "weka.classifiers.bayes.NaiveBayes",
            "-l",
            self._model,
            "-T",
            test_filename,
        ] + options
        stdout, stderr = java(
            cmd,
            classpath=_weka_classpath,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # Only treat stderr as fatal when there is nothing to parse.
        if stderr and not stdout:
            if isinstance(stderr, bytes):
                # Same codec as used for stdout below.
                stderr = stderr.decode(stdin.encoding, errors="replace")
            if "Illegal options: -distribution" in stderr:
                raise ValueError("The installed version of weka does "
                                 "not support probability distribution "
                                 "output.")
            raise ValueError("Weka failed to generate output:\n%s" % stderr)

        return self.parse_weka_output(
            stdout.decode(stdin.encoding).split("\n"))
    finally:
        for f in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, f))
        os.rmdir(temp_dir)
def _classify_using_weka(self, test_comments, feature_extractor):
    """
    Run Weka on ``test_comments`` and return the parsed predictions.

    Features are extracted lazily with ``feature_extractor.extract`` and
    written as ``test.arff`` into a new temporary directory; the path is
    stored in ``self.test_filename``.  ``self.javaclass`` is then run with
    ``-t self.train_filename -T self.test_filename -p 0`` after the global
    java options have been set to ``-Xmx2000M``.

    Nothing in this method removes the temporary directory or resets the
    java options.

    :param test_comments: Items to classify.
    :param feature_extractor: Object providing an ``extract`` callable.
    :return: Whatever ``self.parse_weka_output`` returns for the
        newline-split standard output.
    """
    test_set = nltk.classify.util.apply_features(
        feature_extractor.extract, test_comments)

    scratch_dir = tempfile.mkdtemp()
    self.test_filename = os.path.join(scratch_dir, 'test.arff')
    logger.info('Writing Test WEKA File: ' + self.test_filename)
    self._write_ARFF_file(self.test_filename, test_set)

    cmd = [
        self.javaclass,
        '-t', self.train_filename,
        '-T', self.test_filename,
        '-p', '0',
    ]
    logger.info('Executing WEKA: ' + str(cmd))

    config_java(options='-Xmx2000M')
    weka_out, _weka_err = java(cmd, classpath=weka_classpath,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    return self.parse_weka_output(weka_out.split('\n'))
def call_mallet(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True):
    """
    Call ``nltk.internals.java()`` with the given command, and with the
    classpath modified to include both ``nltk.jar`` and all the ``.jar``
    files defined by Mallet.

    If the module-level ``_mallet_classpath`` has not been set yet,
    ``config_mallet()`` is called first.

    Fix: a caller-supplied classpath was joined to the Mallet classpath
    with a hard-coded ``':'``; ``os.pathsep`` is used instead, which is
    identical on POSIX and yields a valid java classpath on Windows.

    See ``nltk.internals.java()`` for parameter and return value
    descriptions.
    """
    import os  # local import: only needed for the path separator

    if _mallet_classpath is None:
        config_mallet()

    # Set up the classpath.
    if classpath is None:
        classpath = _mallet_classpath
    else:
        classpath += os.pathsep + _mallet_classpath

    # Delegate to java().
    return java(cmd, classpath, stdin, stdout, stderr, blocking)
def tag(self, tokens):
    """
    Tag ``tokens`` using a long-lived tagger process fed through pipes.

    On first use a non-blocking java child is started and a daemon thread
    is created that runs ``_enqueue_output`` on the child's stdout and
    ``self._queue``.  Each call writes one line (tokens joined by spaces,
    embedded newlines replaced by spaces) to the child's stdin and waits
    up to 120 seconds for a result on the queue.

    Fixes relative to the previous version:

    * The global java options were saved before launching the child but
      never restored; they are now restored right after the launch.
    * A third ``.replace('\\r\\n', ' ')`` was dropped: it could never match
      once every ``\\n`` and ``\\r`` had been replaced.

    NOTE(review): the child is started with ``-encoding self._encoding``
    but the input is always encoded as UTF-8 - confirm this is intended.

    :param tokens: Iterable of token strings.
    :return: ``self.parse_output(...)`` of the queued output; ``[]`` for
        empty input or on timeout.
    """
    _input = ' '.join(tokens).replace('\n', ' ').replace('\r', ' ').strip()
    if len(_input) == 0:
        return []

    # Create pipe if not already opened.
    if not self._thread:
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)
        try:
            cmd = list(self._cmd)
            cmd.extend(['-encoding', encoding])
            self._child = java(cmd, classpath=self._stanford_jar,
                               stdin='pipe', stdout='pipe', stderr='pipe',
                               blocking=False)
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)
        self._queue = Queue()
        self._thread = Thread(target=_enqueue_output,
                              args=(self._child.stdout, self._queue))
        self._thread.daemon = True
        self._thread.start()

    # All newlines were cleared above; append exactly one for java.
    _input += '\n'
    self._child.stdin.write(_input.encode('utf-8'))
    self._child.stdin.flush()
    try:
        # Wait for 2m; usually should return in less than 100ms.
        return self.parse_output(self._queue.get(timeout=120))
    except Empty:
        print('stanford postagger timeout, return empty tuple instead',
              file=sys.stderr)
        return []
def _classify_using_weka(self, test_comments, feature_extractor):
    """
    Run Weka on ``test_comments`` and return the parsed predictions.

    Features come from ``feature_extractor.extract`` (applied through
    ``nltk.classify.util.apply_features``) and are written as
    ``test.arff`` into a new temporary directory; the path is stored in
    ``self.test_filename``.  ``self.javaclass`` is then run with
    ``-t self.train_filename -T self.test_filename -p 0`` after the global
    java options have been set to ``-Xmx2000M``.

    Nothing in this method removes the temporary directory or resets the
    java options.

    :param test_comments: Items to classify.
    :param feature_extractor: Object providing an ``extract`` callable.
    :return: Whatever ``self.parse_weka_output`` returns for the
        newline-split standard output.
    """
    test_set = nltk.classify.util.apply_features(
        feature_extractor.extract, test_comments)

    self.test_filename = os.path.join(tempfile.mkdtemp(), 'test.arff')
    logger.info('Writing Test WEKA File: ' + self.test_filename)
    self._write_ARFF_file(self.test_filename, test_set)

    weka_args = ['-t', self.train_filename, '-T', self.test_filename]
    cmd = [self.javaclass] + weka_args + ['-p', '0']
    logger.info('Executing WEKA: ' + str(cmd))

    config_java(options='-Xmx2000M')
    result = java(cmd, classpath=weka_classpath,
                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = result
    output_lines = stdout.split('\n')
    return self.parse_weka_output(output_lines)
def _batch_classify(self, featuresets, options):
    """
    Classify ``featuresets`` with the stored Weka model.

    Writes the feature sets to ``test.arff`` in a temporary directory,
    runs Weka NaiveBayes with ``-l self._model -T <test file>`` plus
    ``options``, and parses the newline-split standard output with
    ``parse_weka_output``.  The temporary directory is always removed.

    :param featuresets: Data written by ``self._formatter``.
    :param options: List of extra command-line arguments.
    :return: Whatever ``self.parse_weka_output`` returns.
    """
    # Make sure we can find java & weka.
    config_weka()

    temp_dir = tempfile.mkdtemp()
    try:
        test_filename = os.path.join(temp_dir, 'test.arff')
        self._formatter.write(test_filename, featuresets)

        cmd = ['weka.classifiers.bayes.NaiveBayes',
               '-l', self._model,
               '-T', test_filename]
        cmd = cmd + options
        output, _unused = java(cmd, classpath=_weka_classpath,
                               stdout=subprocess.PIPE)

        output_lines = output.split('\n')
        return self.parse_weka_output(output_lines)
    finally:
        # Empty the scratch directory, then remove it.
        for filename in os.listdir(temp_dir):
            os.remove(os.path.join(temp_dir, filename))
        os.rmdir(temp_dir)
def _classify_many(self, test_instances_filepath, options, show_stdout=False):
    """
    Run Weka's InputMappedClassifier over an already-written ARFF file.

    The command is ``weka.classifiers.misc.InputMappedClassifier -I -M -L
    self._model -T <file> -t <file> -p 1``.  It is prefixed with
    ``weka.Run`` when ``self.classifier`` is LibLINEAR or RotationForest.
    The command line is printed before it is executed.

    The incoming ``options`` argument is overwritten by a
    ``-classifications ...`` option string split on spaces.  That list is
    not appended to the command; it is only consulted afterwards to decide
    how the output is post-processed.

    :param test_instances_filepath: Path of the test instances file; the
        prediction path is derived by replacing its last five characters
        with ``.csv``.
    :param options: Ignored (overwritten, see above).
    :param show_stdout: If true, echo at most the first 20 output lines.
    :return: ``None`` if java wrote to stderr and produced no stdout
        (stderr is printed).  If the option list contains both
        ``-classifications`` and ``-file``: the stripped, non-empty output
        lines starting at the first ``inst#`` line, excluding lines whose
        first character is ``=``.  Otherwise the result of
        ``parse_weka_output``, or the raw lines if that raises
        ``ValueError``.
    """
    # config_weka() is intentionally not called here (disabled upstream).

    # We have already written the test instances to a file, whose name was
    # passed in as test_instances_filepath.
    prediction_filepath = test_instances_filepath[:-5] + ".csv"
    options = '-classifications "weka.classifiers.evaluation.output.prediction.CSV –p 1 -file {path}"'.format(
        path=prediction_filepath)
    options = options.split(' ')

    ## INPUTMAPPEDCLASSIFIER
    cmd = [
        "weka.classifiers.misc.InputMappedClassifier",
        '-I', '-M',
        '-L', self._model,
        '-T', test_instances_filepath,
        '-t', test_instances_filepath,
        '-p', '1',
    ]

    needs_runner = self.classifier in (
        'weka.classifiers.functions.LibLINEAR',
        'weka.classifiers.meta.RotationForest',
    )
    if needs_runner:
        cmd = ['weka.Run'] + cmd

    print(" ".join(cmd))
    stdout, stderr = java(cmd, classpath=_weka_classpath,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check if something went wrong: report stderr and give up.
    if stderr and not stdout:
        print("STDERR was:")
        for err_line in stderr.decode(stdin.encoding).split('\n'):
            print(err_line)
        return None

    stdout_lines = stdout.decode(stdin.encoding).split('\n')

    if show_stdout:
        surplus = len(stdout_lines) - 20
        if surplus <= 0:
            for out_line in stdout_lines:
                print(out_line)
        else:
            print("\nFirst 20 lines of stdout were:")
            for out_line in stdout_lines[:20]:
                print(out_line)
            print(" ({0} more lines...)".format(surplus))

    # If we are using the -classifications argument with an output
    # prediction class and sending predictions to a file rather than
    # stdout, don't bother parsing the output here.
    if '-classifications' in options and '-file' in options:
        # Strip unwanted text from stdout: start at the header line.
        header_at = next(
            (pos for pos, text in enumerate(stdout_lines)
             if text.strip().startswith("inst#")),
            None,
        )
        if header_at is not None:
            stdout_lines = stdout_lines[header_at:]
        return [
            text.strip() for text in stdout_lines
            if text.strip() and text[0] != "="
        ]

    # Parse weka's output, falling back to the raw lines.
    try:
        return self.parse_weka_output(stdout_lines)
    except ValueError:
        return stdout_lines
def start(self):
    """
    Launch the StanfordCoreNLPServer process and wait until it answers.

    The server is started non-blocking through ``java`` with stdout and
    stderr set to ``'pipe'``, and the handle is stored in ``self.popen``.
    The global java options are switched to ``self.java_options`` for the
    launch and restored afterwards.  The ``live`` endpoint is then polled
    up to 30 times and the ``ready`` endpoint up to 60 times, sleeping one
    second after each refused connection.

    :raise CoreNLPServerError: If the process has already exited, or an
        endpoint never returned an OK response.
    """
    import requests

    cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
    if self.corenlp_options:
        cmd.extend(self.corenlp_options)

    # Configure java.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=self.verbose)
    try:
        # TODO: it's probably a bad idea to pipe stdout, as it will
        # accumulate when lots of text is being parsed.
        self.popen = java(
            cmd,
            classpath=self._classpath,
            blocking=False,
            stdout='pipe',
            stderr='pipe',
        )
    finally:
        # Return java configurations to their default values.
        config_java(options=default_options, verbose=self.verbose)

    # Check that the server is still running.
    returncode = self.popen.poll()
    if returncode is not None:
        _, stderrdata = self.popen.communicate()
        raise CoreNLPServerError(
            returncode,
            'Could not start the server. '
            'The error was: {}'.format(stderrdata.decode('ascii'))
        )

    # (endpoint, attempts, message raised when attempts are exhausted)
    checks = (
        ('live', 30, 'Could not connect to the server.'),
        ('ready', 60, 'The server is not ready.'),
    )
    for endpoint, attempts, failure_message in checks:
        for _ in range(attempts):
            try:
                response = requests.get(
                    requests.compat.urljoin(self.url, endpoint))
            except requests.exceptions.ConnectionError:
                time.sleep(1)
            else:
                if response.ok:
                    break
        else:
            raise CoreNLPServerError(failure_message)
def tag(sents, java_options='-Xmx1g -XX:ParallelGCThreads=2'):
    """Tags a sentence using the CMU twitter tokenizer.

    The non-blank sentences are stripped and written one per line to a
    temporary file, ``cmu.arktweetnlp.RunTagger`` is run from the ``CMU``
    directory under ``susx._sussex_root`` with its stdout redirected to a
    second temporary file, and that file's content is returned.

    Fixes relative to the previous version:

    * The classpath was joined with a hard-coded ``';'``; ``os.pathsep``
      is used instead (identical on Windows, valid elsewhere).
    * The working directory is restored and both temporary files are
      removed even when java or the file handling raises.

    NOTE(review): the ``else`` that applies ``java_options`` is attached
    to the platform check here (non-Windows); the collapsed source is
    ambiguous about this - confirm against the original layout.

    :param sents: List of sentences to be tagged. The list should contain
        each sentence as a string.
    :type sents: list of str
    :param java_options: Options passed to ``config_java``.
    :return: The tagger output in CoNLL format, as one string.
    :raise RuntimeError: On Windows, if java is not configured and none of
        the candidate paths in ``_paths`` exists.
    """
    _root = os.path.join(susx._sussex_root, 'CMU')
    jars = [
        os.path.join(_root, jar) for jar in os.listdir(_root)
        if jar.endswith('.jar')
    ]
    _cp = os.pathsep.join(jars)

    _input_fd, _input_file_path = tempfile.mkstemp(text=True)
    _output_fd = None
    _output_file_path = None
    try:
        # Write the sentences to the temp file.
        with os.fdopen(_input_fd, 'w') as _input_fh:
            _input_fh.write('\n'.join(x.strip() for x in sents if x.strip()))

        _output_fd, _output_file_path = tempfile.mkstemp(text=True)

        # If we're on windows and java hasn't been configured yet.
        if platform.platform().startswith('Windows'):
            if nltk.internals._java_bin is None:
                jre_path = next(
                    (path for path in _paths if os.path.exists(path)), None)
                if jre_path is None:
                    raise RuntimeError(
                        'Can\'t find an installed Java Runtime Environment (JRE).'
                        'If you have installed java in a non standard location '
                        'please call nltk.internals.config_java with the correct '
                        'JRE path and options=\'-Xmx1g -XX:ParallelGCThreads=2\' '
                        'before calling sussex_nltk.cmu.tag')
                config_java(jre_path, options=java_options, verbose=False)
        else:
            config_java(options=java_options, verbose=False)

        _cmd = [
            'cmu.arktweetnlp.RunTagger', '--no-confidence',
            '--output-format', 'conll', _input_file_path
        ]

        # The tagger is run from inside its own directory.
        _dir = os.getcwd()
        os.chdir(_root)
        try:
            java(_cmd, classpath=_cp, stdout=_output_fd,
                 stderr=subprocess.PIPE)
        finally:
            os.chdir(_dir)

        with open(_output_file_path, 'r') as _output_file:
            return _output_file.read()
    finally:
        if _output_fd is not None:
            # Close the descriptor before unlinking (required on Windows).
            os.close(_output_fd)
            os.unlink(_output_file_path)
        os.unlink(_input_file_path)
def train(cls, formatter, model_filename, featuresets, all_labels, classifier='', options=[], quiet=True, fileprefix=''):
    """
    Train a Weka model from ``featuresets`` and return a ``WekaClassifier``.

    The training data is written to ``<fileprefix>train.arff`` in the
    current directory (``os.curdir``); the file is not removed afterwards.

    When ``classifier`` is ``'filtered'``:

    * the Weka ``AddValues`` filter is run first on the training file
      (``-C last -S -L <comma-joined all_labels>``), writing
      ``..._filtered.arff``, which then becomes the training input;
    * the classifier name becomes ``'mnbtext'``;
    * ``options`` is ignored and ``fileprefix`` is not forwarded to the
      returned ``WekaClassifier``.

    :param formatter: Object whose ``write(filename, featuresets)``
        produces the ARFF file; also passed to ``WekaClassifier``.
    :param model_filename: Value for Weka's ``-d`` flag (model output).
    :param featuresets: Training data.
    :param all_labels: Labels joined with ``","`` for the filter's ``-L``.
    :param classifier: ``'filtered'``, a key of ``cls._CLASSIFIER_CLASS``,
        or one of its values.
    :param options: Extra command-line arguments (non-filtered case only).
    :param quiet: If true, subprocess stdout is piped, not inherited.
    :param fileprefix: Prefix for the training file name.
    :return: The new ``WekaClassifier``; the resolved java class is passed
        as its ``classifier`` keyword.
    :raise ValueError: If the classifier name cannot be resolved.
    """
    # Make sure we can find java & weka.
    config_weka()

    # Write the training data file.
    train_filename = os.path.join(os.curdir, fileprefix + 'train.arff')
    formatter.write(train_filename, featuresets)

    out_target = subprocess.PIPE if quiet else None
    use_filter = classifier == 'filtered'

    if use_filter:
        classifier = 'mnbtext'
        # Run a weka filter on the training data set first.
        filter_output = train_filename[:-5] + "_filtered.arff"
        filter_cmd = [
            "weka.filters.unsupervised.attribute.AddValues",
            '-C', 'last',
            '-S',
            '-L', ",".join(all_labels),
            '-i', train_filename,   # input to the filter
            '-o', filter_output,
        ]
        java(filter_cmd, classpath=_weka_classpath, stdout=out_target)
        # Use the output of the filter as the input to the classifier.
        train_filename = filter_output
        # Caller-supplied options are not applied in this mode.
        options = []

    if classifier in cls._CLASSIFIER_CLASS:
        javaclass = cls._CLASSIFIER_CLASS[classifier]
    elif classifier in cls._CLASSIFIER_CLASS.values():
        javaclass = classifier
    else:
        raise ValueError('Unknown classifier %s' % classifier)

    # Train the weka model.
    cmd = [
        javaclass,
        '-d', model_filename,   # sets model output file
        '-t', train_filename,   # sets training input data file
    ] + list(options)
    java(cmd, classpath=_weka_classpath, stdout=out_target)

    # Return the new classifier.  The java class is forwarded so that
    # classification uses the same class the model was trained with.
    if use_filter:
        return WekaClassifier(formatter, model_filename, classifier=javaclass)
    return WekaClassifier(formatter, model_filename,
                          classifier=javaclass, fileprefix=fileprefix)