from nltk.parse.corenlp import CoreNLPServer


class CorenlpSubprocWordSplitter(CorenlpRemoteWordSplitter):
    """
    A ``WordSplitter`` that uses CoreNLP's tokenizer.
    It starts ``corenlp-server`` as a sub-process and calls its Web API.
    """

    def __init__(
        self,
        path_to_jar: str = None,
        path_to_models_jar: str = None,
        verbose: bool = False,
        java_options: str = None,
        corenlp_options: str = None,
        port: int = None,
        encoding: str = 'utf8',
    ):
        """
        Parameters
        ----------
        * For parameters from ``path_to_jar`` to ``port``, see
          https://www.nltk.org/api/nltk.parse.html#nltk.parse.corenlp.
        * For parameter ``encoding``, see
          https://www.nltk.org/api/nltk.parse.html#nltk.parse.corenlp.CoreNLPParser
        """
        self._server = CoreNLPServer(path_to_jar, path_to_models_jar, verbose,
                                     java_options, corenlp_options, port)
        self._server.start()
        super().__init__(self._server.url, encoding)

    def __del__(self):
        self._server.stop()
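# A minimal usage sketch. It assumes the parent class
# ``CorenlpRemoteWordSplitter`` exposes a ``split_words(sentence)`` method
# (as in AllenNLP-style ``WordSplitter`` interfaces); the jar paths below are
# placeholders, not the project's real ones.
splitter = CorenlpSubprocWordSplitter(
    path_to_jar='stanford-corenlp-3.9.2.jar',
    path_to_models_jar='stanford-corenlp-3.9.2-models.jar',
)
tokens = splitter.split_words('The quick brown fox jumps over the lazy dog.')
print(tokens)
# The sub-process is stopped in ``__del__`` when the splitter is garbage-collected.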
import os

from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPServer


def dependency_parse(raw_data):
    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()

    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words
        # (placeholder: collect the parse alongside the original example data)
        new_data.append((sentence, parse, features_seq))

    server.stop()
    return new_data
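# A hypothetical call, assuming each example in ``raw_data`` is a tuple whose
# first element is the sentence and whose last element is a feature sequence:
raw_data = [('The food was great.', ['O', 'O', 'O', 'B-OP', 'O'])]
parsed = dependency_parse(raw_data)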
import time

import config  # local module holding the CoreNLP jar paths
from nltk.parse.corenlp import CoreNLPServer


def server():
    print('Starting CoreNLP server...')
    serv = CoreNLPServer(path_to_jar=config.CORENLP_JAR,
                         path_to_models_jar=config.CORENLP_MODELS_JAR)
    try:
        serv.start()
        print('Server started.')
        # Idle until interrupted (sleep instead of a busy-wait loop)
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        print('Stopping server...')
        serv.stop()
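# The ``config`` module referenced above is not shown; a minimal sketch of
# what it might contain (the paths are placeholders, not the project's real ones):

# config.py
CORENLP_JAR = 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar'
CORENLP_MODELS_JAR = 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2-models.jar'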
import os

from nltk.parse import CoreNLPDependencyParser, CoreNLPParser
from nltk.parse.corenlp import CoreNLPServer


class CoreNLP:
    def __init__(self, args):
        self.context = dict()
        self.server = None
        self.set_system_env(*args)

    def set_system_env(self, *args):
        idx = 1
        while idx < len(args):
            if args[idx] == '--stanford':
                idx += 1
                stanford_path = args[idx]
                self.context['path_to_jar'] = os.path.join(
                    stanford_path, 'stanford-corenlp-3.9.2.jar')
                self.context['path_to_models_jar'] = os.path.join(
                    stanford_path, 'stanford-corenlp-3.9.2-models.jar')
                print('corenlp jar:', self.context['path_to_jar'])
                print('corenlp models jar:', self.context['path_to_models_jar'])
            elif args[idx] == '--java':
                idx += 1
                java_path = args[idx]
                os.environ['JAVAHOME'] = java_path
                print('java path:', java_path)
            idx += 1

    def start_server(self):
        self.server = CoreNLPServer(**self.context)
        self.server.start()

    def stop_server(self):
        self.server.stop()

    def parse_tree(self, s):
        parser = CoreNLPParser()
        parse = next(parser.raw_parse(s))
        # parse.draw()
        return parse

    def dependency_parse_tree(self, s):
        parser = CoreNLPDependencyParser()
        parse = next(parser.raw_parse(s))
        return parse
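# A usage sketch mirroring sys.argv-style arguments (index 0 is skipped, so a
# program-name placeholder comes first; the paths are placeholders):
nlp = CoreNLP(['prog', '--stanford', '/opt/stanford-corenlp-full-2018-10-05',
               '--java', '/usr/bin/java'])
nlp.start_server()
print(nlp.parse_tree('The quick brown fox jumps over the lazy dog.'))
print(nlp.dependency_parse_tree('The quick brown fox jumps over the lazy dog.'))
nlp.stop_server()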
lines = f2.readlines()
target_list = [line.split("\n")[0] for line in lines]
print(opinion_list)
print(target_list)

df = pd.read_csv('data.csv', index_col=0)
df['comment'] = df['comment'].apply(process_sentence)
# print(df.head())
print(len(df.index.values))
# opinion_size = len(opinion_list)
# target_size = len(target_list)

sents = df['comment'].values
opinion_d_l = []
target_d_l = []
for sent in sents:
    tp, op = parse_comment(sent)
    opinion_d_l.append(op)
    target_d_l.append(tp)
# print(opinion_d_l)

dict_list = {'opinion': opinion_d_l, 'target': target_d_l}
df = pd.DataFrame(data=dict_list)
df.head()

server.stop()
df.to_csv('transactions.csv')
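# The ``parse_comment`` helper used above is not shown; a minimal hypothetical
# sketch, assuming it pairs targets (nouns) with opinions (adjectives) via the
# dependency parser, much like the "amod" pattern used elsewhere in this
# section (a CoreNLP server must already be running):
from nltk.parse import CoreNLPDependencyParser

def parse_comment(sent):
    parser = CoreNLPDependencyParser()
    parse = next(parser.raw_parse(sent))
    targets, opinions = [], []
    for governor, dep, dependent in parse.triples():
        # adjectival modifier of a noun, e.g. "great (JJ) food (NN)"
        if dep == 'amod' and governor[1] in {'NN', 'NNS'}:
            targets.append(governor[0].lower())
            opinions.append(dependent[0].lower())
    return targets, opinions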
import os
from collections import Counter

import pandas as pd
from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPServer


class CoreNLPSentenceAnalyzer:
    """
    A sentence analyzer based on Stanford CoreNLP.

    References:
      The CoreNLP Syntax Parser
        https://bbengfort.github.io/snippets/2018/06/22/corenlp-nltk-parses.html
      Penn Treebank II Tags
        https://gist.github.com/nlothian/9240750
    """

    def __init__(self):
        self.lab_set = set()

    def init_server(self):
        STANFORD = os.path.join("stanford-corenlp-full-2018-10-05")
        self.server = CoreNLPServer(
            os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
            os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"))
        self.server.start()
        self.parser = CoreNLPParser()

    def stop_server(self):
        self.server.stop()

    def parse_syntax(self, sent):
        return next(self.parser.raw_parse(sent))

    def _collect_labels(self, node):
        """
        Collect labels in the given node recursively. This method should not
        be invoked directly; use ``collect_labels`` instead.
        """
        try:
            self.lab_result.append(node.label())
        except AttributeError:
            # Leaf (token string) reached; nothing to collect
            return
        for nn in node:
            self._collect_labels(nn)

    def collect_labels(self, node):
        """
        Collect all labels in a tree starting from the given node.
        """
        self.lab_result = []  # used to collect labels in the recursion
        self._collect_labels(node)
        lab_counter = Counter(self.lab_result)

        # Keep the tags we have seen so far
        self.lab_set = self.lab_set.union(lab_counter.keys())

        return lab_counter

    def get_lab_series(self, lab_counter_list):
        """
        Convert and merge all lab_counters in the given list (the result of
        "collect_labels") into a series by using tags which have been seen so
        far (self.lab_set).
        """
        # Build the rows up front; DataFrame.append was removed in pandas 2.0
        idx = list(self.lab_set)
        rows = [pd.Series(lab_counter, index=idx)
                for lab_counter in lab_counter_list]
        rt = pd.DataFrame(rows, columns=idx)
        rt = rt.add_prefix('penn_')
        return rt.sum()
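# A short usage sketch (the sentence is arbitrary; the jar locations follow
# the defaults hard-coded in ``init_server`` above):
analyzer = CoreNLPSentenceAnalyzer()
analyzer.init_server()
tree = analyzer.parse_syntax('The quick brown fox jumps over the lazy dog.')
lab_counter = analyzer.collect_labels(tree)     # Counter of Penn Treebank tags
print(analyzer.get_lab_series([lab_counter]))   # e.g. penn_NP, penn_VP, ...
analyzer.stop_server()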
import logging

from nltk.parse.corenlp import (CoreNLPDependencyParser, CoreNLPServer,
                                CoreNLPServerError)
# NLTK's CoreNLP client is built on requests, whose HTTPError is caught below
from requests.exceptions import HTTPError


class Summarizer:
    """
    Summarizer class implementing opinion-feature extraction. Uses the
    Stanford CoreNLP dependency parser.

    Attributes:
        server (CoreNLPServer): CoreNLP server for accessing Stanford CoreNLP services.
        parser (CoreNLPDependencyParser): CoreNLP dependency parser.
    """

    def __init__(self, jar_path, models_jar_path):
        """
        The constructor for the Summarizer class.

        Parameters:
            jar_path (str): Filepath to the Stanford CoreNLP .jar file.
            models_jar_path (str): Filepath to the Stanford CoreNLP models .jar file.
        """
        logging.info('Starting CoreNLP server...')
        self.server = CoreNLPServer(path_to_jar=jar_path,
                                    path_to_models_jar=models_jar_path)
        try:
            self.server.start()
            logging.info('CoreNLP server started.')
        # CoreNLPServerError is raised when a server is already running
        except CoreNLPServerError:
            logging.warning('CoreNLP server is already running.')
        self.parser = CoreNLPDependencyParser()

    def summarize(self, text):
        """
        Summarizes a review by extracting opinion-feature pairs from it.

        Parameters:
            text (str): Review text.

        Returns:
            Summary: List of opinion-feature pairs extracted from the review text.
        """
        try:
            parse = next(self.parser.raw_parse(text))
        # An HTTPError raised by the CoreNLP server is related to
        # unrecognized characters in the review text
        except HTTPError:
            logging.warning(f'Review skipped: {text}')
            return []

        # Search the dependency parsing result for "nsubj" or "amod" relations
        summary = list()
        for governor, dep, dependent in parse.triples():
            if dep == 'nsubj':
                # Check whether the nominal subject is a noun modified by an adjective
                if governor[1] == 'JJ' and dependent[1] in {'NN', 'NNS'}:
                    summary.append((governor[0].lower(), dependent[0].lower()))
            elif dep == 'amod':
                # Check whether the adjective is linked to a noun
                if dependent[1] == 'JJ' and governor[1] in {'NN', 'NNS'}:
                    summary.append((dependent[0].lower(), governor[0].lower()))
        return summary

    def stop(self):
        """
        Stops the CoreNLP server of the summarizer object.
        """
        self.server.stop()
        logging.info('CoreNLP server stopped.')
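# A short usage sketch; the jar paths are placeholders:
summarizer = Summarizer('stanford-corenlp-3.9.2.jar',
                        'stanford-corenlp-3.9.2-models.jar')
pairs = summarizer.summarize('The battery life is great but the screen is dim.')
print(pairs)  # e.g. [('great', 'life'), ('dim', 'screen')]
summarizer.stop()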