Example #1
from nltk.parse.corenlp import CoreNLPServer


# CorenlpRemoteWordSplitter is defined elsewhere in this codebase (not shown here).
class CorenlpSubprocWordSplitter(CorenlpRemoteWordSplitter):
    """
    A ``WordSplitter`` that uses CoreNLP's tokenizer.
    It starts ``corenlp-server`` as a sub-process and calls its Web API.
    """
    def __init__(
        self,
        path_to_jar: str = None,
        path_to_models_jar: str = None,
        verbose: bool = False,
        java_options: str = None,
        corenlp_options: str = None,
        port: int = None,
        encoding: str = 'utf8',
    ):
        """
        Parameters
        ----------

        * For the parameters ``path_to_jar`` through ``port``, see https://www.nltk.org/api/nltk.parse.html#nltk.parse.corenlp.
        * For the ``encoding`` parameter, see https://www.nltk.org/api/nltk.parse.html#nltk.parse.corenlp.CoreNLPParser
        """
        self._server = CoreNLPServer(path_to_jar, path_to_models_jar, verbose,
                                     java_options, corenlp_options, port)
        self._server.start()
        super().__init__(self._server.url, encoding)

    def __del__(self):
        self._server.stop()
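A minimal usage sketch for this splitter. The jar paths are placeholders, and the AllenNLP-style ``split_words`` method is assumed to come from the ``CorenlpRemoteWordSplitter`` base class, which is not shown:

splitter = CorenlpSubprocWordSplitter(
    path_to_jar='/path/to/stanford-corenlp-X.X.X.jar',
    path_to_models_jar='/path/to/stanford-corenlp-X.X.X-models.jar',
)
tokens = splitter.split_words('The quick brown fox jumps over the lazy dog.')
# The embedded server is stopped when the splitter is garbage-collected (__del__).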
Example #2
def dependency_parse(raw_data):
    import os

    from nltk.parse.corenlp import CoreNLPServer

    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()
    from nltk.parse import CoreNLPParser
    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words

    server.stop()
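The loop above is truncated: it never defines what the "important" neighboring words are, and ``new_data`` is never populated. As one illustrative reading (an assumption, not the original author's definition), the helper below collects the tokens adjacent to each noun in the constituency parse:

def neighbor_words(parse_tree):
    """Return the words immediately before/after each noun in an nltk Tree."""
    tagged = parse_tree.pos()  # [(word, tag), ...] over the leaves
    neighbors = set()
    for i, (_, tag) in enumerate(tagged):
        if tag.startswith('NN'):  # nouns only; "important" is an assumption here
            if i > 0:
                neighbors.add(tagged[i - 1][0])
            if i + 1 < len(tagged):
                neighbors.add(tagged[i + 1][0])
    return neighbors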
Example #3
def server():
    import time  # used below to idle without busy-waiting

    print('Starting CoreNLP server...')
    serv = CoreNLPServer(path_to_jar=config.CORENLP_JAR,
                         path_to_models_jar=config.CORENLP_MODELS_JAR)
    try:
        serv.start()
        print('Server started.')
        while True:
            time.sleep(1)  # idle until Ctrl-C; a bare `pass` here would spin the CPU
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        print('Stopping server...')
        serv.stop()
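Example #4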
class CoreNLP:
    def __init__(self, args):
        self.context = dict()
        self.server = None
        self.set_system_env(*args)

    def set_system_env(self, *args):
        idx = 1
        while idx < len(args):
            if args[idx] == '--stanford':
                idx += 1
                stanford_path = args[idx]
                self.context['path_to_jar'] = os.path.join(stanford_path, 'stanford-corenlp-3.9.2.jar')
                self.context['path_to_models_jar'] = os.path.join(stanford_path, 'stanford-corenlp-3.9.2-models.jar')
                print('corenlp jar:', self.context['path_to_jar'])
                print('corenlp models jar:', self.context['path_to_models_jar'])

            elif args[idx] == '--java':
                idx += 1
                java_path = args[idx]
                os.environ['JAVAHOME'] = java_path
                print('java path:', java_path)

            idx += 1

    def start_server(self):
        self.server = CoreNLPServer(**self.context)
        self.server.start()

    def stop_server(self):
        self.server.stop()

    def parse_tree(self, s):
        parser = CoreNLPParser()

        parse = next(parser.raw_parse(s))
        # parse.draw()

        return parse

    def dependency_parse_tree(self, s):
        parser = CoreNLPDependencyParser()

        parse = next(parser.raw_parse(s))

        return parse
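A hedged driver for this class, assuming it is run as a script whose argv carries the flags that ``set_system_env`` parses (all paths are placeholders):

import sys

nlp = CoreNLP(sys.argv)  # e.g.: script.py --stanford /opt/corenlp --java /usr/bin/java
nlp.start_server()
try:
    tree = nlp.parse_tree('The screen is great.')
    tree.pretty_print()  # nltk Tree: ASCII rendering of the constituency parse
    graph = nlp.dependency_parse_tree('The screen is great.')
    print(graph.to_conll(4))  # word / POS / head / relation columns
finally:
    nlp.stop_server()

Example #5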
lines = f2.readlines()
target_list = [line.rstrip('\n') for line in lines]

print(opinion_list)
print(target_list)

df = pd.read_csv('data.csv', index_col=0)
df['comment'] = df['comment'].apply(process_sentence)
#print(df.head())
print(len(df.index.values))

#opinion_size = len(opinion_list)
#target_size = len(target_list)

sents = df['comment'].values
opinion_d_l = []
target_d_l = []
for sent in sents:
    tp, op = parse_comment(sent)
    opinion_d_l.append(op)
    target_d_l.append(tp)
#print(opinion_d_l)

dict_list = {'opinion': opinion_d_l, 'target': target_d_l}
df = pd.DataFrame(data=dict_list)
print(df.head())
server.stop()
df.to_csv('transactions.csv')
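Example #6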
class CoreNLPSentenceAnalyzer:
    """
    A sentence analyzer based on Stanford CoreNLP.

    References:
        The CoreNLP Syntax Parser
            https://bbengfort.github.io/snippets/2018/06/22/corenlp-nltk-parses.html
        Penn Treebank II Tags
            https://gist.github.com/nlothian/9240750
    """
    def __init__(self):
        self.lab_set = set()

    def init_server(self):
        STANFORD = os.path.join("stanford-corenlp-full-2018-10-05")
        self.server = CoreNLPServer(
            os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
            os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"))
        self.server.start()
        self.parser = CoreNLPParser()

    def stop_server(self):
        self.server.stop()

    def parse_syntax(self, sent):
        return next(self.parser.raw_parse(sent))

    def _collect_labels(self, node):
        """
        Collect labels in the given node recursively. This method should not be invoked directly but done by collect_labels.
        """
        try:
            self.lab_result.append(node.label())
        except AttributeError:
            return
        for nn in node:
            self._collect_labels(nn)
        return

    def collect_labels(self, node):
        """
        Collect all labels in a tree starting from the given node.
        """
        self.lab_result = []  # used to collect labels in the recursion
        self._collect_labels(node)
        lab_counter = Counter(self.lab_result)

        # Keep the tags we have seen so far
        self.lab_set = self.lab_set.union(lab_counter.keys())

        return lab_counter

    def get_lab_series(self, lab_counter_list):
        """
        Convert and merge all lab_counters in the given list (the result of
        "collect_labels") into a series, using the tags seen so far (self.lab_set).
        """
        labs = sorted(self.lab_set)  # fix a column order; a raw set is unordered
        # DataFrame.append was removed in pandas 2.0; build the frame in one step.
        rows = [pd.Series(lab_counter, index=labs)
                for lab_counter in lab_counter_list]
        rt = pd.DataFrame(rows)
        rt = rt.add_prefix('penn_')
        return rt.sum()
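A short end-to-end sketch of the analyzer. The sentences are invented, and the snippet's elided imports (``os``, ``pandas as pd``, ``collections.Counter``, and the nltk CoreNLP classes) are assumed to be in place:

analyzer = CoreNLPSentenceAnalyzer()
analyzer.init_server()
try:
    counters = [analyzer.collect_labels(analyzer.parse_syntax(s))
                for s in ['The screen is great.', 'The battery dies quickly.']]
    print(analyzer.get_lab_series(counters))  # summed Penn-tag counts (penn_NN, penn_VBZ, ...)
finally:
    analyzer.stop_server()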
Example #7
import logging

from nltk.parse.corenlp import (CoreNLPDependencyParser, CoreNLPServer,
                                CoreNLPServerError)
# nltk's CoreNLP client talks to the server via the requests library, so this
# is presumably the HTTPError caught in summarize() below.
from requests.exceptions import HTTPError


class Summarizer:
    """
    Summarizer class implementing opinion-feature extraction. Uses Stanford CoreNLP dependency parser.

    Attributes:
    server (CoreNLPServer): CoreNLP server for accessing Stanford CoreNLP services.
    parser (CoreNLPDependencyParser): CoreNLP dependency parser.

    """
    def __init__(self, jar_path, models_jar_path):
        """
        The constructor for Summarizer class.

        Parameters:
        jar_path (str): Filepath to Stanford CoreNLP .jar file.
        models_jar_path (str): Filepath to Stanford CoreNLP models .jar file.

        """
        logging.info('Starting CoreNLP server...')
        self.server = CoreNLPServer(path_to_jar=jar_path,
                                    path_to_models_jar=models_jar_path)
        try:
            self.server.start()
            logging.info('CoreNLP server started.')
        # CoreNLPServerError is thrown when a server is already running
        except CoreNLPServerError:
            logging.warning('CoreNLP server is already running.')
        self.parser = CoreNLPDependencyParser()

    def summarize(self, text):
        """
        Summarizes a review. Extracts opinion-feature pairs from it.

        Parameters:
        text (str): Review text.

        Returns:
        Summary: List of opinion-feature pairs extracted from the review text.

        """
        try:
            parse = next(self.parser.raw_parse(text))
        # The CoreNLP server responds with an HTTP error for some inputs
        # (e.g. unrecognized characters in the review text).
        except HTTPError:
            logging.warning(f'Review skipped: {text}')
            return []

        # Search dependency parsing result to find "nsubj" or "amod" tags
        summary = list()
        for governor, dep, dependent in parse.triples():
            if dep == 'nsubj':
                # Check whether an adjectival predicate has a noun as its subject
                if governor[1] == 'JJ' and dependent[1] in {'NN', 'NNS'}:
                    summary.append((governor[0].lower(), dependent[0].lower()))
            elif dep == 'amod':
                # Check whether the adjective modifies a noun
                if dependent[1] == 'JJ' and governor[1] in {'NN', 'NNS'}:
                    summary.append((dependent[0].lower(), governor[0].lower()))
        return summary

    def stop(self):
        """
        Stops the CoreNLP server of the summarizer object.

        """
        self.server.stop()
        logging.info('CoreNLP server stopped.')
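A hypothetical driver for the class; the jar paths and review text are invented for illustration:

summarizer = Summarizer(
    jar_path='/path/to/stanford-corenlp-X.X.X.jar',
    models_jar_path='/path/to/stanford-corenlp-X.X.X-models.jar',
)
pairs = summarizer.summarize('The screen is great but the battery is weak.')
print(pairs)  # expected shape: [('great', 'screen'), ('weak', 'battery')]
summarizer.stop()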