)
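# These examples assume a Stanford CoreNLP server is already listening on http://localhost:9000
# (the default port), started along the lines of the command quoted in annotate() further below:
#   $ cd stanford-corenlp-full-2018-02-27/
#   $ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer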

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(
    'What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)

# POS Tagger
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(
    list(pos_tagger.tag(
        'What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]\n"
)

# NER Tagger
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
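# A hedged completion (not in the original snippet): following the pattern of the examples
# above and the NER output quoted in Example #13 below, the tagging call would be:
print(
    list(ner_tagger.tag(
        'Rami Eid is studying at Stony Brook University in NY'.split())))
print(
    "\nExpected: [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]\n"
)
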
    parser = CoreNLPParser(url='http://localhost:9001')
    print('parser generated!')
    exception_sen = []
    tree_list = []
    p_phrase_trees = None
    f_input = open(args.input, mode='r', encoding='utf-8')

    f_output = open(args.output, mode='w', encoding='utf-8')
    # f_output = open(args.output, 'wt')
    for senid, line in enumerate(f_input):
        print(senid)
        # if senid > 100:
        #     break
        try:
            p_parse_trees = list(parser.parse(parser.tokenize(line)))
        except ValueError:
            print('parsing fail')
            exception_sen.append(senid)
            p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # we simply give a dummy tree
        f_output.write('%d\n' % len(p_parse_trees))
        for sub_tree in p_parse_trees:
            f_output.write(str(sub_tree))
            f_output.write('\n|||\n')
        # str_tree = ' '.join(p_parse_trees)
        # f_output.write(str_tree)
        # f_output.write('\n')

        # tree_list.append(p_parse_trees)
    f_input.close()
    f_output.close()
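    # A minimal reader sketch (hypothetical, not part of the original script), assuming the
    # '|||' separators and per-sentence tree counts written above:
    #     for block in open(args.output, encoding='utf-8').read().split('\n|||\n'):
    #         lines = [l for l in block.splitlines() if l.strip()]
    #         if lines and lines[0].isdigit():  # drop the tree-count line
    #             lines = lines[1:]
    #         if lines:
    #             tree = Tree.fromstring('\n'.join(lines))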
Example #3
class custom_parse_handler:
    corenlp_host = 'http://localhost:9000'  # CoreNLP server host
    main_categories = ['geography', 'music',
                       'movies']  # Categories utilized in the project

    def __init__(self, input_file, output_file, dbConnector):
        self.ip_file = input_file  # Input statements
        self.op_file = output_file  # Generated output streamed to this file apart from the command prompt

        self.parser = CoreNLPParser(
            url=self.corenlp_host
        )  # Initializing the connection with CoreNLP parse
        self.testParserConnection()

        # Setting up the word2vec model
        corpusFilePath = os.path.dirname(os.path.realpath(
            __file__)) + os.path.sep + "tools" + os.path.sep + "word2vec"
        corpusFileName = "GoogleNews-vectors-negative300.bin"
        self.filePath = corpusFilePath  # file path
        self.fileName = corpusFilePath + os.path.sep + corpusFileName  # Constructing the full path for the file name
        self.model = KeyedVectors.load_word2vec_format(self.fileName,
                                                       binary=True)

        self.stopWords = nltk.corpus.stopwords.words('english')
        self.fileNewLine = "\n"

        self.dbConnector = dbConnector

    def testParserConnection(self):
        str = "This is a test statement"
        try:
            list(self.parser.parse(str.split()))
        except Exception as e:
            print("Error while connecting to CoreNLP server. Exiting.")
            sys.exit()

    def getParseTree(self, sentence):
        return list(self.parser.parse(sentence.split()))

    def displayConstructedParseTree(self, parseTree, fileObj=None):
        for entry in parseTree:
            if fileObj is None:
                entry.pretty_print()
            else:
                entry.pretty_print(stream=fileObj)

    def updatePredictedCategoryForWord(self, entry, category_sum):
        for i in range(len(self.main_categories)):
            try:
                sim_val = self.model.similarity(entry, self.main_categories[i])
                category_sum[self.main_categories[i]] += sim_val
            except KeyError:
                pass
        return category_sum

    def getCategoryWithMaxVoting(self, categoryMap):
        max_val = None
        max_category = None
        for entry in categoryMap:
            val = categoryMap[entry]
            if max_val is None or val > max_val:
                max_val = val
                max_category = entry
        return max_category

    def assignCategory(self, statement):
        # Special case: statements beginning with "where is" tend to be misclassified, so assign 'geography' directly
        if statement.lower().startswith('where is'):
            return 'geography'
        tokens = list(self.parser.tokenize(statement))
        filtered_words = [w for w in tokens if w not in self.stopWords]
        filtered_words_lower = [w.lower() for w in filtered_words]
        category_sum = {}
        for entry in self.main_categories:
            # Skip non-'geography' categories when 'capital' appears; skip 'geography' when birth-related words appear
            if (entry != 'geography' and 'capital' not in filtered_words_lower
                ) or (entry == 'geography'
                      and not ('born' in filtered_words_lower
                               or 'birth' in filtered_words_lower)):
                category_sum[entry] = 0
        for entry in filtered_words:
            category_sum = self.updatePredictedCategoryForWord(
                entry, category_sum)
        return self.getCategoryWithMaxVoting(category_sum)

    # Direct the output to the default output stream and an output file.
    def outputGenerator(self, statement, query, answer, opFileObj=None):
        if opFileObj is not None:
            opFileObj.write("<QUESTION> " + statement)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            if query is not None:
                opFileObj.write("<QUERY> " + query)
            else:
                opFileObj.write("<QUERY> ")
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write("<ANSWER> " + answer)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
            opFileObj.write(self.fileNewLine)
        print("<QUESTION> ", statement, "\n")
        if query is not None:
            print("<QUERY> ", query, "\n")
        else:
            print("<QUERY>\n")
        print("<ANSWER> ", answer, "\n\n")

    # Process the parse tree to extract projections and perform translation into SQL queries
    def extractProjections(self, parseTree, queryObj, category):
        # Starts off the entire tree recursion process
        for entry in parseTree:
            # entry.pretty_print()
            self.processRecurse(None, entry, queryObj, category)

    # Recursively traverse the parse tree using DFS and generate transitions for each parent, child node-pair
    def processRecurse(self, parent, treeObj, queryObj, category):
        if not isinstance(treeObj, nltk.tree.Tree):  # leaf node
            transition_obj = transition(parent, treeObj, None, queryObj,
                                        category)
            return treeObj, transition_obj
        if "." == treeObj.label() or "DT" == treeObj.label(
        ):  # do not handle determiners or punctuation
            return "", None
        str_transition = treeObj.label() + " " + "->"
        current_children = []
        for i in range(len(treeObj)):
            label, transition_obj_inter = self.processRecurse(
                treeObj.label(), treeObj[i], queryObj, category)
            if transition_obj_inter is not None:
                current_children.append(transition_obj_inter)
                str_transition += " " + label
        if treeObj.label() != 'ROOT':
            transition_obj_fin = transition(parent, str_transition,
                                            current_children, queryObj,
                                            category)
        else:  # Root of the tree is encountered
            transition_obj_fin = None
        return treeObj.label(), transition_obj_fin

    def parseInputFile(self):
        # Parse the statements in the input file sequentially and perform semantic transformation
        ipFileObj = open(self.ip_file, "r")
        opFileObj = open(self.op_file, "w")
        try:
            for entry in ipFileObj:
                question = entry.strip()
                if not question.startswith('--'):
                    queryObj = queryForm()
                    parseTree = self.getParseTree(
                        question)  # Generate parse tree
                    category = self.assignCategory(
                        question)  # Assign probable category
                    self.extractProjections(
                        parseTree, queryObj, category
                    )  # Extract the projections and generate the query object
                    # queryObj.printComponents()
                    queryObj.constructQuery()  # Construct the final query
                    results = self.dbConnector.getResults(
                        queryObj, category
                    )  # Execute the query in the database and generate results
                    self.outputGenerator(question, queryObj.getQueryStr(),
                                         results, opFileObj)
        except Exception as e:
            print("Error while processing.")
            print(e)
        finally:
            ipFileObj.close()
            opFileObj.close()
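
# A minimal usage sketch (hypothetical file names and database connector, not part of the
# original listing):
#     db = dbConnection()  # project-specific database connector, assumed to exist
#     handler = custom_parse_handler('questions.txt', 'answers.txt', db)
#     handler.parseInputFile()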
Example #4
File: arabic.py  Project: mzettersten/vcs
for category in categoryNames:

    #set up list of responses for each category
    wordListResponse = []
    responseLengthList = []
    completeWordList = []
    responseList = []

    #loop through each response for that category
    for response in d.loc[d["image"] == category, "nameing_response"]:
        #break response into a list of unique words
        #first clean up response (removing punctuation, emoji, etc.)
        response_cleaned = clean_response(response)
        #now tokenize
        curWordList = list(parser.tokenize(response_cleaned))  #tokenize

        #add to list of word response lists
        wordListResponse.append(curWordList)

        #add to list tracking the number of words in each response
        responseLengthList.append(len(curWordList))

        #list of all individual word responses
        completeWordList = completeWordList + curWordList

        responseList.append(".".join(curWordList))

    #number of responses to category
    number_responses.append(len(responseLengthList))
    #set up list of responses for each category
    wordListResponse_1 = []
    lemmaListResponse_1 = []
    responseLengthList_1 = []
    completeWordList_1 = []
    completeLemmaList_1 = []
    responseList_1 = []

    #loop through each response for that category
    for response in d.loc[d["angle"] == category1, "nameing_response"]:
        #break response into a list of unique words
        #first clean up response (removing punctuation, emoji, etc.)
        response_cleaned = clean_response(response)
        #now tokenize
        curWordList = list(parser.tokenize(response_cleaned))  #tokenize the cleaned response

        #add to list of word response lists
        wordListResponse_1.append(curWordList)

        #add to list tracking the number of words in each response
        responseLengthList_1.append(len(curWordList))

        #list of all individual word responses
        completeWordList_1 = completeWordList_1 + curWordList

        responseList_1.append(".".join(curWordList))

    #set up list of responses for each category
    wordListResponse_2 = []
    lemmaListResponse_2 = []
Example #6
class SingleSentencePlot(SentencePlot):
    def __init__(self, config, nl_model):
        super().__init__(config)
        self.nl_model = nl_model
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)

    def plot(self, max_relevance_words_in_plot, sentence_id="1004293:0"):

        results_file = self.config.get_file_of_results(
            self.nl_model.config.name_of_model)

        if not os.path.isfile(results_file):
            raise ("[!] Data %s not found" % results_file)

        index = -1
        original_sentences = self.nl_model.internal_data_loader.original_sentence_training

        with open(results_file, 'r') as file:
            for line in file:
                sentences = json.loads(line)
                sentences.pop(0)
                for sentence in sentences:

                    index += 1

                    if sentence['sentence_id'] == sentence_id:

                        sentence_index = sentence['sentence_index']

                        tokenized_sentence = list(
                            self.parser.tokenize(
                                original_sentences[sentence_index]))
                        print("tokenized_sentence ", tokenized_sentence)

                        max_word_relevance = np.full(
                            (3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty(
                            [3, max_relevance_words_in_plot], dtype=object)

                        for attribute_dict in sentence[
                                'subsets_word_relevance_linear_regression']:

                            for i in range(3):

                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()

                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][
                                        min_index] = attribute_dict[str(i)]

                                    n_indices = len(
                                        attribute_dict['indices_attribute'])
                                    word = str(tokenized_sentence[
                                        attribute_dict['indices_attribute']
                                        [0]])

                                    for w in range(1, n_indices):
                                        word = word + ' ' + \
                                               str(tokenized_sentence[attribute_dict['indices_attribute'][w]])

                                    relevant_words[i][min_index] = word

                        intercept = sentence['intercepts_slr']

                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "OWN",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']),
                            "OWN", sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)

                        max_word_relevance = np.full(
                            (3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty(
                            [3, max_relevance_words_in_plot], dtype=object)

                        for attribute_dict in sentence[
                                'subsets_word_relevance_pred_difference']:

                            for i in range(3):

                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()

                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][
                                        min_index] = attribute_dict[str(i)]

                                    indices = attribute_dict[
                                        'indices_attribute']
                                    if type(indices) == list:

                                        n_indices = len(
                                            attribute_dict['indices_attribute']
                                        )
                                        word = str(tokenized_sentence[
                                            attribute_dict['indices_attribute']
                                            [0]])

                                        for w in range(1, n_indices):
                                            word = word + ' ' + \
                                                   str(tokenized_sentence[attribute_dict['indices_attribute'][w]])
                                    else:
                                        word = str(tokenized_sentence[indices])

                                    relevant_words[i][min_index] = word

                        intercept = np.zeros(len(sentence['prediction']))

                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "LACE",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']),
                            "LACE", sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)

                        max_word_relevance = np.full(
                            (3, max_relevance_words_in_plot), -1.)
                        relevant_words = np.empty(
                            [3, max_relevance_words_in_plot], dtype=object)

                        for attribute_dict in sentence[
                                'word_relevance_linear_regression']:

                            for i in range(3):

                                min_value = max_word_relevance[i].min()
                                min_index = max_word_relevance[i].argmin()

                                if attribute_dict[str(i)] > min_value:
                                    max_word_relevance[i][
                                        min_index] = attribute_dict[str(i)]
                                    relevant_words[i][min_index] = str(
                                        tokenized_sentence[attribute_dict[
                                            'indices_attribute'][0]])

                        intercept = sentence['intercepts_slr']

                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['prediction']), "LIME",
                            sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)
                        self.plot_final_results(
                            max_word_relevance, relevant_words,
                            np.argmax(sentence['aspect_polarity_matrix']),
                            "LIME", sentence_index, sentence_id,
                            self.nl_model.config.name_of_model, intercept)

class SentenceExplanationPlot:
    def __init__(self, neural_language_model):
        self.neural_language_model = neural_language_model
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)

    def run(self, sentence_id):

        file = self.neural_language_model.config.get_explanation_file(
            self.neural_language_model.config.name_of_model, sentence_id)

        with open(file, 'r') as file:
            for line in file:
                sentences = json.loads(line)

                for sentence in sentences:

                    lemmatized_sentence = sentence['lemmatized_sentence']
                    original_sentence = sentence['original_sentence']
                    tokenized_sentence = list(
                        self.parser.tokenize(original_sentence))
                    aspect_indices = sentence['aspects']

                    sentence_id = sentence['sentence_id']
                    sentence_index = sentence['sentence_index']

                    argmax_pred = np.argmax(sentence['prediction'])

                    lace = []
                    lime = []
                    own = []

                    relation_yes = {}

                    aspect_sentiment_positive = {}
                    aspect_sentiment_negative = {}

                    word_sentiment_positive = {}
                    word_sentiment_negative = {}

                    attention_score = {}

                    x = []

                    if self.neural_language_model.config.name_of_model == "LCR_Rot_hop_model":

                        for i in range(self.neural_language_model.config.
                                       n_iterations_hop):
                            relation_yes[i] = []
                            aspect_sentiment_positive[i] = []
                            aspect_sentiment_negative[i] = []
                            word_sentiment_positive[i] = []
                            word_sentiment_negative[i] = []
                            attention_score[i] = []
                    else:
                        relation_yes[0] = []
                        aspect_sentiment_positive[0] = []
                        aspect_sentiment_negative[0] = []
                        word_sentiment_positive[0] = []
                        word_sentiment_negative[0] = []
                        attention_score[0] = []

                    for index in range(len(lemmatized_sentence)):

                        if index in aspect_indices:
                            continue

                        original_word = tokenized_sentence[index]
                        x.append(original_word)

                        lemma = lemmatized_sentence[index]
                        word_info = sentence[lemma]

                        lime.append(word_info['relevance_linear_regression']
                                    [argmax_pred])
                        lace.append(word_info['subset_pred_dif'][argmax_pred])
                        own.append(word_info['subset_linear_reg'][argmax_pred])

                        if self.neural_language_model.config.name_of_model == "LCR_Rot_hop_model":

                            for i in range(self.neural_language_model.config.
                                           n_iterations_hop):
                                attention_score[i].append(
                                    word_info['attention_score_' + str(i)])

                                aspect_sentiment_positive[i].append(word_info[
                                    'weighted_states_pred_aspect_sentiments_' +
                                    str(i)][0])
                                aspect_sentiment_negative[i].append(word_info[
                                    'weighted_states_pred_aspect_sentiments_' +
                                    str(i)][1])

                                relation_yes[i].append(
                                    word_info['weighted_states_pred_relations_'
                                              + str(i)][0])

                                word_sentiment_positive[i].append(word_info[
                                    'weighted_states_pred_word_sentiments_' +
                                    str(i)][0])
                                word_sentiment_negative[i].append(word_info[
                                    'weighted_states_pred_word_sentiments_' +
                                    str(i)][1])
                        else:
                            attention_score[0].append(
                                word_info['attention_score'])

                            aspect_sentiment_positive[0].append(word_info[
                                'weighted_states_pred_aspect_sentiments'][0])
                            aspect_sentiment_negative[0].append(word_info[
                                'weighted_states_pred_aspect_sentiments'][1])

                            relation_yes[0].append(
                                word_info['weighted_states_pred_relations'][0])

                            word_sentiment_positive[0].append(word_info[
                                'weighted_states_pred_word_sentiments'][0])
                            word_sentiment_negative[0].append(word_info[
                                'weighted_states_pred_word_sentiments'][1])

                    sum_lace = np.sum(np.abs(lace))
                    sum_lime = np.sum(np.abs(lime))
                    sum_own = np.sum(np.abs(own))

                    average_lace = np.array(lace) / sum_lace
                    print("average_lace ", average_lace)
                    average_lime = np.array(lime) / sum_lime
                    print("average_lime ", average_lime)
                    average_own = np.array(own) / sum_own
                    print("own ", own)
                    print("average_own ", average_own)

                    for i in range(self.neural_language_model.config.
                                   n_iterations_hop):

                        self.plot(x, average_lime, average_lace, average_own,
                                  attention_score[i],
                                  aspect_sentiment_positive[i],
                                  aspect_sentiment_negative[i],
                                  word_sentiment_positive[i],
                                  word_sentiment_negative[i], relation_yes[i],
                                  sentence_id, sentence_index, i)

    def plot(self, x, lime, lace, own, attention_score,
             aspect_sentiment_positive, aspect_sentiment_negative,
             word_sentiment_positive, word_sentiment_negative, relation_yes,
             sentence_id, index_number, weight_number):

        fig, ax = plt.subplots()
        fig.set_size_inches(40.5, 14.5)
        ax.tick_params(length=15, axis='x', width=3, labelsize=58)
        ax.tick_params(length=15, axis='y', width=3, labelsize=40)

        plt.subplots_adjust(bottom=0.55)
        ax.set_ylim([-0.1, 1.1])

        ax.axhline(0, color='grey', alpha=0.50)

        index = np.array([2 + i * 1.25 for i in range(len(x))])
        print(index)
        bar_width = 0.10
        opacity = 0.8

        rects1 = plt.bar(index,
                         relation_yes,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C0',
                         label='ARC',
                         edgecolor='black')
        rects2 = plt.bar(index + bar_width,
                         aspect_sentiment_positive,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C1',
                         label='ARWSC positive',
                         edgecolor='black')
        rects3 = plt.bar(index + bar_width * 2,
                         aspect_sentiment_negative,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C2',
                         label='ARWSC negative',
                         edgecolor='black')
        rects4 = plt.bar(index + bar_width * 3,
                         word_sentiment_positive,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C3',
                         label='WSC positive',
                         edgecolor='black')
        rects5 = plt.bar(index + bar_width * 4,
                         word_sentiment_negative,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C4',
                         label='WSC negative',
                         edgecolor='black')
        rects6 = plt.bar(index + bar_width * 5,
                         attention_score,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C9',
                         label='Attention score',
                         edgecolor='black')
        rects7 = plt.bar(index + bar_width * 6,
                         lime,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C6',
                         label='A-LIME',
                         edgecolor='black')
        rects8 = plt.bar(index + bar_width * 7,
                         lace,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C7',
                         label='A-LACE',
                         edgecolor='black')
        rects9 = plt.bar(index + bar_width * 8,
                         own,
                         bar_width,
                         alpha=opacity,
                         align='center',
                         color='C8',
                         label='LETA',
                         edgecolor='black')

        # Successive plt.xticks calls overwrite one another, so only the final call takes
        # effect; a single call placing the labels within each bar group is sufficient.
        plt.xticks(index + bar_width * 4, x)

        plt.legend(loc='upper left', prop={'size': 36})

        plt.tight_layout()
        # plt.show()

        model_name = self.neural_language_model.config.name_of_model
        file = self.neural_language_model.config.get_plot_entire_sentence(
            model_name, sentence_id, index_number, weight_number)

        plt.savefig(file)
Example #8
        senid += 1
        # stopline = 1899
        # if senid != stopline:
        #     continue
        # senid = stopline
        fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
        alignment = do_alignment(fe_alignment, ef_alignment, len(ef_phrase[0]),
                                 len(fe_phrase[0]))
        # fe_phrase = fe_phrases[id]
        # ef_phrase = ef_phrases[id]
        BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0],
                                       alignment)  # fe_phrase[0] is the e-side sentence
        f_sen = ' '.join(ef_phrase[0])

        try:
            p_parse_trees = list(parser.parse(parser.tokenize(f_sen)))
        except ValueError:
            print('parsing fail')
            exception_sen.append(senid)
            p_parse_trees = [Tree.fromstring('(S (NULL ERROR))')]  # we simply give a dummy tree

        # create a dict to keep all phrase in different categories
        p_phrase_dict = {}
        for tag in phrase_tag:
            p_phrase_dict[tag] = []

        for one_tree in p_parse_trees:
            # print(one_tree)
            traverse(one_tree, p_phrase_dict, phrase_tag)
            if last_two in categories:
                next_category = last_two
            elif last_three in categories:
                next_category = last_three
        # make sure all entries have each of the categories above as keys
        for c in category_english:
            if c not in chengyu_data:
                chengyu_data[c] = ""
        translation_help = dict()
        # fields that support translation
        for field in to_translate:
            to_segment = chengyu_data[field]
            # substitute out · to help with segmentation
            to_segment = re.sub(r'·', ' ', to_segment)
            if to_segment:
                new_words = list(chengyu_segmenter.tokenize(to_segment))
            else:
                new_words = []
            # add extra fields
            translation_help[field + '_Segmentation'] = new_words

            translations, sent_dict = lookup(new_words, zh_en_simp_dict)
            chengyu_data[field + '_Translations'] = translations
            translation_help[field + '_Sentence_Code'] = sent_dict
        chengyu_english[chengyu_number] = translation_help
        chengyu_index[chengyu_number] = chengyu_data
        chengyu_number += 1

    # make necessary json files (chengyu index, translation, simplified chinese dictionary)
    corpus_from_dict(chengyu_index, chengyu_json_file)
    corpus_from_dict(chengyu_english, translation_json_file)
Example #10
from nltk.parse import CoreNLPParser

parser = CoreNLPParser('http://localhost:9001')
ner_tagger = CoreNLPParser(url='http://localhost:9001', tagtype='ner')
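# Port 9001 is assumed here to be a second CoreNLP server loaded with the Chinese models,
# started roughly as follows (an assumption, not part of the original snippet):
#   java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#       -serverProperties StanfordCoreNLP-chinese.properties -port 9001 -timeout 15000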
segs = list(
    parser.tokenize(
        u'截至1月20日24时,中国境内累计报告新型冠状病毒感染的肺炎确诊病例291例(湖北270例,北京5例,广东14例,上海2例)。'))
print(list(ner_tagger.tag(segs)))
Example #11
class ExternalDataLoader:
    def __init__(self, config):
        self.ontology_tagging = OntologyTagging()
        self.config = config
        self.word_dictionary = self.compute_all_embeddings()
        self.server_url = 'http://localhost:9000'
        self.parser = CoreNLPParser(url=self.server_url)
        self.core_nlp_dependency_parser = CoreNLPDependencyParser(
            url=self.server_url)

    def load_external_data(self, load_external_file_name,
                           write_internal_file_name):

        if not os.path.isfile(load_external_file_name):
            raise ("[!] Data %s not found" % load_external_file_name)

        xml_tree = elementTree.parse(load_external_file_name)
        root = xml_tree.getroot()

        opinion_counter = 0
        total_counter = 0

        all_sentences = []

        for sentence in root.iter('sentence'):

            sentence_id = sentence.get('id')

            original_sentence = sentence.find('text').text

            tokenized_sentence = list(self.parser.tokenize(original_sentence))

            aspects = []
            aspect_indices = []
            polarities = []
            polarity_matrix = []
            categories = []
            category_matrix = []

            for opinions in sentence.iter('Opinions'):

                for opinion in opinions.findall('Opinion'):
                    total_counter += 1
                    aspect = opinion.get('target')
                    if aspect != "NULL":

                        opinion_counter += 1

                        aspects.append(aspect)
                        category = opinion.get('category')
                        polarity = opinion.get('polarity')

                        categories.append(category)
                        polarities.append(polarity)

                        tokenized_aspect = list(self.parser.tokenize(aspect))
                        aspect_indices.append(
                            self.get_aspect_indices(tokenized_aspect,
                                                    tokenized_sentence))
                        polarity_matrix.append(
                            self.get_polarity_number(polarity))
                        category_matrix.append(
                            self.get_category_number(category))

            if len(aspects) != 0:

                print("opinion_counter ", opinion_counter)

                sentiment_distribution = self.annotate(original_sentence,
                                                       properties={
                                                           "annotators":
                                                           "sentiment",
                                                           "outputFormat":
                                                           "json",
                                                       })

                processed_sentence = self.process_characters(
                    tokenized_sentence)

                lemmatized_sentence, part_of_speech_sentence, aspect_dependencies, sentence_negation, sentiments = \
                    self.lemmatize_and_pos_tagging(processed_sentence, aspect_indices)

                ontology_classes_sentence = self.ontology_tagging.ontology_classes_tagging(
                    lemmatized_sentence)

                mentions = self.ontology_tagging.mention_tagging(
                    ontology_classes_sentence)

                ont_sentiments_sentence, aspect_sentiments_sentence, sentiments_sentence, relations_sentence = \
                    self.ontology_tagging.polarity_and_aspect_relation_tagging(ontology_classes_sentence,
                                                                               aspect_indices, categories,
                                                                               aspect_dependencies, sentiments)

                word_embedding_sentence = self.compute_word_embeddings(
                    lemmatized_sentence)

                dict_sentence = {
                    'sentence_id': sentence_id,
                    'original_sentence': original_sentence,
                    'lemmatized_sentence': lemmatized_sentence,
                    'sentiment_distribution': sentiment_distribution,
                    'part_of_speech_tags': part_of_speech_sentence,
                    'negation_in_sentence': sentence_negation,
                    'word_polarities': ont_sentiments_sentence,
                    'aspect_sentiments': aspect_sentiments_sentence,
                    'word_sentiments': sentiments_sentence,
                    'word_mentions': mentions,
                    'aspect_relations': relations_sentence,
                    'aspects': aspects,
                    'aspect_indices': aspect_indices,
                    'polarities': polarities,
                    'polarity_matrix': polarity_matrix,
                    'categories': categories,
                    'category_matrix': category_matrix,
                    'word_embeddings': word_embedding_sentence
                }
                all_sentences.append(dict_sentence)

        with open(write_internal_file_name, 'w') as outfile:
            json.dump(all_sentences, outfile, ensure_ascii=False)

    def get_polarity_number(self, polarity):

        if polarity == "positive":
            return [1, 0, 0]
        elif polarity == "neutral":
            return [0, 1, 0]
        elif polarity == "negative":
            return [0, 0, 1]
        else:
            raise Exception("Polarity ", polarity, " is not in the sentence.")

    def get_category_number(self, category):

        if category == "AMBIENCE#GENERAL":
            return [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#PRICES":
            return [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#QUALITY":
            return [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "DRINKS#STYLE_OPTIONS":
            return [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#GENERAL":
            return [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#PRICES":
            return [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#QUALITY":
            return [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
        elif category == "FOOD#STYLE_OPTIONS":
            return [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
        elif category == "LOCATION#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
        elif category == "RESTAURANT#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
        elif category == "RESTAURANT#MISCELLANEOUS":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
        elif category == "RESTAURANT#PRICES":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
        elif category == "SERVICE#GENERAL":
            return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        else:
            raise Exception("Category ", category, " is not in the sentence.")

    @staticmethod
    def get_aspect_indices(aspect, sentence):

        number_words_in_aspect = len(aspect)
        number_words_in_sentence = len(sentence)

        for i in range(number_words_in_sentence):

            if aspect[0] == sentence[i]:
                return list(range(i, i + number_words_in_aspect))

        raise Exception("Aspect ", aspect, " is not in the sentence ",
                        sentence)

    def compute_all_embeddings(self):

        word_dictionary = {}

        with open(self.config.glove_embeddings, 'r', encoding="utf8") as f:
            for line in f:
                word_embedding = line.strip().split()
                word_dictionary[word_embedding[0]] = list(
                    map(float, word_embedding[1:]))

        return word_dictionary

    def compute_word_embeddings(self, sentence):

        number_words_in_sentence = len(sentence)
        word_embeddings = np.random.normal(0, 0.05,
                                           [number_words_in_sentence, 300])

        for word_index in range(number_words_in_sentence):

            if sentence[word_index] in self.word_dictionary:
                word_embeddings[word_index] = self.word_dictionary[
                    sentence[word_index]]

        return word_embeddings.tolist()

    @staticmethod
    def process_characters(sentence):

        number_words_in_sentence = len(sentence)
        processed_sentence = []

        punctuation_and_numbers = [
            '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '*', '$', '&',
            '%', '@', '#', '^', '!', '0', '1', '2', '3', '4', '5', '6', '7',
            '8', '9'
        ]
        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ''
        ]
        punctuation_to_be_replaced = {'–': '-', '’': '\''}

        for word_index in range(number_words_in_sentence):

            list_of_word = list(sentence[word_index].lower())

            for char_index in range(len(list_of_word) - 1):

                if list_of_word[char_index] in punctuation_to_be_replaced:
                    list_of_word[char_index] = punctuation_to_be_replaced[
                        list_of_word[char_index]]

                if list_of_word[char_index] in alphabet and list_of_word[
                        char_index + 1] in punctuation_and_numbers:
                    list_of_word[char_index + 1] = ''
                elif list_of_word[
                        char_index] in punctuation_and_numbers and list_of_word[
                            char_index + 1] in alphabet:
                    list_of_word[char_index] = ''

            word = "".join(list_of_word)
            if word == '.' and sentence[word_index - 1] == '.':
                pass
            else:
                if word in ('..', '....', '.....', '......', '.......'):
                    word = '...'
                processed_sentence.append(word)
        return processed_sentence

    def lemmatize_and_pos_tagging(self, sentence, aspect_indices):

        punctuations = [
            '–', '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', '’', '*',
            '$', '&', '%', '@', '#', '^', '!', '\'', '-'
        ]

        parses = self.core_nlp_dependency_parser.parse(sentence)
        dependencies = [[(governor, dep, dependent)
                         for governor, dep, dependent in parse.triples()]
                        for parse in parses][0]

        wordnet_lemmatizer = nltk.WordNetLemmatizer()
        part_of_speech_sentence = list(range(len(sentence)))
        lemmatized_sentence = list(range(len(sentence)))
        sentiments = list(range(len(sentence)))
        aspects_dependencies = [['no'] * len(sentence)
                                for i in range(len(aspect_indices))]

        backup_sentence = sentence.copy()
        interesting_translates = {
            '-LRB-': '(',
            '-RRB-': ')',
            '2\xa01/2': '2 1/2',
            "''": '"',
            ':-RRB-': ':)'
        }

        sentence_negations = []

        for dependency in dependencies:

            words = [dependency[0][0], dependency[2][0]]
            part_of_speech = [dependency[0][1], dependency[2][1]]

            if words[0] in interesting_translates:
                words[0] = interesting_translates[words[0]]
            if words[1] in interesting_translates:
                words[1] = interesting_translates[words[1]]

            range_list = [0, 1]
            if words[0] in sentence:
                index_of_word1 = sentence.index(words[0])
                sentence[index_of_word1] = ''
            else:
                index_of_word1 = backup_sentence.index(words[0])
                range_list = [1]

            if words[1] in sentence:
                index_of_word2 = sentence.index(words[1])
                sentence[index_of_word2] = ''
            else:
                index_of_word2 = backup_sentence.index(words[1])
                range_list = [0]

            word_indices = [index_of_word1, index_of_word2]

            if dependency[1] == 'neg':
                sentence_negations.append(word_indices)

            for aspect_index in range(len(aspect_indices)):

                if index_of_word1 in aspect_indices[aspect_index] and index_of_word2 not in \
                        aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][
                        index_of_word2] = dependency[1]
                elif index_of_word1 not in aspect_indices[aspect_index] and index_of_word2 in \
                        aspect_indices[aspect_index]:
                    aspects_dependencies[aspect_index][
                        index_of_word1] = dependency[1]
                elif index_of_word1 in aspect_indices[
                        aspect_index] and index_of_word2 in aspect_indices[
                            aspect_index]:
                    if aspects_dependencies[aspect_index][
                            index_of_word1] == 'no':
                        aspects_dependencies[aspect_index][
                            index_of_word1] = dependency[1]
                    else:
                        aspects_dependencies[aspect_index][
                            index_of_word2] = dependency[1]

            for i in range_list:

                if part_of_speech[i].startswith('V'):  # Verb
                    part_of_speech_sentence[word_indices[i]] = [1, 0, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.VERB)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.VERB)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('J'):  # Adjective
                    part_of_speech_sentence[word_indices[i]] = [0, 1, 0, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADJ)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADJ)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('R'):  # Adverb
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 1, 0, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.ADV)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.ADV)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                elif part_of_speech[i].startswith('N'):  # Noun
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 1, 0]
                    word = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(word, wordnet.NOUN)
                    sentiments[word_indices[i]] = self.get_sentiment_of_word(
                        word, lemma, wordnet.NOUN)
                    lemmatized_sentence[word_indices[i]] = lemma.lower()
                else:  # Otherwise
                    part_of_speech_sentence[word_indices[i]] = [0, 0, 0, 0, 1]
                    if words[i] not in punctuations:
                        words[i] = spell(words[i])
                    lemma = wordnet_lemmatizer.lemmatize(words[i])
                    sentiments[word_indices[i]] = [0, 0, 1]
                    lemmatized_sentence[word_indices[i]] = lemma.lower()

        return lemmatized_sentence, part_of_speech_sentence, aspects_dependencies, sentence_negations, sentiments

    @staticmethod
    def get_sentiment_of_word(word, lemma, pos):

        synsets = wordnet.synsets(word, pos=pos)

        if len(synsets) != 0:

            memorized_synset_01 = None
            check_boolean_01 = False

            memorized_synset_rest = None
            check_boolean_rest = False

            list_of_numbers = [
                '04', '02', '03', '05', '06', '07', '08', '09', '10', '11',
                '12'
            ]

            for synset in synsets:
                synset_split = synset.name().split(".")
                if synset_split[0] == lemma:
                    swn_synset = sentiwordnet.senti_synset(synset.name())
                    pos_score = swn_synset.pos_score()
                    neg_score = swn_synset.neg_score()

                    if pos_score > neg_score:
                        return [1, 0, 0]
                    elif neg_score > pos_score:
                        return [0, 1, 0]
                    else:
                        return [0, 0, 1]
                if synset_split[2] == '01' and not check_boolean_01:
                    memorized_synset_01 = synset
                    check_boolean_01 = True
                elif synset_split[
                        2] in list_of_numbers and not check_boolean_rest:
                    memorized_synset_rest = synset
                    check_boolean_rest = True
            if check_boolean_01:
                synset = memorized_synset_01
            else:
                synset = memorized_synset_rest

            swn_synset = sentiwordnet.senti_synset(synset.name())
            pos_score = swn_synset.pos_score()
            neg_score = swn_synset.neg_score()

            if pos_score > neg_score:
                return [1, 0, 0]
            elif neg_score > pos_score:
                return [0, 1, 0]
            else:
                return [0, 0, 1]
        return [0, 0, 1]

    def annotate(self, text, properties=None):
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)

        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception(
                'Check whether you have started the CoreNLP server e.g.\n'
                '$ cd stanford-corenlp-full-2018-02-27/ \n'
                '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer'
            )
        data = text.encode()
        r = requests.post(self.server_url,
                          params={'properties': str(properties)},
                          data=data,
                          headers={'Connection': 'close'})
        output = r.text

        char_index1 = output.index("sentimentDistribution")
        char_index2 = output.index("sentimentTree")
        distribution = output[(char_index1 - 1):(char_index2 - 2)]

        new_distribution = []
        word = []

        for char_index in range(len(distribution)):

            if distribution[char_index].isnumeric():
                word.append(distribution[char_index])
            elif distribution[char_index] == ',' and len(word) == 1:
                word.append('.')
            elif (distribution[char_index] == ','
                  or distribution[char_index] == ']') and len(word) != 1:
                number = float("".join(word))
                new_distribution.append(number)
                word = []

        return new_distribution
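    # A sketch of an alternative (assuming the server returns well-formed JSON): the same
    # values could be read with the json module instead of string slicing, e.g.
    #     parsed = json.loads(r.text)
    #     new_distribution = parsed['sentences'][0]['sentimentDistribution']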
Example #12
            entry_value = entry[0]
            entry_type = entry[1]

            if entry_type == 'LOCATION':
                entities.append(entry_value)
    return entities


currentDT = datetime.datetime.now()
print(str(currentDT))

count = 0
passed = 0
for i, city in enumerate(cities['City'].unique()):
    try:
        city_ = list(parser.tokenize(city))
        classified_paragraphs_list = ner_tagger.tag_sents([city_])
        formatted_result = formatted_entities(classified_paragraphs_list)
        if len(formatted_result) > 0:
            count += 1
    except Exception as e:
        passed += 1
        print(i, city, 'error:', e)
        pass
    if i % 100 == 0:
        print(i, count, passed, city, city_, 'result:',
              ' '.join(formatted_result))
print(f'Stanford knows {count} out of {cities.City.unique().shape[0]}')
print("couldn't process:", passed)

currentDT = datetime.datetime.now()
Example #13
print("\nRaw string")
# Parse raw string.
print(list(parser.raw_parse(sentence)))
# [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

# Neural Dependency Parser
print("\nNeural Dependency Parser")
from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse(sentence.split())
# [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
# [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print("\nTokenizer")
print(list(parser.tokenize(sentence)))
# ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']

# POS Tagger
print("\nPOS Tagger")
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(list(pos_tagger.tag(sentence.split())))
# [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]

# NER Tagger
print("\nNER Tagger")
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
print(list(ner_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split())))
# [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]