示例#1
0
    def __stanford_openie(self, input, output, verbose=False):
        """Extract relation triples from the text in ``input`` using the
        Stanford OpenIE annotator.

        Reads the whole input file, annotates it with CoreNLP, appends one
        serialized ``Triple`` per line to ``output``, and returns the
        ``output`` path.

        :param input: path of the text file to process
        :param output: path of the file triples are appended to
        :param verbose: when True, echo progress and each triple to stdout
        :return: the ``output`` path
        """
        # ``with`` closes the file automatically; the explicit close() the
        # original carried inside the with-block was redundant.
        with open(input, 'r') as input_file:
            contents = input_file.read()

        if verbose:
            print('Searching for triples using Stanford OpenIE ...')

        nlp = CoreNLPWrapper()
        annotated = nlp.annotate(
            contents,
            properties={
                'annotators':
                'tokenize, ssplit, pos, ner, depparse, parse, openie'
            })

        # Open the output file once, instead of re-opening it in append
        # mode for every single triple as the original did.
        with open(output, 'a') as output_file:
            for sentence in annotated['sentences']:
                for openie in sentence['openie']:
                    triple = Triple(sentence['index'], openie['subject'],
                                    openie['relation'], openie['object'])
                    if verbose:
                        print(triple.to_string())
                    output_file.write(triple.to_string() + '\n')

        return output
示例#2
0
    def __clausie(self, input, output, verbose=False):
        """Extract relation triples from the text in ``input`` using ClausIE.

        Tokenizes the input with CoreNLP, writes one ``<index>\\t<sentence>``
        line per sentence to a temporary ClausIE input file, runs ClausIE,
        then rewrites ClausIE's output in ``Triple`` serialization.

        :param input: path of the text file to process
        :param output: path handed to ClausIE for its raw output
        :param verbose: when True, echo progress and each triple to stdout
        :return: path of the post-processed ClausIE output file
        """
        with open(input, 'r') as input_file:
            contents = input_file.read()

        if verbose:
            print('Searching for triples using ClausIE ...')

        input_clausie = os.path.splitext(input)[0] + '_clausie_input.txt'

        print('Preparing contents to be processed by ClausIE at {}'.format(
            input_clausie))

        nlp = CoreNLPWrapper()
        annotated = nlp.annotate(
            contents, properties={'annotators': 'tokenize, ssplit, pos'})

        # Write the ClausIE input file once in 'w' mode (truncates any old
        # file) instead of truncating separately and re-opening per sentence.
        with open(input_clausie, 'w') as clausie_file:
            for sentence in annotated['sentences']:
                sent_str = ''
                for token in sentence['tokens']:
                    if token['pos'] == 'POS':
                        # Attach possessive markers ("'s") directly to the
                        # previous token. BUG FIX: the original called
                        # ``sent_str.strip()`` and discarded the result
                        # (strings are immutable), making this a no-op.
                        sent_str = sent_str.strip()

                    sent_str += token['word'] + ' '

                clausie_file.write(
                    str(sentence['index']) + '\t' + sent_str.strip() + '\n')

        clausie_out = ClausIEWrapper.run_clausie(input_clausie, output,
                                                 verbose)

        os.remove(input_clausie)

        # Post-process ClausIE's raw tab-separated output into Triple lines.
        # Collect lines in a list and join-write once: avoids the original's
        # quadratic string concatenation.
        final_lines = []
        with open(clausie_out, 'r') as clausie_out_file:
            for raw_line in clausie_out_file:
                fields = raw_line.replace('\"', '').split('\t')
                triple = Triple(fields[0].strip(),
                                NLPUtils.adjust_tokens(fields[1].strip()),
                                fields[2].strip(),
                                NLPUtils.adjust_tokens(fields[3].strip()))
                if verbose:
                    print(triple.to_string())

                final_lines.append(triple.to_string() + '\n')

        with open(clausie_out, 'w') as final_file:
            final_file.writelines(final_lines)

        # BUG FIX: the original returned the closed file object
        # (``final_file``); return the output path instead, consistent with
        # the other extraction methods in this class.
        return clausie_out
示例#3
0
    def __senna(self, input_filename, output_filename, verbose=False):
        """Extract (agent, predicate, patient) triples via SENNA SRL.

        For every predicate SENNA labels in each sentence, heuristically
        choose an agent and a patient among the labeled arguments (preferring
        A0/A1, then A2, then any remaining argument) and emit one ``Triple``
        per predicate.

        :param input_filename: path of a file with one sentence per line
        :param output_filename: path the serialized triples are written to
        :param verbose: when True, echo progress and each triple to stdout
        :return: ``output_filename``
        """
        if verbose:
            print('Performing Sentence Role Labeling with SENNA...')

        senna = SennaWrapper()

        out_contents = ''
        with open(input_filename, 'r') as input_file:
            sentence_number = 0
            for line in input_file:
                # BUG FIX: the original tested ``len(line) < 1``, which is
                # never true because each line keeps its trailing newline;
                # blank lines therefore reached SENNA. Skip them properly.
                if not line.strip():
                    continue

                senna_output = senna.srl(NLPUtils.adjust_tokens(line),
                                         verbose=False)
                for predicate in senna_output.keys():
                    dict_contents = senna_output[predicate]
                    agent = None
                    patient = None

                    if 'A0' in dict_contents and 'A1' in dict_contents:
                        agent = dict_contents['A0']
                        patient = dict_contents['A1']

                    elif 'A0' in dict_contents:  # No A1
                        agent = dict_contents['A0']
                        if 'A2' in dict_contents:
                            patient = dict_contents['A2']
                        else:
                            # Fall back to any non-A0 argument (last one in
                            # iteration order wins, matching the A1 branch).
                            for key in dict_contents.keys():
                                if not key == 'A0':
                                    # BUG FIX: the original assigned the
                                    # whole dict (``dict_contents``) instead
                                    # of the argument value.
                                    patient = dict_contents[key]

                    elif 'A1' in dict_contents:  # No A0
                        patient = dict_contents['A1']
                        if 'A2' in dict_contents:
                            agent = dict_contents['A2']
                        else:
                            for key in dict_contents.keys():
                                if not key == 'A1':
                                    agent = dict_contents[key]

                    else:  # Neither A0 nor A1
                        if 'A2' in dict_contents:
                            agent = dict_contents['A2']
                            for key in dict_contents.keys():
                                if not key == 'A2':
                                    patient = dict_contents[key]
                        else:  # Very unlikely
                            # BUG FIX: dict views have no ``.sort()`` in
                            # Python 3 (the original would raise
                            # AttributeError); sort into a list instead.
                            key_lst = sorted(dict_contents.keys(), key=len)
                            agent = dict_contents[key_lst[0]]
                            patient = dict_contents[key_lst[1]]

                    if agent is None or patient is None:
                        print('-Warning: No agent or patient determined for predicate {}'.format(predicate))
                        print('-- agent: {}'.format(agent))
                        print('-- patient: {}'.format(patient))
                        continue

                    triple = Triple(sentence_number, agent, predicate, patient)

                    if verbose:
                        print(triple.to_string())

                    out_contents += triple.to_string() + '\n'

                sentence_number += 1

        # ``with`` handles closing; the explicit close() calls the original
        # carried inside the with-blocks were redundant.
        with open(output_filename, 'w') as output_file:
            output_file.write(out_contents)

        return output_filename
示例#4
0
    def __senna(self, input_filename, output_filename, verbose=False):
        """Extract triples via dependency parsing plus SENNA role labeling.

        For each input line (one sentence per line) this runs three passes:
        1. merge modifier-like dependencies into compound terms, emitting
           ``rdfs:subClassOf`` triples;
        2. turn connective dependencies (``nmod:*``, ``acl``, ``appos``)
           into pairs of ``local:`` triples;
        3. run SENNA SRL and pair numbered arguments (A0..An) with VerbNet
           role names, emitting ``vn.role:`` / ``local:AM-*`` triples.
        All triples are buffered and written to ``output_filename`` at the
        end (overwriting it); returns ``output_filename``.
        """
        if verbose:
            print('Performing Sentence Role Labeling with SENNA...')

        senna = SennaWrapper()

        out_contents = ''  # accumulated serialized triples; written once at the end
        with open(input_filename, 'r') as input_file:
            sentence_number = 0
            for line in input_file.readlines():
                # NOTE(review): readlines() keeps the trailing newline, so
                # len(line) < 1 is never true — blank lines are NOT actually
                # skipped here; confirm whether that is intended.
                if len(line) < 1:
                    continue

                # Each ``elem`` below is indexed elem[0], elem[1] (the
                # relation label), elem[2] — presumably (governor word,
                # relation, dependent word); TODO confirm against
                # NLPUtils.dependency_parse.
                dependency_list = NLPUtils.dependency_parse(
                    line,
                    deps_key='enhancedPlusPlusDependencies',
                    verbose=verbose)

                previous_term = ''
                previous_compound = ''
                # Maps a bare head word to the most specific multi-word term
                # built for it so far (extended by both passes below).
                dict_basic_to_most_specific = {}
                connective_dependencies = []
                # Pass 1: fold modifier-like relations into compound terms.
                while len(dependency_list) > 0:
                    elem = dependency_list.pop()

                    # Structural relations are not term-building; skip them.
                    if elem[1] in ['ROOT', 'punct', 'det'
                                   ] or 'subj' in elem[1] or 'obj' in elem[1]:
                        continue

                    if elem[1] in ['compound', 'nmod:poss', 'aux', 'neg'
                                   ] or elem[1].endswith('mod'):
                        # Same head as last time: keep extending the running
                        # compound; otherwise start a new one for this head.
                        if previous_term == elem[0]:
                            updated_term = '{} {}'.format(
                                elem[2], previous_compound)
                        else:
                            updated_term = '{} {}'.format(elem[2], elem[0])
                            previous_compound = elem[0]
                        dict_basic_to_most_specific[elem[0]] = updated_term

                        # The more specific term is declared a subclass of
                        # the less specific one.
                        triple = Triple(sentence_number, updated_term,
                                        'rdfs:subClassOf', previous_compound)

                        previous_compound = updated_term
                        previous_term = elem[0]

                        if verbose:
                            print(triple.to_string())

                        out_contents += triple.to_string() + '\n'

                    elif elem[1] in ['acl', 'appos'
                                     ] or elem[1].startswith('nmod:')::
                        # Defer connective-like relations to pass 2, after
                        # all compounds are known.
                        connective_dependencies.append(elem)

                # Pass 2: connectives (e.g. "X of Y") become a combined term
                # plus two local: triples linking it back to each side.
                while len(connective_dependencies) > 0:
                    elem = connective_dependencies.pop()

                    if elem[1] == 'nmod:poss':
                        continue

                    if elem[1].find(':') > 0:  # e.g. 'nmod:of'
                        connector = elem[1][elem[1].find(':') + 1:]
                    elif elem[1] in ['acl', 'appos']:
                        connector = ''
                    else:
                        connector = elem[1]

                    # Substitute each side with its most specific known term.
                    first = elem[0]
                    if first in dict_basic_to_most_specific.keys():
                        first = dict_basic_to_most_specific[first]

                    second = elem[2]
                    if second in dict_basic_to_most_specific.keys():
                        second = dict_basic_to_most_specific[second]

                    if connector == '':
                        full = '{} {}'.format(first, second)
                    else:
                        full = '{} {} {}'.format(first, connector, second)

                    # full --local:<connector>_<second>--> first
                    triple = Triple(
                        sentence_number,
                        full, 'local:{}_{}'.format(connector,
                                                   second.replace(' ',
                                                                  '')), first)
                    if verbose:
                        print(triple.to_string())
                    out_contents += triple.to_string() + '\n'

                    # full --local:<first>_<connector>--> second
                    triple = Triple(
                        sentence_number, full,
                        'local:{}_{}'.format(first.replace(' ', ''),
                                             connector), second)
                    if verbose:
                        print(triple.to_string())
                    out_contents += triple.to_string() + '\n'

                    dict_basic_to_most_specific[elem[0]] = full

                # Pass 3: SENNA semantic role labeling.
                senna_output = senna.srl(line, verbose=False)
                for predicate in senna_output.keys():
                    pred_args = senna_output[predicate]
                    # NOTE(review): verbose is hard-coded to True here,
                    # unlike everywhere else in this method — confirm
                    # whether that is intentional.
                    pred_arg_names = NLPUtils.get_verbnet_args(predicate,
                                                               verbose=True)
                    if len(pred_arg_names) < 1:
                        print(
                            'WARNING -- Unable to retrieve predicate arg names for "{}"'
                            .format(predicate))

                    if verbose:
                        print('predicate: {}, args: {}'.format(
                            predicate, pred_args))

                    # AM-* modifier arguments: negation (AM-NEG) and modals
                    # (AM-MOD) are folded into the predicate string itself;
                    # other AM-* become their own local: triples.
                    # NOTE(review): because ``predicate`` is rewritten inside
                    # this loop, triples emitted for later arguments depend
                    # on dict iteration order — confirm this is acceptable.
                    for pred_arg in pred_args:
                        if 'AM-NEG' == pred_arg:
                            predicate = 'not {}'.format(predicate)
                        elif 'AM-MOD' == pred_arg:
                            predicate = ' '.join(
                                [pred_args['AM-MOD'].strip(), predicate])
                        elif pred_arg.startswith('AM-'):
                            # Remove initial stopwords (e.g. determiners)
                            s = pred_args[pred_arg].strip()
                            split = s.split(' ', 1)
                            if NLPUtils.is_stopword(
                                    split[0]) and len(split) > 1:
                                s = s.split(' ', 1)[1]

                            triple = Triple(sentence_number, predicate,
                                            'local:{}'.format(pred_arg), s)
                            if verbose:
                                print(triple.to_string())

                            out_contents += triple.to_string() + '\n'

                    # Numbered arguments A0..An are paired positionally with
                    # the VerbNet role names retrieved above.
                    for i in range(len(pred_arg_names)):
                        pred_args_index = 'A{}'.format(i)
                        if pred_args_index in pred_args:
                            # Remove initial stopwords (e.g. determiners)
                            s = pred_args[pred_args_index].strip()
                            split = s.split(' ', 1)
                            if NLPUtils.is_stopword(
                                    split[0]) and len(split) > 1:
                                s = s.split(' ', 1)[1]

                            triple = Triple(
                                sentence_number, predicate,
                                'vn.role:{}'.format(pred_arg_names[i]), s)
                            if verbose:
                                print(triple.to_string())

                            out_contents += triple.to_string() + '\n'

                sentence_number += 1

            # Redundant: the with-block closes the file; left as-is.
            input_file.close()

        with open(output_filename, 'w') as output_file:
            output_file.write(out_contents)
            # Redundant: the with-block closes the file; left as-is.
            output_file.close()

        return output_filename