def parseSentenceStructure(data):
    # Tokenize sentence
    tokens = nltk.word_tokenize(data)
    # Tag sentence
    tagged = nltk.pos_tag(tokens)
    # Parser (see https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566)
    parser = CoreNLPParser(url='http://localhost:9000')
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    # Parse with Stanford CoreNLP
    tree = parser.raw_parse(data)
    # print(list(tree))
    # list(tree)[0].pretty_print()
    # Provide N-V-N relationships with all N combinations:
    # traverse the tree looking for NP roots
    tree_recurse_find(list(tree)[0])
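# tree_recurse_find is referenced above but not defined in this snippet.
# A minimal, assumed sketch (not the original helper) of such a traversal,
# collecting the NP subtrees of an nltk.Tree:
def tree_recurse_find(node, found=None):
    if found is None:
        found = []
    if not hasattr(node, 'label'):  # reached a leaf token
        return found
    if node.label() == 'NP':
        found.append(' '.join(node.leaves()))
    for child in node:
        tree_recurse_find(child, found)
    return found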
def dependency_parse(raw_data):
    from nltk.parse.corenlp import CoreNLPServer

    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()

    from nltk.parse import CoreNLPParser
    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words

    server.stop()
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
def parse(text): parser = CoreNLPParser("http://localhost:9000") result = parser.raw_parse(text.lower()) trees = [tree for tree in result] for tree in trees: tree.chomsky_normal_form() tree.collapse_unary(collapseRoot=True, collapsePOS=True) trees = [ParentedTree.convert(tree) for tree in trees] return trees
def createGrammar(self, userMessages, ctx):
    parser = CoreNLPParser(url='http://localhost:9000')
    parse_trees = []
    for message in userMessages:
        tokenized = nltk.sent_tokenize(message)
        for sentence in tokenized:
            parse_trees.append(list(parser.raw_parse(sentence))[0])
    grammar_rules = set()
    for tree in parse_trees:
        for production in tree.productions():
            grammar_rules.add(production)
    start = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(start, grammar_rules)
    return ' '.join(self.generate_sentence(grammar))
def get_stanford_nps(line):
    """Get noun phrases using the Stanford parser."""
    noun_phrase_list = []
    parser = CoreNLPParser(url="http://localhost:9000")
    tag_list = list(parser.raw_parse(line))
    for item in tag_list:
        for subtree in item.subtrees():
            if subtree.label() == "NP":
                noun_phrase = " ".join(subtree.leaves())
                noun_phrase_list.append(noun_phrase)
    return noun_phrase_list
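# Usage sketch (not part of the original snippet); assumes a CoreNLP server
# on localhost:9000. Exact phrases depend on the parser's output.
print(get_stanford_nps("The old house at the end of the street needs a new roof."))
# e.g. ['The old house at the end of the street', 'the end of the street', ...]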
def text_2_triple_list(text, strength):
    nlp = spacy.load("en")
    neuralcoref.add_to_pipe(nlp)

    api = CoreNLPParser(url='http://39.98.186.125:9000')
    api.parser_annotator = "tokenize,ssplit,coref,openie"
    parser = CoreNLPParser(url='http://39.98.186.125:9000')

    text = clean_text(text)
    text = remove_adjective_possessive_pronoun(text)
    doc = nlp(text)
    text = doc._.coref_resolved

    entities = []
    entities_labels = []
    for e in doc.ents:
        if e.label_ in supported_entity_types:
            entities.append(e.text)
            entities_labels.append(e.label_)

    json_text = api.api_call(text)
    openie_sentences = ssplit_article_into_sentences(json_text, step=-1)
    syntax_sentences = ssplit_article_into_sentences(json_text, step=1)

    triples = []
    for sentence in openie_sentences:
        json_sentence = api.api_call(sentence)
        triples += extract_triples_by_openie(json_sentence)

    syntax_triples = []
    for sentence in syntax_sentences:
        syntax_tree = list(parser.raw_parse(sentence))[0]
        cur_syntax_triples = []
        parse_tree_v2(syntax_tree, cur_syntax_triples)
        syntax_triples += cur_syntax_triples

    triples = filter_triples_by_entities(triples, entities, strength)
    triples = beautify_triples(triples)
    triples = remove_meaningless_triples(triples)
    triples = check_triples_by_pos(triples)
    triples = remove_duplicate_triples(triples)

    syntax_triples = beautify_triples(syntax_triples)
    syntax_triples = remove_meaningless_triples(syntax_triples)
    syntax_triples = check_triples_by_pos(syntax_triples)
    syntax_triples = remove_duplicate_triples(syntax_triples)

    triples = normalize_entities(triples, syntax_triples)
    triples = remove_duplicate_triples(triples)

    return generate_structured_triples(triples, entities, entities_labels)
def parseSentenceStructure(data):
    # Tokenize sentence
    tokens = nltk.word_tokenize(data)
    # Tag sentence
    tagged = nltk.pos_tag(tokens)
    # Parser (see https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566)
    parser = CoreNLPParser(url='http://localhost:9000')
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    # Parse with Stanford CoreNLP
    tree = parser.raw_parse(data)
    # print(list(tree))
    list(tree)[0].pretty_print()
def chunk_with_sp(sentence):
    verb_phrase_list = []
    try:
        parser = CoreNLPParser(url='http://localhost:9000')
        parsed_tree = next(parser.raw_parse(sentence))
        # parsed_tree.pretty_print()
        # VP_tree = list(parsed_tree.subtrees(filter=lambda x: x.label() == 'VP'))
        parsed_phrase_tree = Utils.ExtractPhrases(parsed_tree, 'VP')
        for vp in parsed_phrase_tree:
            verb_phrase_list.append(" ".join(vp.leaves()))
        # print("\nVerb phrases:")
        # print("Verb Phrase List : ", verb_phrase_list)
        # print("--------\n")
    except Exception as e:
        print(str(e))
    return verb_phrase_list
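# Utils.ExtractPhrases is not shown above; a plausible minimal implementation
# (assumed, not the original) that collects all subtrees carrying a given label:
def ExtractPhrases(tree, label):
    return [subtree for subtree in tree.subtrees() if subtree.label() == label]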
def generate_syntactically_similar_sentences_remove(dataset):
    """Generate syntactically similar sentences for each sentence in the dataset,
    for PaInv-Remove.

    Returns a dictionary mapping each original sentence to its list of generated sentences.
    """
    # Run CoreNLPParser on localhost
    eng_parser = CoreNLPParser('http://localhost:9000')
    # Use nltk treebank tokenizer and detokenizer
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()
    # Stopwords from nltk
    stopWords = list(set(stopwords.words('english')))
    # Load dataset
    file = open(dataset, "r")
    dic = {}
    for line in file:
        sent = line.split("\n")[0]
        source_tree = eng_parser.raw_parse(sent)
        dic[line] = []
        for x in source_tree:
            phrases = get_all(x, detokenizer, stopWords)
            for t in phrases:
                if t == "'s":
                    continue
                for y in range(20):
                    try:
                        new_sent = replacenth(sent, t, "", y + 1).replace("  ", " ")
                        dic[line].append(new_sent)
                    except:
                        break
    return dic
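# replacenth is referenced above but not defined in this snippet. An assumed
# sketch (hypothetical helper) that replaces the n-th occurrence of `sub` in
# `s` and raises ValueError when there is no n-th match, which matches the
# try/except break pattern used by the caller:
def replacenth(s, sub, repl, n):
    start = -1
    for _ in range(n):
        start = s.index(sub, start + 1)  # raises ValueError if not found
    return s[:start] + repl + s[start + len(sub):]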
def geography_category(question):
    parser = CoreNLPParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    quest = question
    l3 = list(parser.raw_parse(quest))
    # pretty print parse tree
    Tree.fromstring(str(l3[0])).pretty_print()
    print("********************")
    selectQ = ""
    fromQ = []
    tempfromQ = []
    tempwhereQ = []
    whereQ = []
    query_variables = []
    city_list = []
    country_list = []
    location_list = []
    fromQ.append(" from")
    whereQ.append(" where")
    ptree = ParentedTree.fromstring(str(l3[0]))
    rules = ptree.productions()
    print(rules)
    for i in rules:
        l1 = "Rule: " + str(i)
        print(l1)
    l4 = list(ner_tagger.tag(quest.split()))
    print(l4)
    for word, category in l4:
        if category == 'CITY':
            city_list.append(word)
        if category == 'COUNTRY':
            country_list.append(word)
        if category == 'LOCATION':
            location_list.append(word)
    print("City List:")
    print(city_list)
    print("Country List:")
    print(country_list)
    print("Location List:")
    print(location_list)
    print("********************************")
    print(type(rules))
    str_rules = str(rules)
    # *********************************************
    if 'ROOT -> SBARQ' in str_rules:
        # Wh questions
        print("Wh Questions")
        selectQ = ""
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'WP ->' in str(i):
                print("WP")
                x1 = str(i).split('WP ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'what' and city_list == [] and country_list != []:
                    selectQ = "select CI.name"
                    fromQ.append(" CITIES CI")
                    fromQ.append(" INNER JOIN Capitals CA on CI.Id = CA.CityId")
            if 'WRB ->' in str(i):
                print("WRB")
                x1 = str(i).split('WRB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'where' and city_list != []:
                    for city in city_list:
                        selectQ = "select CY.name"
                        fromQ.append(" Countries CY")
                        fromQ.append(" INNER JOIN Capitals CA on CY.Id = CA.CountryId")
                        fromQ.append(" INNER JOIN Cities CI on CA.CityId = CI.Id")
                        whereQ.append(" CI.name like '%" + city + "%'")
                if x1.lower() == 'where' and country_list != []:
                    for country in country_list:
                        selectQ = "select CO.continent"
                        fromQ.append(" Continents CO")
                        fromQ.append(" INNER JOIN CountryContinents CC on CO.Id = CC.ContinentId")
                        fromQ.append(" INNER JOIN Countries CY on CC.CountryId = CY.Id")
                        whereQ.append(" CY.name like '%" + country + "%'")
            if 'NN ->' in str(i):
                print("NN")
                x1 = str(i).split('NN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "capital":
                    if country_list != []:
                        for country in country_list:
                            fromQ.append(" INNER JOIN Countries CY on CA.CountryId = CY.Id")
                            whereQ.append(" CY.name like '%" + country + "%'")
    elif 'ROOT -> S' in str_rules:
        # Yes/No questions
        print("Yes/No Questions")
        selectQ = "select count(*)"
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'NN ->' in str(i):
                print("NN")
                x1 = str(i).split('NN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "capital":
                    if city_list != []:
                        for city in city_list:
                            fromQ.append(" CITIES CI")
                            fromQ.append(" INNER JOIN Capitals CA on CI.Id = CA.CityId")
                            whereQ.append(" CI.name like '%" + city + "%'")
                    if country_list != []:
                        for country in country_list:
                            fromQ.append(" INNER JOIN Countries CY on CA.CountryId = CY.Id")
                            whereQ.append(" CY.name like '%" + country + "%'")
            if 'IN ->' in str(i):
                print("IN")
                x1 = str(i).split('IN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "in":
                    if country_list != [] and location_list != []:
                        for country in country_list:
                            fromQ.append(" Countries CY")
                            fromQ.append(" INNER JOIN CountryContinents CC on CY.Id = CC.CountryId")
                            whereQ.append(" CY.name like '%" + country + "%'")
                        for continent in location_list:
                            fromQ.append(" INNER JOIN Continents CO on CC.ContinentId = CO.Id")
                            whereQ.append(" CO.continent like '%" + continent + "%'")
                    if city_list != [] and country_list != []:
                        for city in city_list:
                            fromQ.append(" CITIES CI")
                            fromQ.append(" INNER JOIN Capitals CA on CI.Id = CA.cityId")
                            whereQ.append(" CI.name like '%" + city + "%'")
                        for country in country_list:
                            fromQ.append(" INNER JOIN Countries CY on CA.CountryId = CY.Id")
                            whereQ.append(" CY.name like '%" + country + "%'")
    print("Query variables: " + str(query_variables))
    print("Select statement: " + str(selectQ))
    print("From statement: " + str(fromQ))
    print("Where statement: " + str(whereQ))
    print(str(selectQ) + str(fromQ) + str(whereQ))
    print("***************************************************")
    for f in fromQ:
        if 'from' in str(f):
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" not in f and "from" not in f:
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" in f:
            tempfromQ.append(str(f))
        '''elif " from" in tempfromQ[len(tempfromQ)-1] and f != " from":
            tempfromQ.append(str(f))'''
    print("From statement: " + str(tempfromQ))
    for w in whereQ:
        if 'where' in str(w):
            tempwhereQ.append(w)
    for w in whereQ:
        if "like" in tempwhereQ[len(tempwhereQ) - 1]:
            tempwhereQ.append(" and")
            tempwhereQ.append(str(w))
        elif " where" in tempwhereQ[len(tempwhereQ) - 1] and w != " where":
            tempwhereQ.append(str(w))
    print("Where statement: " + str(tempwhereQ))
    print("*************************************************")
    # building query
    from_statement = ""
    where_statement = ""
    final_query = ""
    for each_from in tempfromQ:
        from_statement = from_statement + str(each_from)
    for each_where in tempwhereQ:
        where_statement = where_statement + str(each_where)
    final_query = final_query + str(selectQ) + str(from_statement) + str(where_statement)
    print(final_query)
    return final_query
sentence1 = "Patient with HGM value greater than 55 g/L" sentence = "When did princes Diana die?" # Lexical Parser parser = CoreNLPParser(url='http://localhost:9000') # Parse tokenized text. print("\nParse tokenized text") # print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))) print(list(parser.parse(sentence.split()))) # [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])] print("\nRaw string") # Parse raw string. print(list(parser.raw_parse(sentence))) # [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])] # Neural Dependency Parser print("\nNeural Dependency Parser") from nltk.parse.corenlp import CoreNLPDependencyParser dep_parser = CoreNLPDependencyParser(url='http://localhost:9000') parses = dep_parser.parse(sentence.split()) # [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses] # [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]] # Tokenizer parser = CoreNLPParser(url='http://localhost:9000') print("\nTokenizer") print(list(parser.tokenize(sentence))) # ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']
        if i.lower() in question.lower():
            category = "Music"
    return category


with open(input_filepath, "r") as f1:
    for line in f1:
        question = line.strip()
        if question != "":
            count = count + 1
            c = str(count)
            l1 = "Question " + c + ": " + question
            l2 = "Parse Tree:"
            l3 = list(parser.raw_parse(question))
            tree1 = parser.raw_parse(question)
            print(l1)
            print(l2)
            print(l3)
            f2.write(l1)
            f2.write("\n")
            category = movies_music(question)
            if category == "":
                final_category = NLP_Part1_Category.categorize(question)
            else:
                final_category = category
            f2.write("Category: " + final_category)
class FeatureExtractor:
    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # corenlp client
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' + str(port))
        # w2v
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            'data/saved_models/GoogleNews-vectors-negative300.bin', binary=True)
        print('w2v model loaded')
        # training corpus for one hot features
        corpus_dict = pickle.load(open(corpus_dict_path, 'rb'))
        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(corpus_dict['dep_tuple'])
        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(corpus_dict['unigram'])
        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(corpus_dict['bigram'])
        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(corpus_dict['trigram'])
        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(corpus_dict['lexical'])

    def _get_case_features(self, sent_annotations, sentence):
        num_all_caps = 0
        for word_annotations in sent_annotations:
            if word_annotations.token.isupper():
                num_all_caps += 1
        if sentence.islower():
            is_sent_lower = 1
        else:
            is_sent_lower = 0
        if sent_annotations[0].token.isupper():
            is_first_word_caps = 1
        else:
            is_first_word_caps = 0
        return [num_all_caps, is_sent_lower, is_first_word_caps]

    def _get_dependency_tuples(self, sent_annotations):
        # (gov, typ, dep) (gov, typ) (typ, dep) (gov, dep)
        dependency_tuple_dict = defaultdict(int)
        for word_annotations in sent_annotations:
            gov = sent_annotations[int(word_annotations.head) - 1].pos
            typ = word_annotations.depRel
            dep = word_annotations.pos
            gov_typ_dep = '_'.join([gov, typ, dep])
            dependency_tuple_dict[gov_typ_dep] = 1
            gov_typ = '_'.join([gov, typ])
            dependency_tuple_dict[gov_typ] = 1
            typ_dep = '_'.join([typ, dep])
            dependency_tuple_dict[typ_dep] = 1
            gov_dep = '_'.join([gov, dep])
            dependency_tuple_dict[gov_dep] = 1
        return dependency_tuple_dict

    def _get_entity_features(self, sent_annotations):
        ner_tags = [0] * len(NER_TAGSET)
        person_mentions_total_len = 0
        for word_annotations in sent_annotations:
            if word_annotations.ner == 'O':
                continue
            if word_annotations.ner not in NER_TAGSET:
                continue
            else:
                index = NER_TAGSET.index(word_annotations.ner)
                ner_tags[index] = 1
            if word_annotations.ner == 'PERSON':
                person_mentions_total_len += len(word_annotations.token)
        person_mentions_avg_len = person_mentions_total_len * 1.0 / len(sent_annotations)
        return ner_tags + [person_mentions_avg_len]

    def _get_lexical_features(self, words):
        num_contractions = 0
        total_word_len = 0
        for word in words:
            if '\'' in word:
                num_contractions += 1
            total_word_len += len(word)
        avg_num_contractions = num_contractions * 1.0 / len(words)
        avg_word_len = total_word_len * 1.0 / len(words)
        # TODO: avg word-log frequency acc to Google Ngram
        # TODO: avg formality score using Pavlick & Nenkova (2015)
        return [avg_num_contractions, avg_word_len]

    def _get_ngrams(self, sent_annotations):
        # tokens = [w.token for w in sent_annotations]
        tokens = [w.lemma for w in sent_annotations]
        sentence = ' '.join(tokens)  # .decode('utf-8', 'ignore')
        blob = TextBlob(sentence)
        unigrams = tokens
        bigrams = blob.ngrams(n=2)
        trigrams = blob.ngrams(n=3)
        unigram_dict = defaultdict(int)
        bigram_dict = defaultdict(int)
        trigram_dict = defaultdict(int)
        for unigram in unigrams:
            unigram_dict[unigram] = 1
        for bigram in bigrams:
            bigram_dict['_'.join(bigram)] = 1
        for trigram in trigrams:
            trigram_dict['_'.join(trigram)] = 1
        return unigram_dict, bigram_dict, trigram_dict

    def _get_parse_features(self, stanford_parse_tree, sent_annotations):
        sent_len = len(sent_annotations)
        avg_depth = stanford_parse_tree.height() * 1.0 / sent_len
        lexical_production_dict = defaultdict(int)
        for production in stanford_parse_tree.productions():
            if production.is_lexical():
                continue
            lexical_production_dict[production] += 1
        avg_depth_feature = [avg_depth]
        return avg_depth_feature, lexical_production_dict

    def _get_POS_features(self, sent_annotations):
        pos_tag_ct = [0] * len(POS_TAGSET)
        for word_annotations in sent_annotations:
            try:
                pos_tag_ct[POS_TAGSET.index(word_annotations.pos)] += 1
            except:
                # print word_annotations.pos
                continue
        for i in range(len(pos_tag_ct)):
            pos_tag_ct[i] = pos_tag_ct[i] * 1.0 / len(sent_annotations)
        return pos_tag_ct

    def _get_punctuation_features(self, sentence):
        num_question_marks = sentence.count('?')
        num_ellipses = sentence.count('...')
        num_exclamations = sentence.count('!')
        return [num_question_marks, num_ellipses, num_exclamations]

    def _get_readability_features(self, sentence, words):
        num_words = len(words)
        num_chars = len(sentence) - sentence.count(' ')
        return [num_words, num_chars]

    def _get_subjectivity_features(self, sent_annotations, sentence):
        subjectivity_features = []
        fp_pros = 0
        tp_pros = 0
        for word_annotations in sent_annotations:
            if word_annotations.lemma in FP_PRO_LIST:
                fp_pros += 1
            if word_annotations.lemma in TP_PRO_LIST:
                tp_pros += 1
        subjectivity_features.append(fp_pros * 1.0 / len(sent_annotations))
        subjectivity_features.append(tp_pros * 1.0 / len(sent_annotations))
        polarity, subjectivity = TextBlob(sentence).sentiment
        subjectivity_features.append(float(np.sign(polarity)))
        subjectivity_features.append(subjectivity)
        return subjectivity_features

    def _get_word2vec_features(self, sent_annotations):
        word_vectors = []
        for word_annotations in sent_annotations:
            try:
                word_vector = self.word2vec_model[word_annotations.lemma]
                word_vectors.append(word_vector)
            except:
                # print word_annotations.token
                continue
        if len(word_vectors) == 0:
            avg_word_vectors = np.zeros(300)
        else:
            avg_word_vectors = np.transpose(np.mean(word_vectors, axis=0))
        return avg_word_vectors

    def _remove_less_frequent(self, dict, reference_dict, freq_cutoff):
        new_dict = defaultdict(int)
        for item, count in dict.iteritems():
            if reference_dict[item] > freq_cutoff:
                new_dict[item] = count
        return new_dict

    def extract_features_pt16(self, sentence, sent_annotations, parse_tree):
        words = sentence.split()
        feature_set = []
        # case features
        case_features = self._get_case_features(sent_annotations, sentence)
        feature_set += case_features
        # dependency features
        dependency_tuple_dict = self._get_dependency_tuples(sent_annotations)
        # entity features
        entity_features = self._get_entity_features(sent_annotations)
        feature_set += entity_features
        # lexical features
        lexical_features = self._get_lexical_features(words)
        feature_set += lexical_features
        # ngram features
        unigram_dict, bigram_dict, trigram_dict = self._get_ngrams(sent_annotations)
        # parse features
        avg_depth_feature, lexical_production_dict = self._get_parse_features(
            parse_tree, sent_annotations)
        feature_set += avg_depth_feature
        # POS features
        pos_features = self._get_POS_features(sent_annotations)
        feature_set += pos_features
        # punctuation features
        punctuation_features = self._get_punctuation_features(sentence)
        feature_set += punctuation_features
        # readability features
        readability_features = self._get_readability_features(sentence, words)
        feature_set += readability_features
        # subjectivity features
        # subjectivity_features = self._get_subjectivity_features(sent_annotations, sentence)
        # feature_set += subjectivity_features
        # word2vec features
        word2vec_features = self._get_word2vec_features(sent_annotations)
        feature_set = np.concatenate((feature_set, word2vec_features), axis=0)
        # get one hot features
        dependency_tuple_feature = self.dep_tuple_vectorizer.transform(dependency_tuple_dict)
        unigram_feature = self.unigram_vectorizer.transform(unigram_dict)
        bigram_feature = self.bigram_vectorizer.transform(bigram_dict)
        trigram_feature = self.trigram_vectorizer.transform(trigram_dict)
        lexical_production_feature = self.lexical_vectorizer.transform(lexical_production_dict)
        feature_vectors = np.array([feature_set])
        feature_vectors = np.concatenate(
            (feature_vectors, dependency_tuple_feature, unigram_feature,
             bigram_feature, trigram_feature, lexical_production_feature),
            axis=1)
        return feature_vectors

    def _transform_raw(self, sentence):
        sent_annotations = []
        for dependency in sentence['basicDependencies']:
            dep_idx = dependency['dependent']
            token = sentence['tokens'][dep_idx - 1]
            annotation = StanfordAnnotations(token['word'], token['lemma'],
                                             token['pos'], token['ner'],
                                             dependency['governor'],
                                             dependency['dep'])
            sent_annotations.append(annotation)
        return sent_annotations

    def extract_parse(self, s):
        """Easy, built-in parser from nltk."""
        tree_list = self.parser.raw_parse(s, outputFormat='penn')
        tree = next(tree_list)
        return tree

    def extract_annotations(self, s):
        """Needs some arm wrestling."""
        props = {'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'}
        raw_json = self.dep_parser.api_call(s, properties=props)
        sentence = raw_json['sentences'][0]
        return self._transform_raw(sentence)
class SVO(object):
    """Class methods to extract Subject-Verb-Object tuples from a sentence."""

    def __init__(self):
        """Initialize the SVO methods."""
        self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
        self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        self.adjective_types = ["JJ", "JJR", "JJS"]
        self.pred_verb_phrase_siblings = None
        jar = r'D:\data'
        model = r'your_path/stanford-postagger-full-2016-10-31/models/english-left3words-distsim.tagger'
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    def get_attributes(self, node, parent_node, parent_node_siblings):
        """Returns the attributes for a node."""

    def get_subject(self, sub_tree):
        """Returns the subject and all attributes for a subject; sub_tree is a noun phrase.

        :param sub_tree:
        :return:
        """
        sub_nodes = sub_tree.subtrees()
        sub_nodes = [each for each in sub_nodes if each.pos()]
        subject = None
        for each in sub_nodes:
            if each.label() in self.noun_types:
                subject = each.leaves()
                break
        return {'subject': subject}

    def get_object(self, sub_tree):
        """Returns an object with all attributes of an object."""
        siblings = self.pred_verb_phrase_siblings
        Object = None
        for each_tree in sub_tree:
            if each_tree.label() in ["NP", "PP"]:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]
                for each in sub_nodes:
                    if each.label() in self.noun_types:
                        Object = each.leaves()
                        break
                break
            else:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]
                for each in sub_nodes:
                    if each.label() in self.adjective_types:
                        Object = each.leaves()
                        break
                # Get first noun in the tree
        self.pred_verb_phrase_siblings = None
        return {'object': Object}

    def get_predicate(self, sub_tree):
        """Returns the verb along with its attributes; also returns a verb phrase.

        :param sub_tree:
        :return:
        """
        sub_nodes = sub_tree.subtrees()
        sub_nodes = [each for each in sub_nodes if each.pos()]
        predicate = None
        sub_tree = ParentedTree.convert(sub_tree)
        for each in sub_nodes:
            if each.label() in self.verb_types:
                sub_tree = each
                predicate = each.leaves()
        # get all predicate_verb_phrase_siblings to be able to get the object
        sub_tree = ParentedTree.convert(sub_tree)
        if predicate:
            pred_verb_phrase_siblings = self.tree_root.subtrees()
            pred_verb_phrase_siblings = [
                each for each in pred_verb_phrase_siblings
                if each.label() in ["NP", "PP", "ADJP", "ADVP"]
            ]
            self.pred_verb_phrase_siblings = pred_verb_phrase_siblings
        return {'predicate': predicate}

    def process_parse_tree(self, parse_tree):
        """Returns the Subject-Verb-Object representation of a parse tree.

        Can vary depending on the number of 'sub-sentences' in a parse tree.
        """
        self.tree_root = parse_tree
        # Step 1 - Extract all the parse trees that start with 'S'
        output_list = []
        output_dict = {}
        for idx, subtree in enumerate(parse_tree[0].subtrees()):
            subject = None
            predicate = None
            Object = None
            if subtree.label() in ["S", "SQ", "SBAR", "SBARQ", "SINV", "FRAG"]:
                children_list = subtree
                children_values = [each_child.label() for each_child in children_list]
                children_dict = dict(zip(children_values, children_list))
                # Extract Subject, Verb-Phrase, Objects from sentence sub-trees
                if children_dict.get("NP") is not None:
                    subject = self.get_subject(children_dict["NP"])
                if children_dict.get("VP") is not None:
                    # Extract Verb and Object
                    # i += 1
                    # if i == 1:
                    #     pdb.set_trace()
                    predicate = self.get_predicate(children_dict["VP"])
                    Object = self.get_object(children_dict["VP"])
                try:
                    if subject['subject'] and predicate['predicate'] and Object['object']:
                        output_dict['subject'] = subject['subject']
                        output_dict['predicate'] = predicate['predicate']
                        output_dict['object'] = Object['object']
                        output_list.append(output_dict)
                except Exception as e:
                    print(e)
                    continue
        return output_list

    def traverse(self, t):
        try:
            t.label()
        except AttributeError:
            print(t)
        else:
            print('(', t.label())
            for child in t:
                self.traverse(child)
            print(')')

    def sentence_split(self, text):
        """Returns the sentences of a sample."""
        sentences = self.sent_detector.tokenize(text)
        return sentences

    def get_parse_tree(self, sentence):
        """Returns the parse tree of a sample."""
        parse_tree = self.parser.raw_parse(sentence)
        return parse_tree

    def List_To_Tree(self, lst):
        if not isinstance(lst, basestring):
            if len(lst) == 2 and isinstance(lst[0], str) and isinstance(lst[1], str):
                lst = Tree(str(lst[0]).split('+')[0], [str(lst[1])])
            elif isinstance(lst[0], str) and not isinstance(lst[1], str):
                lst = Tree(str(lst[0]).split('+')[0],
                           map(self.List_To_Tree, lst[1:len(lst)]))
        return lst
class RDF_Triple():

    class RDF_SOP():
        def __init__(self, name, pos=''):
            self.name = name
            self.word = ''
            self.parent = ''
            self.grandparent = ''
            self.depth = ''
            self.predicate_list = []
            self.predicate_sibings = []
            self.pos = pos
            self.attr = []
            self.attr_trees = []

    def __init__(self, sentence):
        self.sentence = sentence
        self.clear_data()

    def clear_data(self):
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.first_NP = ''
        self.first_VP = ''
        self.parse_tree = None
        self.subject = RDF_Triple.RDF_SOP('subject')
        self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
        self.Object = RDF_Triple.RDF_SOP('object')

    def find_NP(self, t):
        try:
            t.label()
        except AttributeError:
            pass
        else:
            # Now we know that t.node is defined
            if t.label() == 'NP':
                if self.first_NP == '':
                    # print(t)
                    self.first_NP = t
            elif t.label() == 'VP':
                if self.first_VP == '':
                    self.first_VP = t
            for child in t:
                self.find_NP(child)

    # Customize this function: if self.subject.word limits to 1 entity
    def find_subject(self, t, parent=None, grandparent=None):
        try:
            t.label()
        except AttributeError:
            return
        else:
            # Now we know that t.node is defined
            if t.label()[:2] == 'NN':
                if len(self.subject.word) == 0:
                    self.subject.word += t.leaves()[0]
                else:
                    self.subject.word += ' ' + t.leaves()[0]
                self.subject.pos = t.label()
                self.subject.parent = parent
                self.subject.grandparent = grandparent
            else:
                for child in t:
                    self.find_subject(child, parent=t, grandparent=parent)

    def find_predicate(self, t, parent=None, grandparent=None, depth=0):
        try:
            t.label()
        except AttributeError:
            pass
        else:
            if t.label()[:2] == 'VB':
                self.predicate.predicate_list.append((t.leaves()[0], depth, parent, grandparent))
            for child in t:
                self.find_predicate(child, parent=t, grandparent=parent, depth=depth + 1)

    def find_deepest_predicate(self):
        if not self.predicate.predicate_list:
            return '', '', '', ''
        return max(self.predicate.predicate_list, key=operator.itemgetter(1))

    def extract_word_and_pos(self, t, depth=0, words=[]):
        try:
            t.label()
        except AttributeError:
            pass
        else:
            # Now we know that t.node is defined
            if t.height() == 2:
                words.append((t.leaves()[0], t.label()))
            for child in t:
                self.extract_word_and_pos(child, depth + 1, words)
        return words

    def print_tree(self, t, depth=0):
        try:
            t.label()
        except AttributeError:
            print(t)
        else:
            # Now we know that t.node is defined
            print('(')
            for child in t:
                self.print_tree(child, depth + 1)
            print(') ')

    def find_object(self):
        for t in self.predicate.parent:
            if self.Object.word == '':
                self.find_object_NP_PP(t, t.label(), self.predicate.parent,
                                       self.predicate.grandparent)

    def find_object_NP_PP(self, t, phrase_type, parent=None, grandparent=None):
        '''finds the object given it's an NP or PP or ADJP'''
        if self.Object.word != '':
            return
        try:
            t.label()
        except AttributeError:
            pass
        else:
            # Now we know that t.node is defined
            if t.label()[:2] == 'NN' and phrase_type in ['NP', 'PP']:
                if self.Object.word == '':
                    self.Object.word = t.leaves()[0]
                    self.Object.pos = t.label()
                    self.Object.parent = parent
                    self.Object.grandparent = grandparent
            elif t.label()[:2] == 'JJ' and phrase_type == 'ADJP':
                if self.Object.word == '':
                    self.Object.word = t.leaves()[0]
                    self.Object.pos = t.label()
                    self.Object.parent = parent
                    self.Object.grandparent = grandparent
            else:
                for child in t:
                    self.find_object_NP_PP(child, phrase_type, parent=t, grandparent=parent)

    def get_attributes(self, pos, sibling_tree, grandparent):
        rdf_type_attr = []
        if pos[:2] == 'JJ':
            for item in sibling_tree:
                if item.label()[:2] == 'RB':
                    rdf_type_attr.append((item.leaves()[0], item.label()))
        else:
            if pos[:2] == 'NN':
                for item in sibling_tree:
                    if item.label()[:2] in ['DT', 'PR', 'PO', 'JJ', 'CD']:
                        rdf_type_attr.append((item.leaves()[0], item.label()))
                    if item.label() in ['QP', 'NP']:
                        # append a tree
                        rdf_type_attr.append((item, item.label()))
            elif pos[:2] == 'VB':
                for item in sibling_tree:
                    if item.label()[:2] == 'AD':
                        rdf_type_attr.append((item, item.label()))
        if grandparent:
            if pos[:2] in ['NN', 'JJ']:
                for uncle in grandparent:
                    if uncle.label() == 'PP':
                        rdf_type_attr.append((uncle, uncle.label()))
            elif pos[:2] == 'VB':
                for uncle in grandparent:
                    if uncle.label()[:2] == 'VB':
                        rdf_type_attr.append((uncle, uncle.label()))
        return self.attr_to_words(rdf_type_attr)

    def attr_to_words(self, attr):
        new_attr_words = []
        new_attr_trees = []
        for tup in attr:
            if type(tup[0]) != str:
                if tup[0].height() == 2:
                    new_attr_words.append((tup[0].leaves()[0], tup[0].label()))
                else:
                    # new_attr_words.extend(self.extract_word_and_pos(tup[0]))
                    new_attr_trees.append(tup[0].unicode_repr())
            else:
                new_attr_words.append(tup)
        return new_attr_words, new_attr_trees

    def jsonify_rdf(self):
        return {
            'sentence': self.sentence,
            'parse_tree': self.parse_tree,  # .unicode_repr(),
            'predicate': {'word': self.predicate.word, 'POS': self.predicate.pos,
                          'Word Attributes': self.predicate.attr,
                          'Tree Attributes': self.predicate.attr_trees},
            'subject': {'word': self.subject.word, 'POS': self.subject.pos,
                        'Word Attributes': self.subject.attr,
                        'Tree Attributes': self.subject.attr_trees},
            'object': {'word': self.Object.word, 'POS': self.Object.pos,
                       'Word Attributes': self.Object.attr,
                       'Tree Attributes': self.Object.attr_trees},
            'rdf': [self.subject.word, self.predicate.word, self.Object.word]
        }

    def main(self):
        self.clear_data()
        self.parse_tree = self.parser.raw_parse(self.sentence)
        for line in self.parse_tree:
            self.parse_tree = line
        self.find_NP(self.parse_tree)
        self.find_subject(self.first_NP)
        self.find_predicate(self.first_VP)
        if self.subject.word == '' and self.first_NP != '':
            self.subject.word = self.first_NP.leaves()
        self.predicate.word, self.predicate.depth, self.predicate.parent, self.predicate.grandparent = self.find_deepest_predicate()
        self.find_object()
        self.subject.attr, self.subject.attr_trees = self.get_attributes(
            self.subject.pos, self.subject.parent, self.subject.grandparent)
        self.predicate.attr, self.predicate.attr_trees = self.get_attributes(
            self.predicate.pos, self.predicate.parent, self.predicate.grandparent)
        self.Object.attr, self.Object.attr_trees = self.get_attributes(
            self.Object.pos, self.Object.parent, self.Object.grandparent)
        self.answer = self.jsonify_rdf()
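# Usage sketch (not part of the original snippet); assumes a CoreNLP server
# on localhost:9000. The extracted triple depends on the parser's output.
rdf = RDF_Triple("The quick brown fox jumped over the lazy dog.")
rdf.main()
print(rdf.answer['rdf'])  # e.g. ['fox', 'jumped', 'dog']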
class CoreNLPSentenceAnalyzer():
    """A sentence analyzer based on Stanford CoreNLP.

    References:
        The CoreNLP Syntax Parser
        https://bbengfort.github.io/snippets/2018/06/22/corenlp-nltk-parses.html
        Penn Treebank II Tags
        https://gist.github.com/nlothian/9240750
    """

    def __init__(self):
        self.lab_set = set()

    def init_server(self):
        STANFORD = os.path.join("stanford-corenlp-full-2018-10-05")
        self.server = CoreNLPServer(
            os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
            os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"))
        self.server.start()
        self.parser = CoreNLPParser()

    def stop_server(self):
        self.server.stop()

    def parse_syntax(self, sent):
        return next(self.parser.raw_parse(sent))

    def _collect_labels(self, node):
        """Collect labels in the given node recursively.

        This method should not be invoked directly but via collect_labels.
        """
        try:
            self.lab_result.append(node.label())
        except AttributeError:
            return
        for nn in node:
            self._collect_labels(nn)
        return

    def collect_labels(self, node):
        """Collect all labels in a tree starting from the given node."""
        self.lab_result = []  # used to collect labels in the recursion
        self._collect_labels(node)
        lab_counter = Counter(self.lab_result)
        # Keep the tags we have seen so far
        self.lab_set = self.lab_set.union(lab_counter.keys())
        return lab_counter

    def get_lab_series(self, lab_counter_list):
        """Convert and merge all lab_counters in the given list (the result of
        "collect_labels") into a series, using the tags which have been seen so
        far (self.lab_set).
        """
        rt = pd.DataFrame(columns=self.lab_set)
        for lab_counter in lab_counter_list:
            rt = rt.append(pd.Series(lab_counter, index=self.lab_set), ignore_index=True)
        rt = rt.add_prefix('penn_')
        return rt.sum()
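# Usage sketch (not part of the original snippet); assumes the CoreNLP 3.9.2
# jars sit in ./stanford-corenlp-full-2018-10-05 as expected by init_server().
analyzer = CoreNLPSentenceAnalyzer()
analyzer.init_server()
tree = analyzer.parse_syntax("The cat sat on the mat.")
counts = analyzer.collect_labels(tree)
print(analyzer.get_lab_series([counts]))
analyzer.stop_server()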
# TODO: make this test print messages saying whether the output is correct or not.
from nltk.parse import CoreNLPParser

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')

# Parse tokenized text.
print(list(parser.parse('What is the airspeed of an unladen swallow ?'.split())))
print("\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n")

# Parse raw string.
print(list(parser.raw_parse('What is the airspeed of an unladen swallow ?')))
print("\nExpected: [Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]\n")

# Neural Dependency Parser
from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print("\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n")
class GapSelection:
    def __init__(self, port):
        self.port = port
        self.parser = CoreNLPParser('http://localhost:' + str(self.port))

    def _parse(self, sentence):
        """Parse a sentence into a syntactic tree.

        - Args:
            sentence(str): string of current sentence
        - Returns:
            parsed_sentence(list): list of Tree objects (syntactic trees)
        """
        parsed_sentence = list(self.parser.raw_parse(sentence))
        return parsed_sentence

    def get_score(self, gap, tree):
        entities = list(map(lambda x: list(x.subtrees()), tree))
        score = 0
        score_dict = {'DT': -1, 'CD': 1}
        for e in entities:
            for t in e:
                if t.label() in score_dict.keys():
                    score += score_dict[t.label()]
        score += len(gap)
        return score

    def _extract_gaps(self, sentence, tree):
        """Extract nouns, NPs, and ADJPs from the tree object.

        - Args:
            sentence(str): current sentence
            tree(list): list of Tree objects corresponding to the sentence
        - Returns:
            candidates(list of dict): candidate questions generated by this sentence,
                e.g. [{'question': 'the capital city of NL is _____', 'gap': 'Amsterdam'}]
        """
        candidates = []
        candidate = {}
        entities = ['NP', 'ADJP']
        entities = list(
            map(lambda x: list(x.subtrees(filter=lambda x: x.label() in entities)), tree))[0]
        parsed_sentence = list(map(lambda x: list(x.subtrees()), tree))[0]
        parsed_sentence = ' '.join(parsed_sentence[0].leaves())
        tmp_entities = []
        for entity in entities:
            if len(entity.leaves()) > 5:
                continue
            gap = str(' '.join(entity.leaves()))
            tmp_entities.append(gap)
        final_entities = []
        flag = False
        for ent in tmp_entities:
            for sent in final_entities:
                if sent.find(ent) >= 0:
                    flag = True
            if not flag:
                final_entities.append(ent)
            flag = False
        for entity in entities:
            gap = str(' '.join(entity.leaves()))
            if gap not in final_entities:
                continue
            score = self.get_score(gap, entity)
            candidate_gap = gap
            sentence_copy = sentence
            # replace candidate_gap in the sentence with _____
            sentence_copy = sentence_copy.replace(candidate_gap, '_____')
            candidate['Sentence'] = sentence
            candidate['Question'] = sentence_copy
            candidate['Answer'] = candidate_gap
            candidate['Score'] = score
            if candidate_gap.strip() != sentence.strip():
                candidates.append(candidate)
            candidate = {}
        if len(candidates) == 0:
            return False
        else:
            return sorted(candidates, key=lambda x: x['Score'], reverse=True)

    def get_candidates(self, sentences):
        """Main function: prepare sentences, parse each sentence, extract gaps.

        - Args:
            sentences(dict): topically important sentences
        - Returns:
            candidates(list of dict): list of dictionaries,
                e.g. [{'Sentence': ..., 'Question': ..., 'Answer': ...}, ...]
        """
        candidates = []
        for sentence_id, sentence in sentences.items():
            tree = self._parse(sentence)
            for t in tree:
                print(t)
            # build candidate questions
            current_sentence_candidates = self._extract_gaps(sentence, tree)
            if current_sentence_candidates == False:
                continue
            candidates = candidates + current_sentence_candidates
            print("building candidate question/answer pairs %d" % len(candidates))
            # clear current_sentence_candidates
            current_sentence_candidates = []
        return candidates
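# Usage sketch (not part of the original snippet); assumes a CoreNLP server
# on port 9000. The generated gaps depend on the parser's NP/ADJP spans.
gap_selector = GapSelection(port=9000)
pairs = gap_selector.get_candidates({0: "Amsterdam is the capital city of the Netherlands."})
for pair in pairs:
    print(pair['Question'], '->', pair['Answer'])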
        if len(item) < 1:
            i = i + 1
            sub_trees.append("")
        else:
            sub_trees[i] = sub_trees[i] + " " + item
    i = 0
    for item in sub_trees:
        sub_trees[i] = ' '.join(item.split())
        i = i + 1
    sub_trees = [t for t in sub_trees if t != '']
    return sub_trees


target = "I know this has already been answered, but I wanted to share a potentially better looking way to call Popen via the use of from x import x and functions."
sub_tree = getSentenceRelations("S", target)

root = parser.raw_parse(target)
tree_string = list(root)[0]
tree_string = str(tree_string).replace("\n", "")
tree_string = ' '.join(tree_string.split())
root = Tree.fromstring(tree_string)
list(root)[0].pretty_print()

for item in sub_tree:
    tree = Tree.fromstring(item)
    list(tree)[0].pretty_print()
    trees = ne_chunk(sent)
    for tree in trees:
        if hasattr(tree, 'label'):
            if tree.label() in labels:
                entities.append(' '.join([child[0].lower() for child in tree]))
    return entities


# To run this you have to connect to the CoreNLP API:
# go to the directory stanford-corenlp-full-2018-02-27 and run the command
# below in a terminal as one line:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &
from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))      # for a sentence-tokenized doc
list(parser.raw_parse(doc))  # for non-tokenized docs

# on a tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
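# The ne_chunk fragment above starts mid-function. An assumed, self-contained
# wrapper it could belong to (the function name, `labels` default, and the
# POS-tagging step are hypothetical, not from the original snippet):
from nltk import ne_chunk, pos_tag, word_tokenize

def extract_entities(text, labels=('PERSON', 'ORGANIZATION', 'GPE')):
    entities = []
    sent = pos_tag(word_tokenize(text))  # ne_chunk expects POS-tagged tokens
    trees = ne_chunk(sent)
    for tree in trees:
        if hasattr(tree, 'label') and tree.label() in labels:
            entities.append(' '.join(child[0].lower() for child in tree))
    return entities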
def music_category(question):
    parser = CoreNLPParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    quest = question
    l3 = list(parser.raw_parse(quest))
    # pretty print parse tree
    Tree.fromstring(str(l3[0])).pretty_print()
    print("********************")
    selectQ = ""
    fromQ = []
    tempfromQ = []
    tempwhereQ = []
    whereQ = []
    query_variables = []
    person_list = []
    country_list = []
    date_list = []
    nationality_list = []
    title_list = []
    state_list = []
    album_names = list()
    track_names = list()
    album_name = ""
    track_name = ""
    fromQ.append(" from")
    whereQ.append(" where")
    ptree = ParentedTree.fromstring(str(l3[0]))
    rules = ptree.productions()
    print(rules)
    for i in rules:
        l1 = "Rule: " + str(i)
        print(l1)
    l4 = list(ner_tagger.tag(quest.split()))
    print(l4)
    for word, category in l4:
        if category == 'PERSON':
            person_list.append(word)
        if category == 'COUNTRY':
            country_list.append(word)
        if category == 'DATE':
            date_list.append(word)
        if category == 'NATIONALITY':
            nationality_list.append(word)
        if category == 'TITLE':
            title_list.append(word)
        if category == 'STATE_OR_PROVINCE':
            state_list.append(word)
    print("Person List:")
    print(person_list)
    print("Country List:")
    print(country_list)
    print("Date List:")
    print(date_list)
    print("Nationality List:")
    print(nationality_list)
    print("Title List:")
    print(title_list)
    print("State List:")
    print(state_list)
    albnames = QDB.get_album_names()
    for an in albnames:
        album_names.append(" %s" % an)
    for i in album_names:
        if i.lower() in quest.lower():
            album_name = i
            print("Album Name: " + album_name)
    album_name = album_name.strip()
    trknames = QDB.get_track_names()
    for tn in trknames:
        track_names.append(" %s" % tn)
    for i in track_names:
        if i.lower() in quest.lower():
            track_name = i
            print("Track Name: " + track_name)
    track_name = track_name.strip()
    print("********************************")
    print(type(rules))
    str_rules = str(rules)
    # *********************************************
    if 'ROOT -> SBARQ' in str_rules:
        # Wh questions
        print("Wh Questions")
        selectQ = ""
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'WDT ->' in str(i):
                print("WDT")
                x1 = str(i).split('WDT ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'which' and "artist" in title_list:
                    selectQ = "select A.name"
                    fromQ.append(" Artist A")
                if x1.lower() == 'which' and "album" in quest:
                    selectQ = "select AL.name"
                    fromQ.append(" Album AL")
            if 'WRB ->' in str(i):
                print("WRB")
                x1 = str(i).split('WRB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'where':
                    selectQ = "select A.placeOfBith"
                if x1.lower() == 'when':
                    selectQ = "select A.dateOfBirth"
            if 'VBZ ->' in str(i):
                print("VBZ")
                x1 = str(i).split('VBZ ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "sings":
                    fromQ.append(" INNER JOIN Album AL ON A.id = AL.artsitID")
                    fromQ.append(" INNER JOIN Track T ON AL.albumID = T.albumID")
                    whereQ.append(" T.name like '%" + track_name + "%'")
            if 'IN ->' in str(i):
                print("IN")
                x1 = str(i).split('IN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "by" and person_list != []:
                    fromQ.append(" INNER JOIN Artist A ON AL.artsitID = A.id")
                    for p in person_list:
                        whereQ.append(" A.name like '%" + p + "%'")
            if 'VBN ->' in str(i):
                print("VBN")
                x1 = str(i).split('VBN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "released":
                    for d in date_list:
                        whereQ.append(" AL.releaseDate like '%" + d + "%'")
                if x1 == "born":
                    fromQ.append(" Artist A")
                    for qv in query_variables:
                        whereQ.append(" A.name like '%" + qv + "%'")
    elif 'ROOT -> S' in str_rules:
        # Yes/No questions
        print("Yes/No Questions")
        selectQ = "select count(*)"
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'NNP ->' in str(i):
                print("NNP")
                x1 = str(i).split('NNP ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in person_list:
                    fromQ.append(" Artist A")
                    whereQ.append(" A.name like '%" + x1 + "%'")
            if 'NN ->' in str(i):
                print("NN")
                x1 = str(i).split('NN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "album":
                    fromQ.append(" Album AL")
                    whereQ.append(" AL.name like '%" + album_name + "%'")
                if x1 == "track":
                    fromQ.append(" INNER JOIN Track T ON AL.albumID = T.albumID")
                    whereQ.append(" T.name like '%" + track_name + "%'")
            if 'VB ->' in str(i):
                print("VB")
                x1 = str(i).split('VB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "sing":
                    fromQ.append(" INNER JOIN Album AL ON A.id = AL.artsitID")
                    fromQ.append(" INNER JOIN Track T ON AL.albumID = T.albumID")
                    whereQ.append(" T.name like '%" + track_name + "%'")
            if 'VBN ->' in str(i):
                print("VBN")
                x1 = str(i).split('VBN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "born":
                    for qv in query_variables:
                        if qv not in person_list and qv in country_list or qv in state_list:
                            # Country/State in Artist table
                            whereQ.append(" A.placeOfBith like '%" + qv + "%'")
    print("Query variables: " + str(query_variables))
    print("Select statement: " + str(selectQ))
    print("From statement: " + str(fromQ))
    print("Where statement: " + str(whereQ))
    print(str(selectQ) + str(fromQ) + str(whereQ))
    print("***************************************************")
    for f in fromQ:
        if 'from' in str(f):
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" not in f and "from" not in f:
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" in f:
            tempfromQ.append(str(f))
        '''elif " from" in tempfromQ[len(tempfromQ)-1] and f != " from":
            tempfromQ.append(str(f))'''
    print("From statement: " + str(tempfromQ))
    for w in whereQ:
        if 'where' in str(w):
            tempwhereQ.append(w)
    for w in whereQ:
        if "like" in tempwhereQ[len(tempwhereQ) - 1]:
            tempwhereQ.append(" and")
            tempwhereQ.append(str(w))
        elif " where" in tempwhereQ[len(tempwhereQ) - 1] and w != " where":
            tempwhereQ.append(str(w))
    print("Where statement: " + str(tempwhereQ))
    print("*************************************************")
    # building query
    from_statement = ""
    where_statement = ""
    final_query = ""
    for each_from in tempfromQ:
        from_statement = from_statement + str(each_from)
    for each_where in tempwhereQ:
        where_statement = where_statement + str(each_where)
    final_query = final_query + str(selectQ) + str(from_statement) + str(where_statement)
    print(final_query)
    return final_query
def extract_subtrees_of_parse(raw_text, parser=None):
    """
    Input: string
    Output: A list of lists of lists of words. The words in a single inner
    list form a phrase.
    """
    # print(raw_text)
    # print('Is it a unicode?')
    sents_text = sent_tokenize(raw_text)
    if_error = False
    if parser is None:
        parser = CoreNLPParser(url='http://localhost:9000')
    parse_trees = []
    full = []
    sent_idx = []
    sents_lst = []
    for i, sent in enumerate(sents_text):
        try:
            const_parse = next(parser.raw_parse(sent))
            sent = list(list(const_parse.subtrees())[0].flatten())
        except:
            print('There is an error...')
            print(sent.encode('ascii'))
            sent = nltk.word_tokenize(sent)
            const_parse = sent[:]
            if_error = True
        parse_trees.append(const_parse)
        full += sent
        sents_lst.append(sent)
        if i == 0:
            sent_idx.append([0, len(sent)])
        else:
            sent_idx.append([sent_idx[-1][-1], sent_idx[-1][-1] + len(sent)])
    adj_lists = []
    all_nodes = []
    for i, const_parse in enumerate(parse_trees):
        if type(const_parse) == list:
            sent = sents_lst[i]
            start_id, end_id = sent_idx[i]
            d = len(sent)
            if d == 1:
                adj_list = {(start_id, start_id + 1): []}
                nodes = [(start_id, start_id + 1)]
            else:
                nodes = [(i + start_id, i + start_id + 1) for i in range(d)]
                adj_list = {(start_id, end_id): nodes[:]}
                for node in nodes:
                    adj_list[node] = []
                nodes.append((start_id, end_id))
        else:
            adj_list, nodes = find_subtrees(const_parse, sents_lst[i], sent_idx[i])
            # here
            print([full[node[0]:node[1]] for node in nodes])
        adj_lists.append(adj_list)
        all_nodes.append(nodes)
    return adj_lists, all_nodes, full, if_error
def movies_category(question):
    parser = CoreNLPParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    quest = question
    l3 = list(parser.raw_parse(quest))
    # pretty print parse tree
    Tree.fromstring(str(l3[0])).pretty_print()
    print("********************")
    selectQ = ""
    fromQ = []
    tempfromQ = []
    tempwhereQ = []
    whereQ = []
    query_variables = []
    person_list = []
    country_list = []
    date_list = []
    nationality_list = []
    title_list = []
    movie_names = []
    movie_name = ""
    fromQ.append(" from")
    whereQ.append(" where")
    ptree = ParentedTree.fromstring(str(l3[0]))
    rules = ptree.productions()
    print(rules)
    for i in rules:
        l1 = "Rule: " + str(i)
        print(l1)
    l4 = list(ner_tagger.tag(quest.split()))
    print(l4)
    for word, category in l4:
        if category == 'PERSON':
            person_list.append(word)
        if category == 'COUNTRY':
            country_list.append(word)
        if category == 'DATE':
            date_list.append(word)
        if category == 'NATIONALITY':
            nationality_list.append(word)
        if category == 'TITLE':
            title_list.append(word)
    print("Person List:")
    print(person_list)
    print("Country List:")
    print(country_list)
    print("Date List:")
    print(date_list)
    print("Nationality List:")
    print(nationality_list)
    print("Title List:")
    print(title_list)
    movnames = QDB.get_movie_names()
    for mn in movnames:
        movie_names.append(" %s" % mn)
    for i in movie_names:
        if i.lower() in quest.lower():
            movie_name = i
            print("Movie Name: " + movie_name)
    movie_name = movie_name.strip()
    print("********************************")
    print(type(rules))
    str_rules = str(rules)
    if 'ROOT -> SBARQ' in str_rules:
        # WH questions
        print("WH Questions")
        selectQ = ""
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'WP ->' in str(i):
                print("WP")
                x1 = str(i).split('WP ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'who':
                    selectQ = "select P.name"
                    fromQ.append(" Person P")
            if 'WDT ->' in str(i):
                print("WDT")
                x1 = str(i).split('WDT ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'which' and 'movie' in quest:
                    selectQ = "select M.name"
                    fromQ.append(" Movie M")
                elif x1.lower() == 'which':
                    selectQ = "select P.name"
                    fromQ.append(" Person P")
            if 'WRB ->' in str(i):
                print("WRB")
                x1 = str(i).split('WRB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1.lower() == 'when':
                    selectQ = "select O.year"
                    fromQ.append(" Oscar O")
                    for qv in query_variables:
                        whereQ.append(" P.name like '%" + qv + "%'")
            if 'VBD ->' in str(i):
                print("VBD")
                x1 = str(i).split('VBD ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == 'directed' and 'movie' not in quest:
                    fromQ.append(" INNER JOIN Director D ON P.id = D.director_id")
                    fromQ.append(" INNER JOIN MOVIE M ON D.movie_id = M.id")
                    '''for qv in query_variables:
                        if qv not in person_list and qv not in country_list and qv not in date_list:'''
                    whereQ.append(" M.name like '%" + movie_name + "%'")
                if x1 == 'directed' and query_variables == []:
                    fromQ.append(" INNER JOIN Director D ON P.id = D.director_id")
                    fromQ.append(" INNER JOIN OSCAR O ON D.movie_id = O.movie_id")
                if x1 == "won" and 'movie' in quest:
                    fromQ.append(" INNER JOIN Oscar O ON M.id = O.movie_id")
                elif x1 == "won":
                    fromQ.append(" INNER JOIN Oscar O ON P.id = O.person_id")
            if 'VB ->' in str(i):
                print("VB")
                x1 = str(i).split('VB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "win":
                    fromQ.append(" INNER JOIN Person P ON O.person_id = P.id")
            if 'NN ->' in str(i):
                print("NN")
                x1 = str(i).split('NN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if "actor" in x1.lower():
                    Otype = "BEST-ACTOR"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "actress" in x1.lower():
                    Otype = "BEST-ACTRESS"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "movie" in x1.lower():
                    Otype = "BEST-PICTURE"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "director" in x1.lower():
                    Otype = "BEST-DIRECTOR"
                    whereQ.append(" O.type like '%" + Otype + "%'")
            if 'CD ->' in str(i):
                print("CD")
                x1 = str(i).split('CD ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in date_list:
                    whereQ.append(" O.year like '%" + x1 + "%'")
            if 'JJ ->' in str(i):
                print("JJ Nationality")
                x1 = str(i).split('JJ ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in nationality_list:
                    '''fromQ.append(" Person P")'''
                    if x1.lower() == "french":
                        nat_word = "France"
                    elif x1.lower() == "american":
                        nat_word = "USA"
                    elif x1.lower() == "italian":
                        nat_word = "Italy"
                    elif x1.lower() == "british":
                        nat_word = "UK"
                    elif x1.lower() == "german":
                        nat_word = "Germany"
                    whereQ.append(" P.pob like '%" + nat_word + "%'")
    elif 'ROOT -> S' in str_rules:
        # Yes/No questions
        print("Yes/No Questions")
        selectQ = "select count(*)"
        for i in rules:
            if 'NNP ->' in str(i):
                print("Query variables")
                qvar = str(i).split('NNP ->', 1)[1]
                qvar = qvar.strip()
                qvar = qvar[1:-1]
                query_variables.append(qvar)
        for i in rules:
            if 'NNP ->' in str(i):
                print("NNP")
                x1 = str(i).split('NNP ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in person_list:
                    fromQ.append(" Person P")
                    whereQ.append(" P.name like '%" + x1 + "%'")
                elif x1 in country_list:
                    whereQ.append(" P.pob like '%" + x1 + "%'")
                elif date_list != []:
                    fromQ.append(" Movie M")
                    fromQ.append(" INNER JOIN Oscar O ON M.id = O.movie_id")
                    whereQ.append(" M.name like '%" + movie_name + "%'")
            if 'NN ->' in str(i):
                print("NN")
                x1 = str(i).split('NN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if "actor" in x1.lower() and "Is" not in quest:
                    Otype = "BEST-ACTOR"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "actress" in x1.lower() and "Is" not in quest:
                    Otype = "BEST-ACTRESS"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "movie" in x1.lower() and "Is" not in quest:
                    Otype = "BEST-PICTURE"
                    whereQ.append(" O.type like '%" + Otype + "%'")
                elif "director" in x1.lower() and "Is" not in quest:
                    Otype = "BEST-DIRECTOR"
                    whereQ.append(" O.type like '%" + Otype + "%'")
            if 'CD ->' in str(i):
                print("CD")
                x1 = str(i).split('CD ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in date_list:
                    whereQ.append(" O.year like '%" + x1 + "%'")
            if 'IN ->' in str(i):
                print("IN")
                x1 = str(i).split('IN ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "by":
                    fromQ.append(" INNER JOIN Director D ON P.id = D.director_id")
                    fromQ.append(" INNER JOIN Movie M ON D.movie_id = M.id")
                    '''for qv in query_variables:
                        if qv not in person_list and qv not in country_list and qv not in date_list:'''
                    whereQ.append(" M.name like '%" + movie_name + "%'")
                if x1 == "with":
                    fromQ.append(" INNER JOIN Director D ON P.id = D.director_id")
                    fromQ.append(" INNER JOIN Oscar O ON D.movie_id = O.movie_id")
            if 'VB ->' in str(i):
                print("VB")
                x1 = str(i).split('VB ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "star":
                    fromQ.append(" INNER JOIN Actor A ON P.id = A.actor_id")
                    fromQ.append(" INNER JOIN Movie M ON A.movie_id = M.id")
                    '''for qv in query_variables:
                        if qv not in person_list and qv not in country_list and qv not in date_list:'''
                    whereQ.append(" M.name like '%" + movie_name + "%'")
                if x1 == "win":
                    fromQ.append(" INNER JOIN Oscar O ON P.id = O.person_id")
                if x1 == "direct":
                    fromQ.append(" INNER JOIN Director D ON P.id = D.director_id")
                    fromQ.append(" INNER JOIN Movie M ON D.movie_id = M.id")
                    '''for qv in query_variables:
                        if qv not in person_list and qv not in country_list and qv not in date_list:'''
                    whereQ.append(" M.name like '%" + movie_name + "%'")
            if 'VBZ ->' in str(i):
                print("VBZ")
                x1 = str(i).split('VBZ ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 == "Is" and 'director' in quest:
                    fromQ.append(" INNER JOIN Director D on P.id = D.director_id")
                elif x1 == "Is" and 'actor' in quest:
                    fromQ.append(" INNER JOIN Actor A on P.id = A.actor_id")
            if 'JJ ->' in str(i):
                print("JJ Nationality")
                x1 = str(i).split('JJ ->', 1)[1]
                x1 = x1.strip()
                x1 = x1[1:-1]
                if x1 in nationality_list:
                    fromQ.append(" Person P")
                    if x1.lower() == "french":
                        nat_word = "France"
                    elif x1.lower() == "american":
                        nat_word = "USA"
                    elif x1.lower() == "italian":
                        nat_word = "Italy"
                    elif x1.lower() == "british":
                        nat_word = "UK"
                    elif x1.lower() == "german":
                        nat_word = "Germany"
                    whereQ.append(" P.pob like '%" + nat_word + "%'")
    print("Query variables: " + str(query_variables))
    print("Select statement: " + str(selectQ))
    print("From statement: " + str(fromQ))
    print("Where statement: " + str(whereQ))
    print(str(selectQ) + str(fromQ) + str(whereQ))
    print("***************************************************")
    for f in fromQ:
        if 'from' in str(f):
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" not in f and "from" not in f:
            tempfromQ.append(str(f))
    for f in fromQ:
        if "INNER JOIN" in f:
            tempfromQ.append(str(f))
        '''elif " from" in tempfromQ[len(tempfromQ)-1] and f != " from":
            tempfromQ.append(str(f))'''
    print("From statement: " + str(tempfromQ))
    for w in whereQ:
        if 'where' in str(w):
            tempwhereQ.append(w)
    for w in whereQ:
        if "like" in tempwhereQ[len(tempwhereQ) - 1]:
            tempwhereQ.append(" and")
            tempwhereQ.append(str(w))
        elif " where" in tempwhereQ[len(tempwhereQ) - 1] and w != " where":
            tempwhereQ.append(str(w))
    print("Where statement: " + str(tempwhereQ))
    print("*************************************************")
    # building query
    from_statement = ""
    where_statement = ""
    final_query = ""
    for each_from in tempfromQ:
        from_statement = from_statement + str(each_from)
    for each_where in tempwhereQ:
        where_statement = where_statement + str(each_where)
    final_query = final_query + str(selectQ) + str(from_statement) + str(where_statement)
    print(final_query)
    return final_query