Example No. 1
def convert_eng_to_isl(input_string):
    # get all required packages
    download_required_packages()

    if len(input_string.split(' ')) == 1:
        return input_string.split(' ')

    # Initializing stanford parser
    parser = StanfordParser()

    # Generate all possible parse trees for the sentence, sorted by probability
    possible_parse_tree_list = list(parser.parse(input_string.split()))

    # Get most probable parse tree
    parse_tree = possible_parse_tree_list[0]
    print(parse_tree)
    # output = '(ROOT
    #               (S
    #                   (PP (IN As) (NP (DT an) (NN accountant)))
    #                   (NP (PRP I))
    #                   (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))
    #                )
    #             )'

    # Convert into tree data structure
    parent_tree = ParentedTree.convert(parse_tree)

    modified_parse_tree = modify_tree_structure(parent_tree)

    parsed_sent = modified_parse_tree.leaves()
    return parsed_sent
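A minimal driver for convert_eng_to_isl() might look like the sketch below. It assumes the snippet's helpers (download_required_packages, modify_tree_structure) and its imports (StanfordParser, ParentedTree) come from the surrounding project, and that the Stanford parser jars are reachable via CLASSPATH.

if __name__ == '__main__':
    # hypothetical example sentence; the function returns the reordered ISL word list
    isl_words = convert_eng_to_isl("As an accountant I want to make a payment")
    print(' '.join(isl_words))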
Example No. 2
    def POS_data(self):
        """POS sentences"""
        tag = 'pos'
        idx = 19
        file_name = 'data/normalize_{}_piece/nor_{}_{}.csv'.format(tag, tag, idx)
        with open(file_name, 'r') as file:
            sentences = file.read().strip().split('\n')

        stop_words = stopwords.words('english')
        eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
        eng_parser.java_options = '-mx3000m'

        print('=' * 100)
        print('current tag: {}, file idx: {}'.format(tag, idx))

        '''POS'''
        print('=' * 100)
        print('Starting POS...')
        pos_sent = []
        for sent in tqdm(sentences):
            pos_sent.append(list(eng_parser.parse(sent.split()))[0])

        '''save file'''
        save_file = 'data/{}_sent/{}_sent_{}.csv'.format(tag, tag, idx)
        with open(save_file, mode='w') as file:
            for sent, pos in zip(sentences, pos_sent):
                file.write(sent + '\t')
                file.write(str(pos) + '\t')
        print('Finish! Saved in {}'.format(save_file))
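To load the saved pairs back later, something along these lines should work; load_pos_sentences is a hypothetical helper, and it assumes the tab-separated layout written by the loop above (sentence, tree, sentence, tree, ...).

from nltk.tree import Tree

def load_pos_sentences(path='data/pos_sent/pos_sent_19.csv'):  # path mirrors the example above
    """Rebuild (sentence, Tree) pairs from the tab-separated file written by POS_data."""
    with open(path) as f:
        fields = f.read().rstrip('\t').split('\t')
    return [(sent, Tree.fromstring(tree_str))
            for sent, tree_str in zip(fields[0::2], fields[1::2])]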
Example No. 3
    def clean_apriori_data(self, sentences):
        """
        filter apriori data
        methods:
        - clean stop words
        - stemming
        - fuzzy matching within sentence
        """
        stop_words = stopwords.words('english')
        eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

        if config.apriori_test_size < 6:
            for sent in sentences:
                print(sent)
        '''POS'''
        pos_sent = []
        for sent in sentences:
            pos_sent.append(list(eng_parser.parse(sent.split()))[0])

        '''filter noun phrase & NLTK stemming'''
        cleaned_sent = []
        for sent in pos_sent:
            wnl = WordNetLemmatizer()
            tmp_sent = []
            for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
                '''clean stop words & stemming'''
                tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
                '''length <= 3 & filter repeated list'''
                if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                    tmp_sent.append(tmp)
            cleaned_sent.append(tmp_sent)

        return cleaned_sent
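The NP-filtering loop above only needs an nltk Tree, so it can be exercised without the Stanford jars. A small self-contained sketch on a hand-written parse (the tree string is purely illustrative; the NLTK stopwords and WordNet data must be downloaded):

from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

tree = Tree.fromstring(
    "(ROOT (S (NP (DT the) (JJ quick) (JJ brown) (NN fox))"
    " (VP (VBZ jumps) (PP (IN over) (NP (DT the) (JJ lazy) (NN dog))))))")

stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
phrases = []
for s in tree.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
    tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
    if 0 < len(tmp) <= 3 and tmp not in phrases:
        phrases.append(tmp)

print(phrases)  # [['quick', 'brown', 'fox'], ['lazy', 'dog']]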
Example No. 4
    def __init__(self, sentence):
        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.status = 0
        self.trans = googletrans.Translator()

        self.sentence = sentence.strip("\n").replace(" ", "")

        en_trans = self.trans.translate(sentence).text
        en_trans = sg.tokenize(en_trans)
        try:
            tree = list(en_parser.parse(en_trans))
            self.tree = tree[0]
            # print(self.tree)
            self.rel = []
        except:
            self.status = 1
Example No. 5
class NLPParser(object):
    def __init__(self):
        self.eng_parser = StanfordParser(
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

    def tag_pos(self, text):
        res = list(self.eng_parser.parse(text.split(' ')))[0].pos()
        return [t[0] for t in res], [t[1] for t in res]
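A usage sketch for NLPParser, assuming the Stanford parser jars are discoverable (for example via the STANFORD_PARSER and STANFORD_MODELS environment variables):

nlp = NLPParser()
words, tags = nlp.tag_pos("the quick brown fox jumps over the lazy dog")
# tag_pos returns the token list and the matching POS tag list
print(list(zip(words, tags)))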
Example No. 6
def parser(sentence):
    chi_parser = StanfordParser(
        path_to_jar=path_dit.get('path_to_jar'),
        path_to_models_jar=path_dit.get('path_to_models_jar'),
        model_path=path_dit.get('model_path'))
    re = chi_parser.parse(sentence.split())

    return re
Example No. 7
def parser(tokens):
    from nltk.parse.stanford import StanfordParser

    chi_parser = StanfordParser(
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
        r"E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp"
        r"\models\lexparser\chinesePCFG.ser.gz")
    print(list(chi_parser.parse(tokens)))
Example No. 8
class RelationExtractor:
    '''
    relation extraction
    '''
    def __init__(
            self,
            model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
    ):
        '''
        initialization
        '''
        self.eng_parser = StanfordParser(model_path=model_path)
        # self.eng_parser_dependency = StanfordDependencyParser()
        self.eng_parser_dependency = StanfordNeuralDependencyParser()

    def PCFG_parse(self, sentence, draw_graph=True):
        res = list(self.eng_parser.parse(sentence.split()))
        if draw_graph: res[0].draw()
        return res

    def dependency_parse(self, sentence, draw_graph=True):
        res = list(self.eng_parser_dependency.parse(sentence.split()))
        if draw_graph: res[0].tree().draw()
        return res

    def generate_relation(self,
                          sentence,
                          nes,
                          draw_graph=False,
                          dep_path_max=10**2):
        pairs = [(nes[i], nes[j]) for i in range(len(nes) - 1)
                 for j in range(i + 1, len(nes)) if nes[i][1] != nes[j][1]]
        if len(sentence.split()) > 60 or len(pairs) < 3: return

        def get_relation(n1, n2):
            get_range = lambda n: range(n[2] + 1, n[3] + 2)
            e1ind, e2ind = get_range(n1), get_range(n2)
            dep_path = nx.shortest_path(G, source=e1ind[-1], target=e2ind[-1])
            vbs = filter(lambda n: G.node[n]['tag'].startswith('VB'), dep_path)
            if len(dep_path) <= dep_path_max and vbs:
                ws = sentence.split()
                r = G.node[vbs[-1]]['word']
                e1, e2 = ' '.join(ws[i - 1]
                                  for i in e1ind), ' '.join(ws[i - 1]
                                                            for i in e2ind)
                print '{0}\n{1} | {2} | {3} | {4}'.format(
                    sentence, e1, e2, r, len(dep_path))
                return e1, e2, r, len(dep_path)
            else:
                return None, None, None, None

        rels = []
        res = self.dependency_parse(sentence, draw_graph=False)
        G = nx.Graph()
        nodes = {}
        edges = []
        return res[0].nodes.items()
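A brief usage sketch for RelationExtractor above, again assuming the Stanford jars are on the CLASSPATH; draw_graph is switched off so no Tk window opens.

extractor = RelationExtractor()

# constituency parse: a list of nltk Tree objects
trees = extractor.PCFG_parse("Rami Eid is studying at Stony Brook University", draw_graph=False)
print(trees[0])

# dependency parse: a list of DependencyGraph objects
graphs = extractor.dependency_parse("Rami Eid is studying at Stony Brook University", draw_graph=False)
for triple in graphs[0].triples():
    print(triple)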
Example No. 9
    def __init__(self, sentence):

        en_parser = StanfordParser(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
                                   path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
                                   )
        sg = StanfordTokenizer(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel=[]
Example No. 10
    def parser_tree(self, sent):
        seg_sent = self.segment(sent)
        chi_parser = StanfordParser(
            model_path=r"edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz")
        sentences = list(chi_parser.parse(seg_sent.split()))
        #result = chi_parser.raw_parse(seg_sent)
        #tree = chi_parser.parse(seg_sent.split())

        #print(sentences)

        #GUI
        #for sentence in sentences:
        #   sentence.draw()

        return sentences[0]
Example No. 11
def parser():
	os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
	os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

	eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',java_options="-mx2048m")
	for x in content:
		a = list(eng_parser.parse(x.split()))[0]
		print(a)
		# a.draw()

	eng_dep_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
	for x in content:
		a = list(eng_dep_parser.parse(x.split()))[0]
		for row in a.triples():
			print(row)
Example No. 12
def pos_test():
    stop_words = stopwords.words('english')
    eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    eng_parser.java_options = '-mx3000m'
    sentence = "so now i run out take a few shot then run back in the house and xfer them to the pc"

    res = eng_parser.parse(sentence.lower().split())
    lst_res = list(res)[0]
    with open('data/tree_test.txt', 'w') as file:
        file.write(sentence + '\t')
        file.write(str(lst_res) + '\t')
    # print(lst_res)
    lst_res.pretty_print()

    # lst_res.remove(Tree('NN', ['camera']))
    cleaned_sent = []
    for sent in lst_res:
        wnl = WordNetLemmatizer()
        tmp_sent = []
        for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
            '''clean stop words & stemming'''
            tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
            '''length <= 3 & filter repeated list'''
            if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
                tmp_sent.append(tmp)

        cleaned_sent.append(tmp_sent)

    # get opinion word
    # for w in cleaned_sent[0]:
    #     print(w)
    #     words = sentence.split()
    #     min_dist = len(words)
    #     min_asp = w
    #     for s in lst_res.subtrees(lambda t: t.label() == 'JJ'):
    #         if abs(words.index(s.leaves()[0]) - words.index(w[0])) < min_dist:
    #             min_dist = pos_test
    #             min_asp = s.leaves()[0]
    #
    #     if min_asp == w:
    #         print('not found')
    #     else:
    #         print(min_asp)

    print(cleaned_sent)
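The commented-out block above looks for the adjective closest to each extracted aspect word. A runnable sketch of that idea on a toy parse (the tree string and the nearest_adjective helper are only illustrations):

from nltk.tree import Tree

toy_tree = Tree.fromstring(
    "(ROOT (S (NP (DT the) (NN camera)) (VP (VBZ is) (ADJP (JJ great)))))")
toy_sentence = "the camera is great"
toy_words = toy_sentence.split()

def nearest_adjective(aspect):
    """Return the JJ token closest to `aspect` in the toy sentence, or None."""
    best, best_dist = None, len(toy_words)
    for s in toy_tree.subtrees(lambda t: t.label() == 'JJ'):
        jj = s.leaves()[0]
        dist = abs(toy_words.index(jj) - toy_words.index(aspect))
        if dist < best_dist:
            best, best_dist = jj, dist
    return best

print(nearest_adjective('camera'))  # great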
Example No. 13
def parser_nltk(word_lists, filename):
    os.environ['JAVAHOME'] = JAVA_PATH
    os.environ["STANFORD_PARSER"] = STANFORD_PARSER_PATH
    os.environ["STANFORD_MODELS"] = STANFORD_PARSER_MODELS
    chinese_parser = StanfordParser(model_path=nltk_parse_model_path)
    STANFORD_DIR = chinese_parser._classpath[0].rpartition('/')[0]
    chinese_parser._classpath = tuple(find_jars_within_path(STANFORD_DIR))
    chinese_parser.java_options = '-mx15000m'
    all_parser_sentence = []
    file = shelve.open(filename)
    flag = 0

    for sentence in word_lists:
        if sentence.strip() != "":
            res = list(chinese_parser.parse((sentence.strip()).split()))
            new_str = return_str_tofile(sentence_parse=str(res[0]))
            file[str(flag)] = res
            all_parser_sentence.append(new_str)
            flag += 1
            print("###### NLTK Dependency Parser Have finished " + str(flag) +
                  " sentences ###")
    return all_parser_sentence
Example No. 14
    def __init__(self, sentence):

        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel = []
Example No. 15
#    if tag != "O":
#        print("%-12s"%tag, " ".join(w for w, t in chunk))
#b = eng_parser.parse("Rami Eid is studying at Stony Brook University in NY".split())

eng_parser = StanfordParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar"
)
#print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))

eng_parser = StanfordDependencyParser(
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser.jar",
    r"C:\Users\jingx\Dropbox\MSCF Course\NLP\stanford-parser-full-2017-06-09\stanford-parser-3.8.0-models.jar"
)
res = list(
    eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
#for row in res[0].triples():
#    print(row)

trainfile = r'C:\Users\jingx\Dropbox\MSCF Course\NLP\NLP_Project\data\set1\a6.txt'
with open(trainfile, encoding='utf8') as fin:
    train = fin.readlines()

train = list(map(lambda x: x.strip('\n'), train))
train = list(map(lambda x: x.strip(' '), train))
train = ' '.join(train)

sent_tokenize_list = sent_tokenize(train)

NE = dict()
for i in range(200, 240):  #range(len(sent_tokenize_list)):
Example No. 16
os.environ['JAVAHOME'] = java_path

inputString = ""
for each in range(1, len(sys.argv)):
    inputString += sys.argv[each]
    inputString += " "

# inputString = raw_input("Enter the String to convert to ISL: ")

parser = StanfordParser(
    model_path=
    'D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
)

# o=parser.parse(s.split())

englishtree = [tree for tree in parser.parse(inputString.split())]
parsetree = englishtree[0]

dict = {}

# "***********subtrees**********"

parenttree = ParentedTree.convert(parsetree)
for sub in parenttree.subtrees():
    dict[sub.treeposition()] = 0

#"----------------------------------------------"

isltree = Tree('ROOT', [])
i = 0
for sub in parenttree.subtrees():
Example No. 17
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# # test.show(find_entity_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
#
#
# print(test.rel)
# test.show(test.find_realtionship())

# comparison experiment
chi_parser = StanfordParser(path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
                            path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
                            model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir='../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(path_to_jar=data_dir+"stanford-segmenter-3.9.1.jar",
                              path_to_sihan_corpora_dict=data_dir+"/data", path_to_model=data_dir+"/data/pku.gz",
                              path_to_dict=data_dir+"/data/dict-chris6.ser.gz",
                              java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                              )
result=segmenter.segment(test_str)
result_ls = result.split()
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
Example No. 18
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# from nltk.tokenize import StanfordTokenizer

segmenter = StanfordSegmenter(
    path_to_sihan_corpora_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data",
    path_to_model="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/pku.gz",
    path_to_dict="/Users/cquan/Documents/model/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz")
res = segmenter.segment(u'北海已成为中国对外开放中升起的一颗明星')
print(type(res))
print(res.encode('utf-8'))


from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for tree in res:
    print(tree)
    tree.draw()

ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
ch_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz')
res1 = list(ch_parser.parse(u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'.split()))
for tree in res1:
    print(tree)
    tree.draw()


from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res2 = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
# for row in res2[0].triples():
Example No. 19
stop_words = stopwords.words('english')
eng_parser = StanfordParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
eng_parser.java_options = '-mx3000m'

# print('=' * 100)
# print('current tag: {}, file idx: {}'.format(tag, idx))
'''POS'''
print('=' * 100)
print('Starting POS...')
pos_sent = []
# for sent in tqdm(sentences):
#     pos_sent.append(list(eng_parser.parse(
#         [w for w in sent.split()]))[0])
for sent in tqdm(sentences):
    pos_sent.append(list(eng_parser.parse(sent.split()[2:]))[0])  # ignore the first two words
'''filter noun phrase & NLTK stemming'''
# print('=' * 100)
# print('Starting filter and stemming...')
# cleaned_sent = []
# for sent in tqdm(pos_sent):
#     wnl = WordNetLemmatizer()
#     tmp_sent = []
#     for s in sent.subtrees(lambda t: t.height() <= 4 and t.label() == 'NP'):
#         '''clean stop words & stemming'''
#         tmp = [wnl.lemmatize(w, pos='n') for w in s.leaves() if w not in stop_words]
#         '''length <= 3 & filter repeated list'''
#         if 0 < len(tmp) <= 3 and tmp not in tmp_sent:
#             tmp_sent.append(tmp)
#     cleaned_sent.append(tmp_sent)
'''save file'''
Example No. 20
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.parse.stanford import StanfordParser

segmenter = StanfordSegmenter(
    # jar files required for segmentation
    path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmentation data directory
    path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data",
    # segmentation model trained on the PKU People's Daily corpus from the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz"
    )

segmenter.default_config('zh')
result=segmenter.segment(u'我喜欢学习编程')


chi_parser = StanfordParser(
    r"/home/jiangix/document/stanford-parser/stanford-parser.jar",
    r"/home/jiangix/document/stanford-parser/stanford-parser-3.8.0-models.jar",
    r"/home/jiangix/document/stanford-parser/chinesePCFG.ser.gz")

sentences=chi_parser.parse(result.split())

for sentence in sentences:
    sentence.draw()
Example No. 21
def ch_parser(sent):
    chi_parser = StanfordParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\chinesePCFG.ser.gz")
    print(list(chi_parser.parse(sent.split())))
Example No. 22
def func():
    r = sr.Recognizer()
    isl_gif = [
        'all the best', 'any questions', 'are you angry', 'are you busy',
        'are you hungry', 'are you sick', 'be careful', 'can we meet tomorrow',
        'did you book tickets', 'did you finish homework',
        'do you go to office', 'do you have money',
        'do you want something to drink', 'do you want tea or coffee',
        'do you watch TV', 'dont worry', 'flower is beautiful',
        'good afternoon', 'good evening', 'good morning', 'good night',
        'good question', 'had your lunch', 'happy journey',
        'hello what is your name', 'how many people are there in your family',
        'i am a clerk', 'i am bore doing nothing', 'i am fine', 'i am sorry',
        'i am thinking', 'i am tired', 'i dont understand anything',
        'i go to a theatre', 'i love to shop',
        'i had to say something but i forgot', 'i have headache',
        'i like pink colour', 'i live in nagpur', 'lets go for lunch',
        'my mother is a homemaker', 'my name is john', 'nice to meet you',
        'no smoking please', 'open the door', 'please call an ambulance',
        'please call me later', 'please clean the room',
        'please give me your pen', 'please use dustbin dont throw garbage',
        'please wait for sometime', 'shall I help you',
        'shall we go together tommorow', 'sign language interpreter',
        'sit down', 'stand up', 'take care', 'there was traffic jam',
        'wait I am thinking', 'what are you doing', 'what is the problem',
        'what is todays date', 'what is your age', 'what is your father do',
        'what is your job', 'what is your mobile number', 'what is your name',
        'whats up', 'when is your interview', 'when we will go',
        'where do you stay', 'where is the bathroom',
        'where is the police station', 'you are wrong', 'address', 'agra',
        'ahemdabad', 'all', 'april', 'assam', 'august', 'australia', 'badoda',
        'banana', 'banaras', 'banglore', 'bihar', 'bihar', 'bridge', 'cat',
        'chandigarh', 'chennai', 'christmas', 'church', 'clinic', 'coconut',
        'crocodile', 'dasara', 'deaf', 'december', 'deer', 'delhi', 'dollar',
        'duck', 'febuary', 'friday', 'fruits', 'glass', 'grapes', 'gujrat',
        'hello', 'hindu', 'hyderabad', 'india', 'january', 'jesus', 'job',
        'july', 'july', 'karnataka', 'kerala', 'krishna', 'litre', 'mango',
        'may', 'mile', 'monday', 'mumbai', 'museum', 'muslim', 'nagpur',
        'october', 'orange', 'pakistan', 'pass', 'police station',
        'post office', 'pune', 'punjab', 'rajasthan', 'ram', 'restaurant',
        'saturday', 'september', 'shop', 'sleep', 'southafrica', 'story',
        'sunday', 'tamil nadu', 'temperature', 'temple', 'thursday', 'toilet',
        'tomato', 'town', 'tuesday', 'usa', 'village', 'voice', 'wednesday',
        'weight'
    ]

    arr = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
    ]
    with sr.Microphone() as source:

        r.adjust_for_ambient_noise(source)
        i = 0
        while True:
            print('Say something')
            audio = r.listen(source)

            # recognize speech using Sphinx
            try:
                a = r.recognize_google(audio)
                print("you said " + a.lower())
                inputString = a.lower()
                parser = StanfordParser(
                    model_path=
                    'D:/stanford-parser-full-2018-02-27/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
                )
                o = parser.parse(inputString.split())
                #print(o)
                englishtree = [
                    tree for tree in parser.parse(inputString.split())
                ]
                parsetree = englishtree[0]
                dict = {}
                # "***********subtrees**********"
                parenttree = ParentedTree.convert(parsetree)
                for sub in parenttree.subtrees():
                    dict[sub.treeposition()] = 0
                    #"----------------------------------------------"
                isltree = Tree('ROOT', [])
                i = 0
                for sub in parenttree.subtrees():
                    if (sub.label() == "NP" and dict[sub.treeposition()] == 0
                            and dict[sub.parent().treeposition()] == 0):
                        dict[sub.treeposition()] = 1
                        isltree.insert(i, sub)
                        i = i + 1
                    if (sub.label() == "VP" or sub.label() == "PRP"):
                        for sub2 in sub.subtrees():
                            if ((sub2.label() == "NP" or sub2.label() == 'PRP')
                                    and dict[sub2.treeposition()] == 0 and
                                    dict[sub2.parent().treeposition()] == 0):
                                dict[sub2.treeposition()] = 1
                                isltree.insert(i, sub2)
                                i = i + 1
                for sub in parenttree.subtrees():
                    for sub2 in sub.subtrees():
                        # print sub2
                        # print len(sub2.leaves())
                        # print dict[sub2.treeposition()]
                        if (len(sub2.leaves()) == 1
                                and dict[sub2.treeposition()] == 0
                                and dict[sub2.parent().treeposition()] == 0):
                            dict[sub2.treeposition()] = 1
                            isltree.insert(i, sub2)
                            i = i + 1
                parsed_sent = isltree.leaves()
                words = parsed_sent
                stop_words = set(stopwords.words("english"))
                # print stop_words
                lemmatizer = WordNetLemmatizer()
                ps = PorterStemmer()
                lemmatized_words = []
                for w in parsed_sent:
                    # w = ps.stem(w)
                    lemmatized_words.append(lemmatizer.lemmatize(w))
                islsentence = ""
                print("According to ISL:")
                print(lemmatized_words)
                for w in lemmatized_words:
                    if w not in stop_words:
                        islsentence += w
                        islsentence += " "
                l = islsentence.split(" ")
                #print(l)
                t = set(l)
                l1 = list(t)
                #print(l1)
                #print("")
                while ("" in l1):
                    l1.remove("")
                print(l1)
                str = " "
                str = str.join(l1)
                print("Output:")
                print(str)

                #print(islsentence)

                for c in string.punctuation:
                    a = a.replace(c, "")

                if (str == 'done'):
                    print("oops!Time To say good bye")
                    break

                elif (a.lower() in isl_gif):

                    class ImageLabel(tk.Label):
                        """a label that displays images, and plays them if they are gifs"""
                        def load(self, im):
                            if isinstance(im, str):
                                im = Image.open(im)
                            self.loc = 0
                            self.frames = []

                            try:
                                for i in count(1):
                                    self.frames.append(
                                        ImageTk.PhotoImage(im.copy()))
                                    im.seek(i)
                            except EOFError:
                                pass

                            try:
                                self.delay = im.info['duration']
                            except:
                                self.delay = 100

                            if len(self.frames) == 1:
                                self.config(image=self.frames[0])
                            else:
                                self.next_frame()

                        def unload(self):
                            self.config(image=None)
                            self.frames = None

                        def next_frame(self):
                            if self.frames:
                                self.loc += 1
                                self.loc %= len(self.frames)
                                self.config(image=self.frames[self.loc])
                                self.after(self.delay, self.next_frame)

                    root = tk.Tk()
                    lbl = ImageLabel(root)
                    lbl.pack()
                    lbl.load(r'C:/Users/shree/ISL/ISL_Gifs/{0}.gif'.format(
                        a.lower()))
                    root.mainloop()
                else:

                    for i in range(len(a)):
                        #a[i]=a[i].lower()
                        if (a[i] in arr):

                            ImageAddress = 'letters/' + islsentence[i] + '.jpg'
                            ImageItself = Image.open(ImageAddress)
                            ImageNumpyFormat = np.asarray(ImageItself)
                            plt.imshow(ImageNumpyFormat)
                            plt.draw()
                            plt.pause(1)  # pause how many seconds
                            #plt.close()
                        else:
                            continue

            except:
                print("Could not listen")
            plt.close()
Example No. 23
import os, sys

os.environ[
    'CLASSPATH'] = '/home/Aaditya/assignments/Project/stanford-parser-full-2015-12-09/'

from nltk.parse.stanford import StanfordParser

parser = StanfordParser()

question = sys.argv[1].strip('?').lower()
tree = list(parser.parse(question.split()))[0]

person = ' '.join(
    list(tree.subtrees(
        filter=lambda x: x.label() == 'NP'))[0].leaves()).lower()

l = list(tree.subtrees(filter=lambda x: x.label() == 'VP'))
if len(l) == 0 and 'who' in question:
    what = 'who'
else:
    what = ' '.join(l[0].leaves()).lower()

print(person)
print(what)

from get_results import query

if 'star' in what or 'appear' in what:
    sparql = open('./films.sparql').read()
    sparql = sparql.replace('[[name]]', person)
    query(sparql)
Example No. 24
def en_parser(str):  # TODO
    eng_parser = StanfordParser(
        r"E:\tools\stanfordNLTK\jar\stanford-parser.jar",
        r"E:\tools\stanfordNLTK\jar\stanford-parser-3.9.1-models.jar",
        r"E:\tools\stanfordNLTK\jar\classifiers\englishPCFG.ser.gz")
    print(list(eng_parser.parse(str.split())))
Example No. 25
import os 
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser


os.environ['STANFORD_PARSER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09'
os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar'

os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_40.jdk/Contents/Home'

eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
a = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))[0]
a.draw()

eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
	print(row)
res[0].tree().draw()
Example No. 26
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag
#
# # English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())
# # Chinese POS tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print word.encode('utf-8'), tag


# Chinese and English constituency parsing; only the model file differs
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()


# dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser = StanfordDependencyParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
res = list(eng_parser.parse(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split()))
# st(context=21)
for row in res[0].triples():
    print '(' + row[0][0] + ',' + row[0][1] + ')', row[1], '(' + row[2][0] + ',' + row[2][1] + ')'
Example No. 27
def animation_view2(request):
    if request.method == 'POST':
        text = request.POST.get('sen')
        # lowercase the sentence
        text = text.lower()
        # tokenize the sentence
        words = word_tokenize(text)
        print(words)
        parser = StanfordParser(
            model_path=
            'C:/Users/Shree/Downloads/CS 753/project/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        englishtree = [tree for tree in parser.parse(text.split())]
        parsetree = englishtree[0]
        dict = {}
        parenttree = ParentedTree.convert(parsetree)
        for sub in parenttree.subtrees():
            dict[sub.treeposition()] = 0
# -------------------
        isltree = Tree('ROOT', [])
        i = 0
        for sub in parenttree.subtrees():
            if (sub.label() == "NP" and dict[sub.treeposition()] == 0
                    and dict[sub.parent().treeposition()] == 0):
                dict[sub.treeposition()] = 1
                isltree.insert(i, sub)
                i += 1

            if (sub.label() == "VP" or sub.label() == "PRP"):
                for sub2 in sub.subtrees():
                    if ((sub2.label() == "NP" or sub2.label() == 'PRP')
                            and dict[sub2.treeposition()] == 0
                            and dict[sub2.parent().treeposition()] == 0):
                        dict[sub2.treeposition()] = 1
                        isltree.insert(i, sub2)
                        i = i + 1

# ---------------------
        for sub in parenttree.subtrees():
            for sub2 in sub.subtrees():
                if (len(sub2.leaves()) == 1 and dict[sub2.treeposition()] == 0
                        and dict[sub2.parent().treeposition()] == 0):
                    dict[sub2.treeposition()] = 1
                    isltree.insert(i, sub2)
                    i = i + 1

        parsed_sent = isltree.leaves()
        words = parsed_sent
        stop_words = set(stopwords.words("english"))

        lemmatizer = WordNetLemmatizer()
        ps = PorterStemmer()
        lemmatized_words = []
        # print(parsed_sent)
        for w in parsed_sent:
            lemmatized_words.append(lemmatizer.lemmatize(w))
        islsentence = ""
        # print(lemmatized_words)
        filtered_text = []
        for w in lemmatized_words:
            if w not in stop_words:
                filtered_text.append(w)
                islsentence += w
                islsentence += " "
        # print(islsentence)
        words = filtered_text
        print(words)
        filtered_text = []
        for w in words:
            path = w + ".mp4"
            f = finders.find(path)
            #splitting the word if its animation is not present in database
            if not f:
                for c in w:
                    filtered_text.append(c)
            #otherwise animation of word
            else:
                filtered_text.append(w)
        words = filtered_text
        return render(request, 'animation2.html', {
            'words': words,
            'text': text
        })
    else:
        return render(request, 'animation2.html')
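The fallback at the end of the view (spell a word out letter by letter when no animation clip exists for it) can be tried outside Django with a stand-in for finders.find; expand_missing_words and has_animation below are hypothetical names.

def expand_missing_words(words, has_animation):
    """Keep words that have an animation clip; spell out the rest letter by letter."""
    expanded = []
    for w in words:
        if has_animation(w):
            expanded.append(w)
        else:
            expanded.extend(list(w))
    return expanded

# toy check with a made-up clip list
clips = {'hello', 'name'}
print(expand_missing_words(['hello', 'my', 'name'], lambda w: w in clips))
# ['hello', 'm', 'y', 'name']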
Example No. 28
# Download Stanford Parser:https://nlp.stanford.edu/software/lex-parser.shtml#Download; unzip #
# Or download Stanford Parser: https://cloud.tsinghua.edu.cn/d/095d08f52f504f32b40d/; unzip #
# Required runtime environment for mac also available at https://cloud.tsinghua.edu.cn/d/095d08f52f504f32b40d/ #

from nltk.parse.stanford import StanfordParser
import os

# set environment variables to the path to your Stanford Parser
os.environ['STANFORD_PARSER'] = '/Users/baixiaojing/StanfordNLP/stanford-parser-full-2017-06-09/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/Users/baixiaojing/StanfordNLP/stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

# choose the model for your parser
eng_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

# parse the sentence
parse = eng_parser.parse("Can you book a flight to London?".split())

# form a tree
tree = list(parse)[0]

# draw a tree
tree.draw()
Example No. 29
from nltk.parse.stanford import StanfordParser

eng_parser = StanfordParser()
print(
    list(
        eng_parser.parse(
            "the quick brown fox jumps over the lazy dog".split())))
import nltk
import os
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tree import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

s = raw_input("Enter string")
parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
o = parser.parse(s.split())
tree1 = [tree for tree in parser.parse(s.split())]
parsetree = tree1[0]
dict = {}
#output = '(ROOT (S (PP (IN As) (NP (DT an) (NN accountant))) (NP (PRP I)) (VP (VBP want) (S (VP (TO to) (VP (VB make) (NP (DT a) (NN payment))))))))'
#parsetree=Tree.fromstring(output)
#parsetree=parser.raw_parse(s)
print parsetree

print "***********subtrees**********"

ptree = ParentedTree.convert(parsetree)
for sub in ptree.subtrees():
    #print sub
    dict[sub.treeposition()] = 0
# print sub.label()

print "----------------------------------------------"
Example No. 31
# Chinese POS tagging
chi_tagger=StanfordPOSTagger(model_filename='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/models/chinese-distsim.tagger',
                             path_to_jar='/home/hsiao/Develops/nlp/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。'.split()))



# English constituency parsing
#import os
#java_path='/usr/lib/jvm/jdk/jdk1.8.0_121'
#os.environ['JAVAHOME']=java_path
from nltk.internals import find_jars_within_path
from nltk.parse.stanford import StanfordParser
eng_parser=StanfordParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                          '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                          '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
eng_parser._classpath = tuple(find_jars_within_path('/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/'))
print (list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))


# English dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
eng_parser=StanfordDependencyParser('/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser.jar',
                          '/home/hsiao/Develops/nlp/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar',
                          '/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/englishPCFG.ser.gz')
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
print (res[0])
for row in res[0].triples():
    print(row)

Example No. 32
File: test.py  Project: weiang/baike
def test_chinese_parser():
    sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
    chi_parser = StanfordParser(model_path=u'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
    print list(chi_parser.parse(sent.split()))
Example No. 33
def main():
	# stanford_pos_dir = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-postagger-full-2015-04-20/'
	# eng_model_filename= stanford_pos_dir + 'models/english-bidirectional-distsim.tagger'
	# my_path_to_jar= stanford_pos_dir + 'stanford-postagger.jar'
	# st = StanfordPOSTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar) 
	# print(st.tag('What is the airspeed of an unladen swallow ?'.split()))


	# # NER Tagging:
	stanford_ner = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-ner-2015-04-20/'
	# stanford_ner_model = stanford_ner + 'classifiers/english.all.3class.distsim.crf.ser.gz'
	stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
	stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
	ner = StanfordNERTagger(model_filename=stanford_ner_model, path_to_jar=stanford_ner_jar)
	#print(ner.tag('Rami Eid is studying at Stony Brook University in NY'.split()))

	# Set up the stanford PCFG parser
	stanford_parser_dir = '/Users/yuyanzhang/Desktop/CMU/NLP/project/tools/stanford-parser-full-2015-04-20/'
	eng_model_path = stanford_parser_dir  + "stanford-parser-3.5.2-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
	my_path_to_models_jar = stanford_parser_dir  + "stanford-parser-3.5.2-models.jar"
	my_path_to_jar = stanford_parser_dir  + "stanford-parser.jar"
	parser=StanfordParser(model_path=eng_model_path, path_to_models_jar=my_path_to_models_jar, path_to_jar=my_path_to_jar)
	# sent = "Seth Kramer, one of the directors, describes how he first got the idea for The Linguists when in Vilnius, Lithuania, he could not read Yiddish inscriptions on a path in spite of his Jewish heritage."
	# parser_result =(parser.parse("The random person on the street eats meat.".split()))
	# for a in parser_result:
	# 	getNodes(a)
	# 	print("\u")
	# 	
	

	#	Read in the article and list of questions
	article_path = sys.argv[1]
	question_list = sys.argv[2]

	#	Tokenize all sentences
	sentences_pool = []

	article = open(article_path).read()
	paragraphs = [p for p in article.split('\n') if p]
	for paragraph in paragraphs[1:len(paragraphs)]: #	Skip the title
		# sentences = sent_tokenize(paragraph)
		sentences = sent_tokenize(paragraph.decode('utf-8'))
		for sentence in sentences:
			sentence_tokenized = [a.lower() for a in word_tokenize(sentence)]
			sentences_pool.append(sentence_tokenized)
			
	#	Answer questions in the question list
	count = 0

	#	Read in the lemmatized the sentences
	# sentences_pool_lemmatized = []

	# Uncomment if the lemmatized sentence pool hasn't been generated yet
	# This step takes a long time, so you only need to run lemmatization once and you can load
	# the lemmatized sentences from file to try different things afterwards
	# with open('sentences_pool_lemmatized.csv','w') as f:
	# 	writer = csv.writer(f,delimiter="\t")
	# 	for sent in sentence_pool:
	# 		sent = [a for a in sent if a != '\t']
	# 		sentences_lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(sent)]
	# 		sentences_pool_lemmatized.append(sentences_lemmatized)
	# 		writer.writerow(sentences_lemmatized)
	
	# with open('sentences_pool_lemmatized.csv') as f:
	# 	for line in f:
	# 		line = [a.lower() for a in line.strip().split("\t")]
	# 		sentences_pool_lemmatized.append(line)

	with open(question_list) as f:
		#	For each question on the list
		for question in f:
			count += 1
			question_tokenized = word_tokenize(question)
			question_tokenized_lower = [a.lower() for a in question_tokenized]
			question_start = question_tokenized_lower[0]

			
			# # Control the type of question 
			# if question_start in ['when','where']:
			# 	pass
			# else:
			# 	continue 
			
	#	Separate question words and question content
			#  filtered_list = [a for a in string.punctuation]
			
			filtered_list = ['?','when', 'what','where','what','why','which','who','how','do','does','did','a','the','an']
			question_content = [a for a in question_tokenized_lower if a not in filtered_list]

			#	Lemmatize the question
			#question_lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(question_content)]
			
			
			#	Find the most similar sentences in the pool 
			max_similarity = None
			most_similar_sent = [] #	We need to consider ties

			for sent_idx in range(len(sentences_pool)):
				sent = sentences_pool[sent_idx]
				
				#similarity_score = jaccard_similarity(sent,question_content)+similarity(sent,question_content)
				if question_start == 'why':
					similarity_score = similarity_why(sent,question_content)
				else:
					similarity_score = similarity(sent,question_content, 0.8)
				
				if max_similarity == None:
					max_similarity = similarity_score
					#	Append the origin un-lemmatized sentence
					most_similar_sent.append(sentences_pool[sent_idx])
				elif similarity_score > max_similarity:
					max_similarity = similarity_score
					most_similar_sent.append(sentences_pool[sent_idx])
				else:
					pass
			

			# print((most_similar_sent))
			# print

			#	Now, build the answer from the retrieved sentence
			same_word = set(most_similar_sent[0])
			for s in most_similar_sent[1:]:
				same_word.intersection_update(s)
			
			#	Find the most relevant sentence
			max_similarity_2 = None
			max_similar_sent = None

			for sent in most_similar_sent:
				sent_filtered = [a for a in sent if not a in same_word]
				similarity_score_2 = similarity(sent_filtered,question_content, 1)
				if max_similarity_2 == None:
					max_similarity_2 = similarity_score_2
					max_similar_sent = sent
				elif similarity_score_2 > max_similarity_2:
					max_similarity_2 = similarity_score_2
					max_similar_sent = sent
			# print(max_similar_sent)
			#	Build answer based on different type of question
			answer = "NULL"
			try:
				#	Yes/No question: answer should contain only yes or no.	
				if question_start in ["is","was","are","were","do","does","did","have","has","had", "wasn't","isn't","aren't"]:
					#	First, convert sentence into a declarative sentence
					if max_similarity_2 == 0:
						answer = "No"
					else:
						question_parse = parser.parse(question_tokenized)
						for parse in question_parse:
							# print(parse)
							verb = parse[0][0].leaves()
							sub = (parse[0][1].leaves())
							obj = (parse[0][2].leaves())
							#substring = " ".join((sub+verb+obj))
							# If yes, most of the words in objects should be in the original sentence
							obj = [a.lower() for a in obj]
							if float(len(intersection(obj,max_similar_sent))) / len(obj)  >= 0.8:
								answer = "Yes"
							else:
								answer = "No"
							
						#	TODO: parse candidate sentence
						# answer = "No"
						# similar_sent_parse = parser.parse(max_similar_sent)
						# for parse in similar_sent_parse:
						# 	verb_ = parse[0][0].leaves()
						# 	sub_ = (parse[0][2].leaves())
						# 	obj_ = (parse[0][1].leaves())
						
				elif question_start == 'why':
					max_similar_sent_str = " ".join(max_similar_sent)
					# str.find returns -1 when the marker is absent, which the checks below rely on
					reason_idx = max_similar_sent_str.find('because of')
					answer = max_similar_sent_str[reason_idx + len('because of'):]
					if reason_idx == -1:
						reason_idx = max_similar_sent_str.find('because')
						answer = max_similar_sent_str[reason_idx + len('because'):]
					if reason_idx == -1:
						reason_idx = max_similar_sent_str.find('for')
						answer = max_similar_sent_str[reason_idx + len('for'):]
					if reason_idx == -1:
						answer = "NULL"


				elif question_start == 'when':
					# 1. Tag: 'DATE', 'TIME'
					# 2.1. one PP or one CD in PP, return it
					# 2.2. multi candidate, return max_similar_sent
					found_DATE = False
					max_similar_sent_tag = ner.tag(max_similar_sent)
					# print max_similar_sent_tag 
					# Uncomment for dry run
					for pair in max_similar_sent_tag:
						if pair[1] == 'DATE' or pair[1] == 'TIME':
							answer = pair[0]
							found_DATE = True
					if not found_DATE:
						#TODO: deal with this situation
						max_similar_parse = parser.parse(max_similar_sent)
						for mparse in max_similar_parse:
							#print mparse
							stack = mparse
							answer = max_similar_parse
							record1 = []                            
							record2 = []
							for i in stack:
								searchLabel(i, "PP", record1)
								# print "-------", record1
							if len(record1) == 1:
								answer = record1[0].leaves()     
							else:
								for j in record1:
									searchLabel(j, "CD", record2)
								if len(record2) == 1:
									answer = record2[0].leaves()

				elif question_start == 'who':
					max_similar_sent_tag = ner.tag(max_similar_sent)
					found_PERSON = False
					for pair in max_similar_sent_tag:
						if pair[1] == 'PERSON':
							answer = pair[0]
							found_PERSON = True
					if not found_PERSON:
						#TODO: deal with this situation
						pass

				elif question_start == 'where':
					found_LOCATION = False
					max_similar_sent_tag = ner.tag(max_similar_sent)
					for pair in max_similar_sent_tag:
						if pair[1] == 'LOCATION':
							answer = pair[0]
							found_LOCATION = True
					if not found_LOCATION:
						max_similar_parse = parser.parse(max_similar_sent)
						for mparse in max_similar_parse:
							#print mparse
							stack = mparse
							answer = max_similar_sent
							record1 = []
							record2 = []
							for i in stack:
								searchLabel(i, "PP", record1)
								# print "-------", record1
							if len(record1) == 1:
								if record1[0][0][0] in ("in", "from", "at", "on", "under"):
									answer = record1[0].leaves()     
							else:
								for j in record1:
									searchLabel(j, "CD", record2)
								if len(record2) == 1:
									answer = record2[0].leaves()

				elif question_start == 'how':
					question_second = question_tokenized_lower[1]
					temp = ['old', 'long', 'many', 'much', 'tall', 'heavy']
					max_similar_sent_str = " ".join(max_similar_sent)
					if question_second not in temp:
					  answer = max_similar_sent_str
					else:
					  number = [int(s) for s in max_similar_sent_str.split() if s.isdigit()]
					  tagged = pos_tag(max_similar_sent)
					  token_candidates = []
					  for token, label in tagged:
						splited = token.split('-')
						if len(splited) > 1:
						  for t in splited:
							if t.isdigit():
							  token_candidates.append(t)
						if label == 'CD':
						  token_candidates.append(token)
					  if len(token_candidates) > 1:
						answer = max_similar_sent_str
					  elif len(token_candidates) == 1:
						answer = token_candidates[0] 
					  else:
						answer = "NULL" 
					
				#For what, which,and others
				else:
					#print(count,question)
					try:
						question_parse = parser.parse(question_tokenized)
						for parse in question_parse:
							#print(parse)
							verb = parse[0][1].leaves()
							sub = (parse[0][1][1].leaves())
							#obj = (parse[0][2].leaves())
							#print(verb,sub)

						similar_sent_parse = parser.parse(max_similar_sent)
						for parse in similar_sent_parse:
							# print(parse)
							answer = parse[0][1][1].leaves()				
					except:
						pass
						#TODO: deal with this situation


				#Capitalize first letter

				if not answer:
					answer = "NULL"
				elif question_start == 'how':
					answer = answer.capitalize()
				else:
					answer = " ".join(answer)

					a = list(answer)
					if a:
						a[0] = a[0].upper()
						answer = "".join(a)
				print(' '.join(question_tokenized))
				print(answer)
			except:
				print(" ".join(max_similar_sent))