Example #1
import spacy
import benepar
import gensim
import torch
from spacy.symbols import ORTH
from sentence_transformers import SentenceTransformer

SPECIAL_TOKENS = {
    "pad": "<pad>",
    "oov": "<oov>",
    "sos": "<sos>",
    "eos": "<eos>"
}
SPECIAL_TOKEN2ID = {"<pad>": 0, "<oov>": 1, "<sos>": 2, "<eos>": 3}

# spaCy
NLP = spacy.load("en")
# prevent the tokenizer from splitting the special tokens
for special_token in SPECIAL_TOKENS.values():
    NLP.tokenizer.add_special_case(special_token, [{ORTH: special_token}])

# benepar
PARSER = benepar.Parser("benepar_en2")

# glove
GLOVE = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_BIN_PATH,
                                                        binary=True)

# SBERT
SBERT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

# device
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# env
EXP_PLATFORM = "others"  # set it to be "venus" or any other string. This is just used for run experiments on Venus platform.

print("Finished loading constants ...")
Example #2
            width += sub_width
        else:
            width += 1

    if root.label() in labels:
        assert width == len(root.leaves())
        chunks.append((offset, offset+width))

    return width


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

print('Loading model...')
parser = benepar.Parser("benepar_zh")

line = "刘德华 偶尔 玩 下 由 腾讯 开发 的 王者荣耀 这 款 游戏 。 平时 , 经常 会 跟 小伙伴 通过 微信 聊聊天 。"
line = line.strip().split()
tree = parser.parse(line)
print(str(tree)+'\n========')

chunks = []
labels = ['NP',]
line_len = extract_spans_recur(tree, 0, chunks, labels)
assert len(line) == line_len
for st, ed in chunks:
    print('\t'+' '.join(line[st:ed]))

#tree_str = ' '.join(str(tree).split())
#print(tree_str)
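
Only the tail of extract_spans_recur appears at the top of this snippet; below is a sketch of a complete version consistent with that tail (a hedged reconstruction, not the original source).

import nltk

def extract_spans_recur(root, offset, chunks, labels):
    # recursively count the leaves under `root` and record the (start, end)
    # token span of every constituent whose label is in `labels`
    width = 0
    for child in root:
        if isinstance(child, nltk.Tree):
            sub_width = extract_spans_recur(child, offset + width, chunks, labels)
            width += sub_width
        else:
            width += 1

    if root.label() in labels:
        assert width == len(root.leaves())
        chunks.append((offset, offset + width))

    return width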
Example #3
import json

import numpy as np
import spacy
import benepar
from nltk.tokenize import sent_tokenize


def find_sentences_of_interest(train):
    """
    train: the file name from sys.argv[1]
    

    Return: a list of sentences of interest, where each element has the structure:
    (sentence Tree (an nltk.tree.Tree), dict{NER text: tag}, heuristic score)

    For example, the first element of the list returned for set1/a1.txt is
        (Tree('S', [Tree('NP', [Tree('DT', ['the']), Tree('NNP', ['old']), 
        Tree('NNP', ['kingdom'])]), Tree('VP', [Tree('VBZ', ['is']), 
        Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['period'])]), 
        Tree('PP', [Tree('PP', [Tree('IN', ['in']), Tree('NP', [Tree('DT', ['the']),
         Tree('JJ', ['third']), Tree('NN', ['millennium'])])]), 
         Tree('PRN', [Tree('-LRB-', ['-LRB-']), Tree('NP', [Tree('.', ['c.']),
          Tree('CD', ['2686-2181']), Tree('NNP', ['bc'])]), Tree('-RRB-', ['-RRB-'])])]), 
          Tree('VP', [Tree('ADVP', [Tree('RB', ['also'])]), Tree('VBN', ['known']), 
          Tree('PP', [Tree('IN', ['as']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), 
          Tree('NN', ["'age"])]), Tree('PP', [Tree('PP', [Tree('IN', ['of']), 
          Tree('NP', [Tree('DT', ['the']), Tree('NNS', ['pyramids'])])]),
           Tree("''", ["'"])])]), Tree('CC', ['or']), Tree('NP', [Tree('NN', ["'age"])]), 
           Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['the']), 
           Tree('NN', ['pyramid']), Tree('NNS', ['builders'])])]), Tree("''", ["'"])])]),
            Tree('SBAR', [Tree('IN', ['as']), Tree('S', [Tree('NP', [Tree('PRP', ['it'])]), 
            Tree('VP', [Tree('VBZ', ['includes']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), 
            Tree('JJ', ['great']), Tree('JJ', ['4th']), Tree('NNP', ['dynasty'])]), 
            Tree('SBAR', [Tree('WHADVP', [Tree('WRB', ['when'])]), 
            Tree('S', [Tree('S', [Tree('NP', [Tree('NNP', ['king']), Tree('NNP', ['sneferu'])]), 
            Tree('VP', [Tree('VBD', ['perfected']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), 
            Tree('NN', ['art'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NN', ['pyramid']), 
            Tree('NN', ['building'])])])])])]), Tree('CC', ['and']), Tree('S', [Tree('NP', [Tree('NP', [Tree('DT', ['the']),
             Tree('NNS', ['pyramids'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NNP', ['giza'])])])]), 
             Tree('VP', [Tree('VBD', ['were']), Tree('VP', [Tree('VBN', ['constructed']), Tree('PP', [Tree('IN', ['under']), 
             Tree('NP', [Tree('DT', ['the']), Tree('NNS', ['kings']), Tree('NNP', ['khufu']), Tree(',', [',']), Tree('NNP', ['khafre']),
              Tree(',', [',']), Tree('CC', ['and']), Tree('NNP', ['menkaure'])])])])])])])])])])])])])])]), Tree('.', ['.'])]), 
              {'4th': 'ORDINAL', '2686-2181': 'DATE'}, 2)
              

        str = " ".join(sent.leaves()) 
        should give you the whole sentence, where "sent" is the tree structure shown above
    """
    #----------------------------------------read docs----------------------------------------------------------------
    # d is a dict mapping the title to its list of sentences
    d = dict()
    s = ""
    with open(train, encoding='utf-8', mode='r') as _f:
        for i, line in enumerate(_f):
            if i == 0:
                title = line.strip().lower()
            # stop reading at the "see also" / "references" sections
            elif line.strip().lower() in set(["see also", 'references']):
                break
            else:
                s += line.strip().lower()
    d[title] = sent_tokenize(s)

    #----------------------------------------parse tree----------------------------------------------------------------
    # parse trees: select sentences with an NP-VP structure
    candidate = []
    # parser = stanford.StanfordParser(model_path="C:/Users/geyiyang/OneDrive/CMU/2019 Spring/NLP/team project/QAProject/englishPCFG.ser.gz",encoding='utf8')
    parser = benepar.Parser("benepar_en2")
    # sentences = parser.raw_parse_sents(('Hello,My name is completely Melro.','Are you ok?'))
    # pdb.set_trace()
    for v in d.values():
        # sentences = parser.raw_parse_sents(v)
        sentences = parser.parse_sents(v)
        for sentence in sentences:
            if sentence.label() == "S":  # start
                for i in range(len(sentence) - 1):
                    if sentence[i].label() == "NP" and sentence[
                            i + 1].label() == "VP":
                        candidate.append(
                            sentence
                        )  #save this NP-VP sentence as a tree structure
                        break

    #----------------------------------------TF-IDF----------------------------------------------------------------
    Nones = set(["NN", "NNS", "NNP", "NNPS"])  # noun POS tags
    # extract nouns
    # freq_dict = []

    # t = []
    # for sent in candidate: #sent is a tree
    #     for word, tag in sent.pos(): # POS
    #         if tag in Nones:
    #             t.append(word)
    # none_len = len(t)
    # freq_dict = Counter(t)

    # dev_data = ['set1','set2','set3','set4','set5']
    # return a tf_idf dict, word:score
    # pdb.set_trace()
    # tf_idf = computeTFIDF(none_len, freq_dict, dev_data)

    # js = json.dumps(tf_idf)
    # with open("tfidf.json",'w') as f:
    # f.write(js)
    with open("repo/QAProject/tfidf.json") as f:
        tf_idf = json.load(f)
    scores = []  # a list of TF-IDF scores, one per candidate sentence
    for sent in candidate:  #sent is a tree
        score = 0
        for word, tag in sent.pos():  # POS
            if tag in Nones:
                score += tf_idf[word]
        scores.append(score)
    #----------------------------------------NER tag----------------------------------------------------------------
    # NER = set(["PERSON","NORP","FAC","ORG","GPE","LOC","PRODUCT","EVENT",\
    #     "WORK_OF_ART","LAW","LANGUAGE","DATE","TIME","PERCENT","MONEY","QUANTITY","ORDINAL","CARDINAL"])
    #compute NER for all candidate sentences
    NER = {"PERSON", "ORG", "DATE", "GPE"}

    # pdb.set_trace()
    # heuristic weight for tf-idf and NER
    alpha, beta = 1, 1

    candidate2 = []
    selected = []
    nlp = spacy.load("en_core_web_sm")
    for i, sent in enumerate(candidate):
        str = " ".join(sent.leaves())
        x = nlp(str)
        # pprint([(X.text, X.label_) for X in x.ents])
        for X in x.ents:
            label = X.label_
            if label in NER:  # contains NER tag that we want
                # each sentence is stored as a triplet (sentence tree, NER tag dict, score)
                candidate2.append([
                    sent,
                    dict([(X.text, X.label_) for X in x.ents]),
                    len(x.ents)
                ])
                selected.append(i)
                break
    ner_scores = np.array([c[2] for c in candidate2])
    ner_scores = np.exp(ner_scores) / sum(np.exp(ner_scores))  # softmax over NER counts
    tfidf_scores = [scores[i] for i in selected]
    tfidf_scores = np.exp(tfidf_scores) / sum(np.exp(tfidf_scores))  # softmax over TF-IDF scores
    for i in range(len(candidate2)):
        candidate2[i][2] = alpha * ner_scores[i] + beta * tfidf_scores[i]
    candidate2.sort(key=lambda x: x[2])

    return candidate2


def extract_vp(parent):
    if parent.label() == "VP":
        return " ".join(parent.leaves())
    for node in parent:
        if type(node) is nltk.Tree:
            if node.label() == "VP":
                return " ".join(node.leaves())
    return "-"


parser = benepar.Parser("benepar_en2")

args = Arguments()
model = load_model(args)

engine = db_connect()
create_table(engine)
Session = sessionmaker(bind=engine)
session = Session()

methods = session.query(Method).all()

for method in methods:
    description = method.description
    sentences = nltk.sent_tokenize(description)
    if len(sentences) > 0:
def main():
  input_conll_file = sys.argv[1]
  benepar.download('benepar_en2')
  parser = benepar.Parser("benepar_en2")
  add_predconst(input_conll_file, parser)
Example #6
import benepar
import os
fr_parser = benepar.Parser("benepar_fr")
en_parser = benepar.Parser("benepar_en2")


def getFredaoutput(tree):
    #print(tree.pretty_print())
    if type(tree) == str:
        return " " + tree + " "
    elif len(tree) == 1:
        return getFredaoutput(tree[0])

    res = ""
    for t in tree:
        res += getFredaoutput(t)

    res = "(" + res + ")"
    res = res.replace("(", " ( ")
    res = res.replace(")", " ) ")
    res = ' '.join(res.split())
    return res


def tellDiff(s1, s2):
    if len(s1) != len(s2):
        print("len!!!====")
        print(len(s1))
        print(len(s2))
        return False
    for i in range(len(s1)):
Example #7
import sys

import benepar

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if sys.version_info.major == 3:
    raw_input = input

model = sys.argv[1] # maybe "benepar_en"

parser = benepar.Parser(model)

def parse(tokens, tags):
    sentence = list(zip(tokens, tags))
    parse_raw, tags_raw, sentence = next(parser._batched_parsed_raw([(tokens, sentence)]))
    tree = parser._make_nltk_tree(sentence, tags_raw, *parse_raw)
    return tree

while True:
    tokens = raw_input()
    tags = raw_input()

    tokens = tokens.split(' ')
    tags = tags.split(' ')

    tree = parse(tokens, tags)
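Example #8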
import random

import nltk
import numpy as np
import spacy
import benepar
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer


def get_true_false_questions(text, num_questions):

	"""

		Get true/false questions for the specified text
		Args:
			• text: text for which to create questions
			• num_questions: number of questions to create

		Output:
			• question_answers_list: list of questions, where
			each entry is the question + answers for that question

	"""

	# load GPT2 (for generating false sequences) and BERT (for finding sentence similarity of our real sentence against 
	# our fake sentence)
	tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
	model = GPT2LMHeadModel.from_pretrained("gpt2") # we'll use GPT2 to generate sentences
	# load BERT model
	model_BERT = SentenceTransformer('bert-base-nli-mean-tokens') # we'll use BERT to filter sentences based on similarity

	# load necessary NLP tools + parser
	nltk.download("punkt")
	nlp = spacy.load("en")
	benepar.download("benepar_en2")
	benepar_parser = benepar.Parser("benepar_en2")

	# clean + split text
	text = clean_text(text)
	cleaned_text = get_sentences(text)
	cleaned_text = [clean_text(x) for x in cleaned_text]

	# use parser to split sentences, remove last verb phrase or last noun phrase
	sentence_completion_dict = get_sentence_completions(cleaned_text)

	# get false sentences
	probability_true = 0.5 # probability that we'll add a True statement, rather than the False statement
	num_fake_sentences = 3 # number of (maximum) fake sentences that we'd like to create for each real partial sentence
	answer_choices = " (a) True  (b) False" # define our answer choices
	question_answers_list = [] # list to hold our questions and answers

	for key_sentence in sentence_completion_dict:

		# get our partial sentence
		partial_sentences_list = sentence_completion_dict[key_sentence]

		# start creating false sentences
		false_sentences = []

		print(f"The number of false sentences that we have for the keyword of ({key_sentence}) is: {len(partial_sentences_list)}")
    
		# loop through the list of partial sentences
		for sentence in partial_sentences_list:

			# create our false sentences
			false_sents = generate_sentences(sentence, key_sentence, num_fake_sentences)
			false_sentences.extend(false_sents)

		print(f"After the for loop through the partial sentences, we have {len(false_sentences)} false sentences")

		for idx, false_sent in enumerate(false_sentences):
        	
			# for each fake option, we now need to decide if we'll use a fake question or a real question

			# return the actual question
			if np.random.uniform() <= probability_true:
				question = f" (ANSWER: True) {key_sentence} : " + answer_choices + "\n" # e.g., "(Answer: True) : 2 + 2 = 4"
			# return the false sentence
			else:
				question = f" (ANSWER: False) {false_sent} : " + answer_choices + "\n" # e.g., "(Answer: False) : 2 + 2 = 5"

			# add question to question list
			question_answers_list.append(question)

			print(f"We have {len(question_answers_list)} questions in our list")

			if len(question_answers_list) >= num_questions:
				break

	# shuffle our questions
	random.shuffle(question_answers_list)

	# get the first "num_questions" values
	return question_answers_list[:num_questions]
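
The helper get_sentence_completions is not shown in this snippet; per the comment above, it splits each sentence with the benepar parser and removes the last verb phrase or noun phrase. Below is a hedged sketch of such a helper; the name, signature, and details are assumptions rather than the original code.

def get_sentence_completions_sketch(sentences, parser, drop_labels=("VP", "NP")):
    # map each sentence to partial sentences obtained by dropping a trailing VP or NP
    completions = {}
    for sent in sentences:
        tokens = sent.split()
        tree = parser.parse(tokens)
        partials = []
        for subtree in tree.subtrees():
            if subtree.label() not in drop_labels:
                continue
            span = subtree.leaves()
            # only drop constituents that sit at the very end of the sentence
            if span and len(span) < len(tokens) and tokens[-len(span):] == span:
                partials.append(" ".join(tokens[:-len(span)]))
        if partials:
            completions[sent] = partials
    return completions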
Example #9
import benepar


def benepar_setup():
    berkeley_parser = benepar.Parser("benepar_en2")
    return berkeley_parser
Example #10
	with open("{}.parsed.{}.{}".format(section, dep_type_ind, model_ind), "w") as f:
		for arc in dataset.arcs:
			f.write(' '.join(map(str, arc)))
			f.write('\n')
		print("finished {} {}".format(parser_ind, "heads"))
	with open("{}.parsed.{}.{}.labels".format(section, dep_type_ind, model_ind), "w") as f:
		for rel in dataset.rels:
			f.write(' '.join(map(str, rel)))
			f.write('\n')
		print("finished {} {}".format(parser_ind, "labels"))

if parser_ind == 'benepar':
	sents = [line.split(' ') for line in lines]

	import benepar, nltk
	parser = benepar.Parser("benepar_en3")
	# nlp = spacy.load('en_core_web_md')
	# if spacy.__version__.startswith('2'):
	# 	nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
	# else:
	# 	nlp.add_pipe("benepar", config={"model": "benepar_en3"})
	sents = [benepar.InputSentence(words=sent) for sent in sents]
	print(sents[0])
	dts = parser.parse_sents(sents)
	results = [' '.join(str(item).split()) for item in dts]
	print(len(results))
	# input = "{}.cdeps.{}".format(section, parser_ind)

	with open("{}.cdeps.{}".format(section, parser_ind), "w") as f:
		for item in results:
Example #11
    def __init__(self, model="benepar_en2"):
        self.parser = benepar.Parser(model)
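
The constructor above is extracted from a larger wrapper class that is not shown. A minimal self-contained sketch of such a wrapper follows; the class name and the parse() helper are assumptions, only __init__ comes from the snippet.

import benepar


class ConstituencyParser:
    def __init__(self, model="benepar_en2"):
        self.parser = benepar.Parser(model)

    def parse(self, tokens):
        # tokens: a pre-tokenized sentence given as a list of strings
        return self.parser.parse(tokens)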
Example #12
import json
import os

import spacy
import benepar
from tqdm import tqdm


def get_cloze_data(input_data, clause_extract=False, proc=None):
    if clause_extract:
        parser = benepar.Parser("benepar_en2")

    ner = spacy.load("en", disable=['parser', 'tagger'])
    tagger = spacy.load("en", disable=['parser', 'ner'])

    cloze_data = []

    q_count = 0
    c_count = 0

    for item in tqdm(input_data, desc="cloze"):
        entry = {}
        entry['title'] = item["document"][0]
        paragraph = {}
        paragraph["context"] = ' '.join(item["document"])

        qas = []

        for sent in item['summary']:
            sent_doc = ner(sent)

            if clause_extract:
                try:
                    clause = get_clause_v2(sent, parser)
                except Exception as e:
                    continue

            for ent in sent_doc.ents:
                answer = ent.text

                question = None
                if clause_extract:
                    for each in clause:
                        if each.find(answer) != -1:
                            question = each.replace(
                                answer, entity_type_map[ent.label_], 1)
                            break
                else:
                    question = sent[:ent.start_char] + \
                            sent[ent.start_char:].replace(answer,entity_type_map[ent.label_], 1)
                if not question:
                    continue

                answer_start = get_answer_start(answer, question,
                                                item['document'], tagger)
                if answer_start == -1:
                    continue

                qas.append({
                    "question":
                    question,
                    "id":
                    "%s_%d" % (item['uid'], q_count),
                    "is_impossible":
                    False,
                    "answers": [{
                        "answer_start": answer_start,
                        "text": answer,
                        "type": ent.label_
                    }],
                    "plausible_answers": []
                })
                q_count += 1

        paragraph['qas'] = qas
        entry['paragraphs'] = [paragraph]

        cloze_data.append(entry)
        #if q_count > 10:
        #    break
        c_count += 1
        if c_count % 2000 == 0:
            print(proc, 'processing %d/%d ...' % (c_count, len(input_data)))

    if proc is not None:
        json.dump(
            cloze_data,
            open(os.path.join(data_dir, 'tmp_store_%d.json' % proc),
                 'w',
                 encoding='utf-8'))

    print('Questions Number', q_count)
    return {"version": "v2.0", 'data': cloze_data}
Example #13
import os, sys, json, time
import benepar

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print('Loading model...')
parser = benepar.Parser("cn_roberta_aux")
#parser = benepar.Parser("/data2/lfsong/exp.parsing/servc.chinese/cn_roberta_aux")

print('Decoding...')
fout = open('test_pred.txt', 'w')
count = 0
st = time.time()
for line in open(sys.argv[1], 'r'):
    tree = parser.parse(line.strip().split())
    fout.write(str(tree)+'\n')
    count += 1

print('Decoding time for {} sentences: {}'.format(count, time.time() - st))
fout.close()