def features(docList):
    import time
    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    # pickle needs binary mode for the two output files
    with open("output_log.txt", "w") as logF, \
         open("syn_feats.pkl", "wb") as synFile, \
         open("syn_scores.pkl", "wb") as scoresFile:
        for i, doc in enumerate(docList):
            start_time = time.time()
            features.append(defaultdict(float))
            scores.append(defaultdict(list))
            for sentence in doc:
                parses = rrp.parse(sentence, rerank=False)
                # print(len(parses))
                # print(sentence, file=logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(len(parses[0].ptb_parse.tokens()))
                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file=logF)
                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except:
                    print("No parse available - skipping")
            features[i] = {x: v for x, v in features[i].items()}
            print("{0}".format(sorted(features[i].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)), file=logF)
            print("--- {0} seconds for {1} sentences ---".format(
                time.time() - start_time, len(doc)))
        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)
    # t_bllip = Timer(lambda: rrp.parse(sentence))
    # print("bllip", t_bllip.timeit(number=5))
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences, then convert the results to
    Stanford Dependencies."""
    from bllipparser.ModelFetcher import download_and_install_model
    from bllipparser import RerankingParser

    # path_to_model = './bllip-parser/models/WSJ+Gigaword'
    # if not os.path.exists(path_to_model):
    model_type = 'WSJ+Gigaword'
    path_to_model = download_and_install_model(model_type, './bllip-parser/models')
    print "Loading Charniak parser model: %s ..." % (model_type)
    rrp = RerankingParser.from_unified_model_dir(path_to_model)

    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    lineno = 0
    with open(sent_filename, 'r') as f, open(parsed_filename, 'w') as of:
        for l in f:
            lineno += 1
            print >> logs, 'lineno %s, %s' % (lineno, l)
            parsed_trees = rrp.simple_parse(l.strip().split())
            parsed_trees += '\n'
            of.write(parsed_trees)

    # convert parse tree to dependency tree
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences, then convert the results to
    Stanford Dependencies."""
    from bllipparser.ModelFetcher import download_and_install_model
    from bllipparser import RerankingParser

    # path_to_model = './bllip-parser/models/WSJ+Gigaword'
    # if not os.path.exists(path_to_model):
    model_type = 'WSJ+Gigaword'
    path_to_model = download_and_install_model(model_type, './bllip-parser/models')
    print "Loading Charniak parser model: %s ..." % (model_type)
    rrp = RerankingParser.from_unified_model_dir(path_to_model)

    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    parsed_trees = ''
    with open(sent_filename, 'r') as f:
        for l in f:
            parsed_trees += rrp.simple_parse(l.strip().split())
            parsed_trees += '\n'
    with open(parsed_filename, 'w') as of:
        of.write(parsed_trees)

    # convert parse tree to dependency tree
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def test_parse(self):
    path = bllip_wrapper.init_model()
    rrp = RerankingParser.from_unified_model_dir(path)
    tree = bllip_wrapper.parse(rrp, 'hello world!')
    self.assertIsNotNone(tree)
    print tree.ptb_parse
    self.assertEqual(str(tree.ptb_parse),
                     '(S1 (S (NP (NN hello) (NN world) (NN !))))')
def __init__(self, papers, presentations):
    self.papers = papers
    self.presentations = presentations
    self.train_features, self.vectorizer = self.createVectorizer(papers, presentations)
    model_dir = find('models/bllip_wsj_no_aux').path
    self.parser = RerankingParser.from_unified_model_dir(model_dir)
def __init__(self):
    if CharniakParser.parser is None:
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type, './bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
def __init__(self, model_dir=None):
    if model_dir is None:
        logging.debug("downloading GENIA+PubMed model if necessary ...")
        model_dir = ModelFetcher.download_and_install_model(
            'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
    self.model_dir = os.path.expanduser(model_dir)
    logging.debug('loading model %s ...', self.model_dir)
    self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
def __init__(self, model_dir=None):
    if model_dir is None:
        logging.debug("downloading GENIA+PubMed model if necessary ...")
        model_dir = ModelFetcher.download_and_install_model(
            'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
    elif 'pathlib' in str(type(model_dir)):
        # avoid python 2/3 compatibility issues with os/pathlib2
        model_dir = str(model_dir)
    self.model_dir = os.path.expanduser(model_dir)
    logging.debug('loading model %s ...' % self.model_dir)
    self.rrp = RerankingParser.from_unified_model_dir(self.model_dir)
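# --- Hedged sketch (not from the sources above) ---
# Once a wrapper like the two __init__ methods above has loaded a unified model, the
# same RerankingParser instance is reused for every sentence. A minimal helper,
# assuming `rrp` was created with RerankingParser.from_unified_model_dir(...):
def best_trees(rrp, sentences):
    """Return the best PTB-style bracketed tree string for each input sentence."""
    trees = []
    for sent in sentences:
        nbest = rrp.parse(sent)              # n-best list; index 0 is the top-ranked parse
        trees.append(str(nbest[0].ptb_parse))
    return trees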
def main():
    # model_dir = download_and_install_model('WSJ', '/tmp/models')
    model_dir = download_and_install_model('WSJ+Gigaword-v2', '/tmp/models')
    parser = RerankingParser.from_unified_model_dir(model_dir)

    goodArticles = []
    badArticles = []
    articles = importArticles('trainingSet.dat')
    labels = getFakeGood('trainingSetLabels.dat')
    fg = open('goodArticlesBllip.txt', 'w')
    fb = open('badArticlesBllip.txt', 'w')

    i = 0
    for label in labels:
        if label == 1:
            goodArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into good sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for good sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            # average parser score over the article
            total = 0.0
            for a in articleScores:
                total += float(a)
            averageScore = total / len(articleScores)
            fg.write("%s, %s, %f\n" % (articles[i], articleScores, averageScore))
        if label == 0:
            badArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into bad sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for bad sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            total = 0.0
            for a in articleScores:
                total += float(a)
            averageScore = total / len(articleScores)
            fb.write("%s, %s, %f\n" % (articles[i], articleScores, averageScore))
        i = i + 1
    fg.close()
    fb.close()
def tokenize(sentence):
    rrp = RerankingParser.from_unified_model_dir(
        '/Users/pranavipotharaju/.local/share/bllipparser')
    sentence = sentence.rstrip("</s>")  # drop the trailing end-of-sentence marker
    # if len(sentence) >= 399:
    #     words = nltk.word_tokenize(sentence)
    #     pos_tags = nltk.pos_tag(words)
    # else:
    #     pos_tags = rrp.tag(sentence)
    try:
        pos_tags = rrp.tag(sentence)
    except Exception as e:
        # fall back to NLTK tokenization and tagging if the bllip tagger fails
        print 'bllip tagging failed, falling back to nltk'
        words = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(words)
        print str(e)
    words, tags = zip(*pos_tags)
    tags = ' '.join(tags)
    return tags + '\n'
def calculate_score(article):
    sc = 0
    lc = 0
    rrp = RerankingParser.from_unified_model_dir(
        '/Users/anushreekumar/.local/share/bllipparser')
    # print "~~~~~~~ Article ~~~~~~~~~~~"
    for sent in article.allSentences:
        # print sent.string
        try:
            best_list = rrp.parse(sent.string)
            score = best_list[0].parser_score
        except Exception as e:
            score = -1000
            print str(e)
        sc += score * sent.length
        lc += sent.length
        # sent_score.append(sc)
        # print sent.length, score / float(sent.length), article.label
    # print "Grammaticality score of ", article.label, " article : ", sc / float(lc)
    return sc / float(lc)
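# --- Hedged sketch (not from the source above) ---
# The aggregation used in calculate_score, separated from the parser so it can be
# checked in isolation: each sentence contributes its parser log-probability weighted
# by its length, and the result is the per-token average.
def length_weighted_average(scores_and_lengths):
    """scores_and_lengths: iterable of (parser_score, sentence_length) pairs."""
    total_score = sum(score * length for score, length in scores_and_lengths)
    total_length = sum(length for _, length in scores_and_lengths)
    return total_score / float(total_length)

# e.g. length_weighted_average([(-42.0, 10), (-15.5, 4)]) == (-420.0 - 62.0) / 14.0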
def __init__(self, biomodel):
    self.parser = RerankingParser.from_unified_model_dir(biomodel.encode('utf-8'))
from nltk.data import find
from bllipparser import RerankingParser
import freq
import sys
import re


################## Forming a parse tree ##################
def textToWords(text):
    return re.findall(r'\w+', text.lower())


sentence = open(sys.argv[1]).read()
model_dir = find('models/bllip_wsj_no_aux').path
parser = RerankingParser.from_unified_model_dir(model_dir)
# {'language': 'En', 'case_insensitive': False, 'nbest': 5, 'small_corpus': True,
#  'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
parser.set_parser_options(case_insensitive=True)

l = parser.parse(sentence)
Trees = [1, 2]
Trees[0] = l.get_reranker_best().ptb_parse
Trees[1] = l.get_parser_best().ptb_parse

S1 = sentence
S1 = S1.split()
sentence = textToWords(sentence)


################# finding all proper nouns in a sentence #################
def sortSecond(val):
    return val[1]


def get_phrase(index, word):
    ans = word
    if index != 0:
        ans = sentence[index - 1] + " " + ans
    if index != len(sentence) - 1:
        ans = ans + " " + sentence[index + 1]
    return ans


def get_phrase1(index, word):
def main(sentence):
    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)
    # {'language': 'En', 'case_insensitive': False, 'nbest': 5, 'small_corpus': True,
    #  'overparsing': 21, 'debug': 0, 'smooth_pos': 0}
    parser.set_parser_options(case_insensitive=True)

    l = parser.parse(sentence)
    Trees = [1, 2]
    Trees[0] = l.get_reranker_best().ptb_parse
    Trees[1] = l.get_parser_best().ptb_parse

    synlist = []
    try:
        for x in range(2):
            synlist += find_syn(Trees[x], 1)[1]
    except:
        pass
    synlist = rem_dupl(synlist)
    synlist = list(map(lambda x: x[1], synlist))
    # print(synlist)

    # split punctuation into separate tokens
    for i in puncts:
        sentence = sentence.replace(i, ' ' + i + ' ')
    # sentence = sentence.replace('\n', ' ')
    WORDS = sentence.split()
    # WORDS = list(map(lambda x: x.lower(), WORDS))
    # now WORDS = list of puncts and lower-cased words in arg text
    # print(WORDS)

    Dict = {}
    it = 1
    for w in synlist:
        while WORDS[it - 1] != w:
            Dict[WORDS[it - 1]] = []
            it = it + 1
        if w in puncts:
            continue
        if w.lower() in iitb_lingo:
            print([it, w, [iitb_lingo[w.lower()]]])
        else:
            synonyms = []
            q = "https://api.datamuse.com/words?ml=" + w
            # build a trigram query; adjacent words shouldn't be IITB lingo or punctuation
            if it > 1 and not (WORDS[it - 2] in iitb_lingo) and not (WORDS[it - 2] in puncts):
                q = q + '&lc=' + WORDS[it - 2]
            if it < len(WORDS) and not (WORDS[it] in iitb_lingo) and not (WORDS[it] in puncts):
                q = q + '&rc=' + WORDS[it]
            response = requests.get(q)
            l = response.json()
            for i in l:
                synonyms.append(i["word"])
            # phrase finder
            # freq = []
            # for i in synonyms:
            #     phrase = i
            #     if w > 0 and not(WORDS[w-1] in iitb_lingo) and not(WORDS[w-1] in puncts):
            #         phrase = WORDS[w-1] + ' ' + phrase
            #     if w < len(WORDS)-1 and not(WORDS[w+1] in iitb_lingo) and not(WORDS[w+1] in puncts):
            #         phrase = phrase + ' ' + WORDS[w+1]
            #     encoded_query = urllib.parse.quote(phrase)
            #     params = {'corpus': 'eng-gb', 'query': encoded_query}
            #     params = '&'.join('{}={}'.format(name, value) for name, value in params.items())
            #     response = requests.get('https://api.phrasefinder.io/search?' + params)
            #     assert response.status_code == 200
            #     if len(response.json()["phrases"]) > 0:
            #         freq.append(response.json()["phrases"][0]["mc"])
            #     else:
            #         freq.append(0)
            # zipped = list(zip(synonyms, freq))
            # zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
            # res = []
            # for i in range(min(3, len(zipped))):
            #     res.append(zipped[i][0])
            # print(res)
            # top 3 synonyms
            Dict[WORDS[it - 1]] = synonyms[:3]
            print([it, w, synonyms[:3]])
        it = it + 1
    return Dict
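# --- Hedged sketch (datamuse_synonyms is not part of the original code) ---
# The Datamuse request built in the loop above uses ml ("means like") plus the optional
# lc/rc left- and right-context parameters; the helper below builds the same query with
# requests' params handling instead of string concatenation.
import requests

def datamuse_synonyms(word, left=None, right=None, limit=3):
    """Return up to `limit` Datamuse suggestions for `word`, optionally biased by context."""
    params = {'ml': word}
    if left:
        params['lc'] = left
    if right:
        params['rc'] = right
    response = requests.get('https://api.datamuse.com/words', params=params)
    return [entry['word'] for entry in response.json()[:limit]]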
def train_svm(kernel_type):
    trainDF = fex.read_data('/home/baseline_AC/train_AC_combined_models.csv')
    testDF = fex.read_data(
        '/home/baseline_AC/test_AC_combined_models_duplicate_included.csv')

    model_dir = find('models/bllip_wsj_no_aux').path
    parser = RerankingParser.from_unified_model_dir(model_dir)

    # ctx = mx.gpu(0)
    # bert = BertEmbedding(ctx=ctx)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # model = BertModel.from_pretrained('bert-base-uncased')
    # tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    # model = RobertaModel.from_pretrained('roberta-base')
    # tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    # model = XLNetModel.from_pretrained('xlnet-base-cased')
    model_path = '/home/AC_models_Argument_corpus/roberta/'
    model = ClassificationModel(
        'roberta', model_path, num_labels=4,
        args={"config": {"output_hidden_states": True}})

    # trainDF_x = fex.extract_features(trainDF, parser)
    # trainDF_x = fex.extract_features(trainDF, parser, tokenizer, model)
    trainDF_x = fex.extract_features(trainDF, parser, model)
    feature_train_x = fex.make_feature_vector(trainDF_x)

    # testDF_x = fex.extract_features(testDF, parser)
    # testDF_x = fex.extract_features(testDF, parser, tokenizer, model)
    testDF_x = fex.extract_features(testDF, parser, model)
    feature_test_x = fex.make_feature_vector(testDF_x)

    # label encode the target variable
    train_y = []
    test_y = []
    for index, row in trainDF.iterrows():
        if row['label'] == 'Claim':
            train_y.append(1)
        elif row['label'] == 'Premise':
            train_y.append(0)
        elif row['label'] == 'MajorClaim':
            train_y.append(3)
        else:
            train_y.append(2)
    for index, row in testDF.iterrows():
        if row['label'] == 'Claim':
            test_y.append(1)
        elif row['label'] == 'Premise':
            test_y.append(0)
        elif row['label'] == 'MajorClaim':
            test_y.append(3)
        else:
            test_y.append(2)
    # train_y = encoder.fit_transform(train_y)

    svmclassifier = svm.SVC(kernel=kernel_type)
    svmclassifier.fit(feature_train_x, train_y)

    filename = 'finalized_model_linear.sav'
    # joblib.dump(svmclassifier, filename)

    y_pred = svmclassifier.predict(feature_test_x)
    print("argument corpus results for test:")
    print(confusion_matrix(test_y, y_pred))
    print(classification_report(test_y, y_pred))

    print("two law set results for test:")
    testlawDF = fex.read_data(
        '/home/baseline_AC/test_judgement_AC_combined_models_duplicate_included.csv')
    # testlawDF_x = fex.extract_features(testlawDF, parser)
    # testlawDF_x = fex.extract_features(testlawDF, parser, tokenizer, model)
    testlawDF_x = fex.extract_features(testlawDF, parser, model)
    feature_test_law = fex.make_feature_vector(testlawDF_x)

    test_y_law = []
    for index, row in testlawDF.iterrows():
        if row['label'] == 'Claim':
            test_y_law.append(1)
        elif row['label'] == 'Premise':
            test_y_law.append(0)
        elif row['label'] == 'MajorClaim':
            test_y_law.append(3)
        else:
            test_y_law.append(2)

    y_pred_2 = svmclassifier.predict(feature_test_law)
    print(confusion_matrix(test_y_law, y_pred_2))
    print(classification_report(test_y_law, y_pred_2))

    filename = 'finalized_model_svm_roberta_finetuned_embedding.sav'
    joblib.dump(svmclassifier, filename)
parser.add_argument("-v", "--verbose", action="store_true", help="print debug information") args = parser.parse_args() if args.verbose: logging.basicConfig(level=logging.DEBUG) if not os.path.isfile(args.input): sys.stderr.write('Cannot find input file: %s\n' % args.input) sys.exit(2) logging.info('Input file: %s' % args.input) logging.info('Output file: %s' % args.output) return args.input, args.output if __name__ == "__main__": inputfilename, outputfilename = parse_argv() model_dir = init_model() logging.info('loading model %s ...' % model_dir) rrp = RerankingParser.from_unified_model_dir(model_dir) collection = parse(inputfilename) collection.clear_infons() collection.infons['tool'] = 'Bllip' collection.infons['process'] = 'parse' parse_bioc(rrp, collection) collection.tobiocfile(outputfilename)
from bllipparser import RerankingParser

rrp = RerankingParser.from_unified_model_dir(
    '/home/kashefi/.local/share/bllipparser/WSJ-PTB3')
sentence = ("In the 3rd level I would place my little brother in. because my little "
            "brother is a very greedy little bot he always wants something.")
pcfg = rrp.simple_parse(sentence.split(' '))
# strip the leading "(S1 " and trailing ")" so only the inner tree remains
pcfg = pcfg[4:len(pcfg) - 1]
print pcfg
'''
pcfg = rrp.simple_parse(sentence)
pcfg = pcfg[4:len(pcfg)-1]
print(pcfg)
'''
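# --- Hedged alternative sketch (reuses rrp and sentence from the snippet above) ---
# Rather than slicing the bracketed string returned by simple_parse, the n-best API
# exposes Tree objects directly, which avoids hard-coding the 4-character "(S1 " offset.
nbest = rrp.parse(sentence)
top = nbest[0].ptb_parse            # bllipparser Tree for the best-ranked parse
print str(top)                      # same bracketed form that simple_parse returns
print top.tokens()                  # the tokens as the parser segmented them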
from bllipparser import RerankingParser as rrp
from nltk.parse.api import ParserI
from nltk.tree import Tree
from nltk.data import find

model_dir = find('models/bllip_wsj_no_aux').path
bllip = rrp.from_unified_model_dir(model_dir)

f = open("../Fragments_for_testing/text2", "r")
sentence = f.read()

all_parses = bllip.parse(sentence)
ptb = all_parses[0].ptb_parse
tree = Tree.fromstring(str(ptb))
tree.draw()
""" Create the Semantic Representation """ sentenceList = [] # a list for the SemanticRepresentation objects workPath = os.getcwd() dependencyInputFile = workPath+'/senna/input.txt' with open(dependencyInputFile) as f: dlines = f.readlines() #Load model to parse PENN TreeBank print 'Loading parsing model...' # only for the first run (uncomment the following line): # rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True) # when it is not the first run: rrp = RerankingParser.from_unified_model_dir('/Users/evania/.local/share/bllipparser/WSJ-PTB3') # Load model to parse PENN TreeBank - is finished #Now try to parse the text: print 'Parsing the dependency for the sentence(s)...' len_dlines = len(dlines) count_dlines = 1 for l in dlines: if l != '\n': # if not an empty line theDependencyResult = getDependency(l) theID = 0 semList = [] for token in theDependencyResult: #print token stringToken = str(token) sem = SemanticRepresentation()
import copy
import itertools
import os
import re
import urllib

import pydot
from bllipparser import RerankingParser
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(os.path.join(os.getcwd(), "bllip", "models", "WSJ")):
    print "Downloading the BLLIP model ... "
    download_and_install_model('WSJ', os.path.join(os.getcwd(), "bllip", "models"))
    print "Done Downloading."

rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
    graphs = pydot.graph_from_dot_data(data)
    svg_string = graphs[0].create_svg()
    return svg_string


def get_fsm_code(list_of_sentences):
    global rrp
    list_of_sentences = map(lambda sentence: (str(sentence)).lower(), list_of_sentences)
    list_of_sentences = map(lambda sentence: re.sub(r'\..*', "", sentence), list_of_sentences)
    list_of_parsed_strings = map(lambda sentence: rrp.simple_parse(sentence), list_of_sentences)
    list_of_codified_parse_strings = map(
        lambda parse_string: ParseForest.codify_parse_string(parse_string),
        list_of_parsed_strings)
    list_of_parse_forests = map(
        lambda codified_parse_string: ParseForest(codified_parse_string),
        list_of_codified_parse_strings)
    # list_of_parse_forests = map(lambda codified_parse_string: ParseForest(codified_parse_string), list_of_parsed_strings)