def test_unknown_model(self):
    self.assertRaises(UnknownParserModel, download_and_install_model,
                      "bogusmodelname")
    try:
        download_and_install_model("bogusmodelname")
    except UnknownParserModel as u:
        # make sure these don't crash (this was once a problem...)
        str(u)
        repr(u)
def test_download1(self):
    model_dir = download_and_install_model("WSJ", verbose=False)
    self.failUnless(isinstance(model_dir, str))
    self.failUnless(model_dir)
    self.failUnless(exists(model_dir))
    # make sure it works a second time -- would ideally verify that
    # it didn't download a second time but there's no API for that
    # currently
    model_dir = download_and_install_model("WSJ", verbose=False)
    self.failUnless(isinstance(model_dir, str))
    self.failUnless(model_dir)
    self.failUnless(exists(model_dir))
    shutil.rmtree(model_dir)
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences, then convert the
    results to Stanford Dependencies."""
    from bllipparser.ModelFetcher import download_and_install_model
    from bllipparser import RerankingParser
    # path_to_model = './bllip-parser/models/WSJ+Gigaword'
    # if not os.path.exists(path_to_model):
    model_type = 'WSJ+Gigaword'
    path_to_model = download_and_install_model(model_type,
                                               './bllip-parser/models')
    print "Loading Charniak parser model: %s ..." % (model_type)
    rrp = RerankingParser.from_unified_model_dir(path_to_model)
    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    parsed_trees = ''
    with open(sent_filename, 'r') as f:
        for l in f:
            parsed_trees += rrp.simple_parse(l.strip().split())
            parsed_trees += '\n'
    with open(parsed_filename, 'w') as of:
        of.write(parsed_trees)
    # convert parse tree to dependency tree
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def parse(self, sent_filename):
    """Use the Charniak parser to parse sentences, then convert the
    results to Stanford Dependencies (streaming variant: writes each
    parse as it is produced and logs progress per line)."""
    from bllipparser.ModelFetcher import download_and_install_model
    from bllipparser import RerankingParser
    # path_to_model = './bllip-parser/models/WSJ+Gigaword'
    # if not os.path.exists(path_to_model):
    model_type = 'WSJ+Gigaword'
    path_to_model = download_and_install_model(model_type,
                                               './bllip-parser/models')
    print "Loading Charniak parser model: %s ..." % (model_type)
    rrp = RerankingParser.from_unified_model_dir(path_to_model)
    print "Begin Charniak parsing ..."
    parsed_filename = sent_filename + '.charniak.parse'
    lineno = 0
    with open(sent_filename, 'r') as f, open(parsed_filename, 'w') as of:
        for l in f:
            lineno += 1
            # 'logs' is assumed to be a module-level log stream defined
            # elsewhere (e.g. logs = sys.stderr)
            print >> logs, 'lineno %s, %s' % (lineno, l)
            parsed_trees = rrp.simple_parse(l.strip().split())
            parsed_trees += '\n'
            of.write(parsed_trees)
    # convert parse tree to dependency tree
    print "Convert Charniak parse tree to Stanford Dependency tree ..."
    subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
def test_download3(self):
    model_dir = download_and_install_model("WSJ-PTB3", verbose=True,
                                           models_directory="/tmp/models")
    self.failUnless(isinstance(model_dir, str))
    self.failUnless(model_dir)
    self.failUnless(exists(model_dir))
    shutil.rmtree(model_dir)
def features(docList):
    import time
    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    # pickle streams are binary, so the .pkl files must be opened in 'wb'
    # mode (the original opened them in text mode, which fails in Python 3)
    with open("output_log.txt", "w") as logF, \
            open("syn_feats.pkl", "wb") as synFile, \
            open("syn_scores.pkl", "wb") as scoresFile:
        for i, doc in enumerate(docList):
            start_time = time.time()
            features.append(defaultdict(float))
            scores.append(defaultdict(list))
            for sentence in doc:
                parses = rrp.parse(sentence, rerank=False)
                # print(len(parses))
                # print(sentence, file=logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(
                        len(parses[0].ptb_parse.tokens()))
                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file=logF)
                    for t in best_parse.all_subtrees():
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except Exception:
                    print("No parse available - skipping")
            # freeze the defaultdict into a plain dict
            features[i] = dict(features[i])
            print("{0}".format(
                sorted(features[i].items(),
                       key=operator.itemgetter(1),
                       reverse=True)), file=logF)
            print("--- {0} seconds for {1} sentences ---".format(
                time.time() - start_time, len(doc)))
        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)
    # t_bllip = Timer(lambda: rrp.parse(sentence))
    # print("bllip", t_bllip.timeit(number=5))
def init_model():
    """Downloads and installs the model if necessary.

    :return: model path
    :rtype: str
    """
    logging.info("downloading GENIA+PubMed model if necessary ...")
    return download_and_install_model(
        'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
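# --- A hedged usage sketch (not from the original source): loading the
# model path returned by init_model() with RerankingParser, mirroring how
# the other snippets in this collection load unified model directories.
# The sample sentence is illustrative only.
from bllipparser import RerankingParser

genia_parser = RerankingParser.from_unified_model_dir(init_model())
print(genia_parser.simple_parse('BRCA1 regulates DNA repair.'))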
def __init__(self):
    if CharniakParser.parser is None:
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type,
                                                   './bllip-parser/models')
        print "Loading Charniak parser model: %s ..." % (model_type)
        CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
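# --- For context, a minimal sketch (an assumption, not the original class)
# of the singleton pattern the constructor above relies on: 'parser' is a
# class-level attribute, so the slow model load happens at most once per
# process, no matter how many CharniakParser instances are created.
class CharniakParser(object):
    parser = None  # shared cache, lazily initialized by __init__ above

    def simple_parse(self, sentence):
        # illustrative helper (hypothetical); delegates to the cached model
        return CharniakParser.parser.simple_parse(sentence)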
def test_download2(self):
    model_info = get_model_info("WSJ")
    self.assertEqual(
        str(model_info),
        "Wall Street Journal corpus from Penn Treebank, "
        'version 2 ("AnyDomain" version) [52MB]')
    model_dir = download_and_install_model(model_info.url, verbose=True)
    self.failUnless(isinstance(model_dir, str))
    self.failUnless(model_dir)
    self.failUnless(exists(model_dir))
    shutil.rmtree(model_dir)
def main():
    # model_dir = download_and_install_model('WSJ', '/tmp/models')
    model_dir = download_and_install_model('WSJ+Gigaword-v2', '/tmp/models')
    parser = RerankingParser.from_unified_model_dir(model_dir)
    goodArticles = []
    badArticles = []
    articles = importArticles('trainingSet.dat')
    labels = getFakeGood('trainingSetLabels.dat')
    fg = open('goodArticlesBllip.txt', 'w')
    fb = open('badArticlesBllip.txt', 'w')
    for i, label in enumerate(labels):
        if label == 1:
            goodArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into good sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for good sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            # average parser score over the article (the original looped
            # with an accumulator named 'sum', shadowing the builtin)
            averageScore = sum(float(a) for a in articleScores) / len(articleScores)
            fg.write("%s, %s, %f\n" % (articles[i], articleScores, averageScore))
        if label == 0:
            badArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into bad sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for bad sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            averageScore = sum(float(a) for a in articleScores) / len(articleScores)
            fb.write("%s, %s, %f\n" % (articles[i], articleScores, averageScore))
    fg.close()
    fb.close()
#!/usr/bin/python
from nltk.parse.bllip import BllipParser
from bllipparser.ModelFetcher import download_and_install_model
from nltk import word_tokenize, sent_tokenize
from pymongo import MongoClient
from progressbar import ProgressBar

# version parsing logic so we don't unnecessarily reparse confessions
PARSER_VERSION = 2

# download SANCL2012-Uniform (trained on the WSJ portion of OntoNotes
# and the Google Web Treebank) model if not already present
model_dir = download_and_install_model('SANCL2012-Uniform')

# load model (slow)
print "Loading model (this may take a few minutes)..."
bllip = BllipParser.from_unified_model_dir(model_dir)

# db
client = MongoClient()
db = client.confessions
# db.parses.drop()

# find all raw confessions not already parsed (by this version of the parser)
confessions = db.confessions.find({"parsed": {"$ne": PARSER_VERSION}},
                                  limit=1000)
total = confessions.count()
current = 0
with ProgressBar(max_value=total) as progress:
    for confession in confessions:
from nltk import word_tokenize
from nltk.tree import MultiParentedTree
from awesome_print import ap
import copy
import re
from bllipparser import RerankingParser
import itertools
import urllib
import pydot
import os
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(os.path.join(os.getcwd(), "bllip", "models", "WSJ")):
    print "Downloading the BLLIP model ... "
    download_and_install_model('WSJ',
                               os.path.join(os.getcwd(), "bllip", "models"))
    print "Done downloading."

rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
    graphs = pydot.graph_from_dot_data(data)
    svg_string = graphs[0].create_svg()
    return svg_string


def get_fsm_code(list_of_sentences):
    global rrp
    list_of_sentences = map(lambda sentence: str(sentence).lower(),
                            list_of_sentences)
    list_of_sentences = map(lambda sentence: re.sub(r'\..*', "", sentence),
                            list_of_sentences)
    list_of_parsed_strings = map(lambda sentence: rrp.simple_parse(sentence),
                                 list_of_sentences)
#!/usr/bin/env python
__author__ = "VanKos"
__version__ = '$Id$'

from bllipparser import RerankingParser
from bllipparser.ModelFetcher import download_and_install_model

model_dir = download_and_install_model('WSJ', '/tmp/models')
parser = RerankingParser.from_unified_model_dir(model_dir)

import nltk
from nltk.tree import *
from nltk import treetransforms, bracket_parse
from copy import deepcopy
import sys
import re
import logging
from os.path import join
from collections import defaultdict
import math


class TreeNode:
    def __init__(self):
        self.sName = ''  # stores the name of the node
        self.pChild = list()  # stores the list of pointers to the children of this node
        self.nodeID = 0
        self.production = ''
        self.pre_terminal = 0


class nodePair:
    Nx = TreeNode()
    Nz = TreeNode()


class OrderedTreeNode:
    def __init__(self):
        self.sName = ''  # stores the production at the node
from bllipparser.ModelFetcher import download_and_install_model

# third positional argument enables verbose download progress
model_dir = download_and_install_model('GENIA+PubMed', './model/', True)
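# --- A hedged follow-up sketch (not in the original snippet): load the
# GENIA+PubMed model downloaded above with RerankingParser, as the other
# snippets in this collection do; the sample sentence is illustrative.
from bllipparser import RerankingParser

rrp = RerankingParser.from_unified_model_dir(model_dir)
print(rrp.simple_parse('The protein binds the receptor.'))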