Example #1
    def test_unknown_model(self):
        self.assertRaises(UnknownParserModel, download_and_install_model,
                          "bogusmodelname")
        try:
            download_and_install_model("bogusmodelname")
        except UnknownParserModel as u:
            # make sure these don't crash (this was once a problem...)
            str(u)
            repr(u)
Example #2
    def test_download1(self):
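        # download_and_install_model returns the path of the installed model directory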
        model_dir = download_and_install_model("WSJ", verbose=False)
        self.assertTrue(isinstance(model_dir, str))
        self.assertTrue(model_dir)
        self.assertTrue(exists(model_dir))

        # make sure it works a second time -- would ideally verify that
        # it didn't download a second time but there's no API for that
        # currently
        model_dir = download_and_install_model("WSJ", verbose=False)
        self.assertTrue(isinstance(model_dir, str))
        self.assertTrue(model_dir)
        self.assertTrue(exists(model_dir))

        shutil.rmtree(model_dir)
Example #3
    def parse(self, sent_filename):
        """
        Use the Charniak parser to parse sentences, then convert the results
        to Stanford Dependencies.
        """
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        import subprocess
        # path_to_model = './bllip-parser/models/WSJ+Gigaword'
        # if not os.path.exists(path_to_model):
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type, './bllip-parser/models')
        print("Loading Charniak parser model: %s ..." % model_type)
        rrp = RerankingParser.from_unified_model_dir(path_to_model)
        print("Begin Charniak parsing ...")
        parsed_filename = sent_filename + '.charniak.parse'
        parsed_trees = ''
        with open(sent_filename, 'r') as f:
            for l in f:
                # simple_parse accepts a pre-tokenized sentence (list of
                # tokens) and returns the best parse as a Penn Treebank string
                parsed_trees += rrp.simple_parse(l.strip().split())
                parsed_trees += '\n'

        with open(parsed_filename, 'w') as of:
            of.write(parsed_trees)

        # convert parse tree to dependency tree
        print("Convert Charniak parse tree to Stanford Dependency tree ...")
        subprocess.call('./scripts/stdconvert.sh ' + parsed_filename, shell=True)
Example #4
    def parse(self, sent_filename):
        """
        Use the Charniak parser to parse sentences, then convert the results
        to Stanford Dependencies.
        """
        from bllipparser.ModelFetcher import download_and_install_model
        from bllipparser import RerankingParser
        import subprocess
        # path_to_model = './bllip-parser/models/WSJ+Gigaword'
        # if not os.path.exists(path_to_model):
        model_type = 'WSJ+Gigaword'
        path_to_model = download_and_install_model(model_type,
                                                   './bllip-parser/models')
        print("Loading Charniak parser model: %s ..." % model_type)
        rrp = RerankingParser.from_unified_model_dir(path_to_model)
        print("Begin Charniak parsing ...")
        parsed_filename = sent_filename + '.charniak.parse'
        lineno = 0
        with open(sent_filename, 'r') as f, open(parsed_filename, 'w') as of:
            for l in f:
                lineno += 1
                # 'logs' is assumed to be a module-level log stream,
                # e.g. logs = sys.stderr
                print('lineno %s, %s' % (lineno, l), file=logs)
                parsed_tree = rrp.simple_parse(l.strip().split())
                of.write(parsed_tree + '\n')

        # convert parse tree to dependency tree
        print("Convert Charniak parse tree to Stanford Dependency tree ...")
        subprocess.call('./scripts/stdconvert.sh ' + parsed_filename,
                        shell=True)
Example #5
    def test_download3(self):
        model_dir = download_and_install_model("WSJ-PTB3", verbose=True, models_directory="/tmp/models")
        self.assertTrue(isinstance(model_dir, str))
        self.assertTrue(model_dir)
        self.assertTrue(exists(model_dir))

        shutil.rmtree(model_dir)
Example #6
def features(docList):
    import time
    import operator
    import pickle
    from collections import defaultdict
    from bllipparser import RerankingParser
    from bllipparser.ModelFetcher import download_and_install_model

    # download model (only needs to be done once)
    model_dir = download_and_install_model('WSJ', '/tmp/models')
    # Loading the model is slow, but only needs to be done once
    rrp = RerankingParser.from_unified_model_dir(model_dir)
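    # keep the 5 best parses per sentence in the n-best list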
    rrp.set_parser_options(nbest=5)
    features = []
    scores = []
    with open("output_log.txt",
              "w") as logF, open("syn_feats.pkl",
                                 "w") as synFile, open("syn_scores.pkl",
                                                       "w") as scoresFile:

        for i, doc in enumerate(docList):
            start_time = time.time()

            features.append(defaultdict(float))
            scores.append(defaultdict(list))

            for sentence in doc:
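                # rrp.parse returns an n-best list of ScoredParse objects;
                # rerank=False skips the reranking stage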

                parses = rrp.parse(sentence, rerank=False)
                #print(len(parses))
                #print(sentence, file = logF)
                try:
                    parse_score = parses[0].parser_score
                    rerank_score = parses[0].reranker_score
                    scores[i]['parse'].append(parse_score)
                    scores[i]['rerank'].append(rerank_score)
                    scores[i]['sent_length'].append(
                        len(parses[0].ptb_parse.tokens()))

                    best_parse = parses[0].ptb_parse
                    # print(best_parse, file = logF)

                    for t in best_parse.all_subtrees():
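                        # buildSubtrees is a helper defined elsewhere in the
                        # source module; it yields feature labels for a subtree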
                        levels = buildSubtrees(t)
                        for l in levels:
                            features[i][l] += 1.0
                except Exception:
                    print("No parse available - skipping")
            features[i] = {x: v for x, v in features[i].items()}
            print("{0}".format(
                sorted(features[i].items(),
                       key=operator.itemgetter(1),
                       reverse=True)),
                  file=logF)
            print("--- {0} seconds for {1} sentences ---".format(
                time.time() - start_time, len(doc)))

        pickle.dump(features, synFile)
        pickle.dump(scores, scoresFile)


#     t_bllip = Timer(lambda: rrp.parse(sentence))
#     print ("bllip", t_bllip.timeit(number=5))

    pass
Example #7
def init_model():
    """Downloads and installs model if necessary.

    :return: model path
    :rtype: str
    """
    logging.info("downloading GENIA+PubMed model if necessary ...")
    return download_and_install_model(
        'GENIA+PubMed', os.path.join(tempfile.gettempdir(), 'models'))
Example #8
    def __init__(self):
        # CharniakParser.parser is a class attribute, so the model is
        # loaded only once and shared across instances
        if CharniakParser.parser is None:
            from bllipparser.ModelFetcher import download_and_install_model
            from bllipparser import RerankingParser
            model_type = 'WSJ+Gigaword'
            path_to_model = download_and_install_model(model_type,
                                                       './bllip-parser/models')
            print("Loading Charniak parser model: %s ..." % model_type)
            CharniakParser.parser = RerankingParser.from_unified_model_dir(path_to_model)
Example #9
    def test_download2(self):
        model_info = get_model_info("WSJ")
        self.assertEqual(
            str(model_info),
            'Wall Street Journal corpus from Penn Treebank, '
            'version 2 ("AnyDomain" version) [52MB]')
        model_dir = download_and_install_model(model_info.url, verbose=True)
        self.assertTrue(isinstance(model_dir, str))
        self.assertTrue(model_dir)
        self.assertTrue(exists(model_dir))

        shutil.rmtree(model_dir)
Example #11
def main():
    #   model_dir = download_and_install_model('WSJ', '/tmp/models')
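    # WSJ+Gigaword-v2 is the larger self-trained model; the smaller WSJ model
    # (commented out above) also works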
    model_dir = download_and_install_model('WSJ+Gigaword-v2', '/tmp/models')
    parser = RerankingParser.from_unified_model_dir(model_dir)
    goodArticles = []
    badArticles = []
    articles = importArticles('trainingSet.dat')
    labels = getFakeGood('trainingSetLabels.dat')
    fg = open('goodArticlesBllip.txt', 'w')
    fb = open('badArticlesBllip.txt', 'w')
    for i, label in enumerate(labels):
        if label == 1:
            goodArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into good sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for good sentence: %s" %
                              sentenceBestScore)
                articleScores.append(sentenceBestScore)
            averageScore = sum(float(a) for a in articleScores) / len(articleScores)
            fg.write("%s, %s, %f\n" %
                     (articles[i], articleScores, averageScore))
        if label == 0:
            badArticles.append(articles[i])
            articleScores = []
            for sentence in articles[i]:
                logging.debug("Looking into bad sentence: %s" % sentence)
                sentenceParses = parser.parse(sentence, 1)
                sentenceBestScore = sentenceParses[0].parser_score
                logging.debug("Score for bad sentence: %s" % sentenceBestScore)
                articleScores.append(sentenceBestScore)
            averageScore = sum(float(a) for a in articleScores) / len(articleScores)
            fb.write("%s, %s, %f\n" %
                     (articles[i], articleScores, averageScore))
    fg.close()
    fb.close()
Example #12
#!/usr/bin/python
from nltk.parse.bllip import BllipParser
from bllipparser.ModelFetcher import download_and_install_model
from nltk import word_tokenize, sent_tokenize
from pymongo import MongoClient
from progressbar import ProgressBar

# version parsing logic so we don't unnecessarily reparse confessions
PARSER_VERSION = 2

# download SANCL2012-Uniform (trained on WSJ Portion of OntoNotes
# and Google Web Treebank) model if not already present
model_dir = download_and_install_model('SANCL2012-Uniform')

# load model (slow)
print "Loading model (this may take a few minutes)..."
bllip = BllipParser.from_unified_model_dir(model_dir)

# db
client = MongoClient()
db = client.confessions
# db.parses.drop()

# find all raw confessions not already parsed (by this version of the parser)
confessions = db.confessions.find({
    "parsed": { "$ne": PARSER_VERSION }
}, limit=1000)
total = confessions.count()
current = 0
with ProgressBar(max_value=total) as progress:
    for confession in confessions:
Example #13
from nltk import word_tokenize
from nltk.tree import MultiParentedTree
from awesome_print import ap
import copy
import re
from bllipparser import RerankingParser
import itertools
import urllib
import pydot
import os
from bllipparser.ModelFetcher import download_and_install_model

if not os.path.exists(os.path.join(os.getcwd(), "bllip", "models", "WSJ")):
	print("Downloading the BLLIP model ...")
	download_and_install_model('WSJ', os.path.join(os.getcwd(), "bllip", "models"))
	print("Done Downloading.")

# load the unified model from the directory checked/installed above
rrp = RerankingParser.from_unified_model_dir('bllip/models/WSJ')


def get_svg(data):
	graphs = pydot.graph_from_dot_data(data)
	svg_string = graphs[0].create_svg()
	return svg_string

def get_fsm_code(list_of_sentences):
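	# uses the module-level parser rrp loaded above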
	global rrp
	list_of_sentences = [str(sentence).lower() for sentence in list_of_sentences]
	list_of_sentences = [re.sub(r'\..*', "", sentence) for sentence in list_of_sentences]
	list_of_parsed_strings = [rrp.simple_parse(sentence) for sentence in list_of_sentences]
Example #14
#!/usr/bin/env python
__author__ = "VanKos"
__version__ = '$Id$'

from bllipparser import RerankingParser
from bllipparser.ModelFetcher import download_and_install_model
model_dir = download_and_install_model('WSJ', '/tmp/models')
parser = RerankingParser.from_unified_model_dir(model_dir)
import nltk
from nltk.tree import *
from nltk import treetransforms, bracket_parse 
from copy import deepcopy 
import sys
import re
import logging
from os.path import join
from collections import defaultdict      
import math
class TreeNode:
	def __init__(self):
		self.sName = ''  # stores the name of the node
		self.pChild = list()  # stores the list of pointers to the children of this node
		self.nodeID = 0
		self.production = ''
		self.pre_terminal = 0
class nodePair:
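	# Nx and Nz are class attributes, shared by all nodePair instances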
	Nx = TreeNode()
	Nz = TreeNode()
class OrderedTreeNode:
	def __init__(self):
		self.sName = ''  # stores the production at the node
Example #15
from bllipparser.ModelFetcher import download_and_install_model

# download the GENIA+PubMed model into ./model/; the third positional argument enables verbose output
model_dir = download_and_install_model('GENIA+PubMed', './model/', True)