def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [
        find('corpora/ace_data/ace.dev'),
        find('corpora/ace_data/ace.heldout'),
        find('corpora/ace_data/bbn.dev'),
        find('corpora/ace_data/muc.dev')
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    out = open(outfilename, 'wb')
    pickle.dump(cp, out, -1)
    out.close()

    return cp
def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Read the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are on the second-to-last line of the file.
        # The first and last items on that line are the metric label and the
        # system name, so they are sliced off.
        # (Wrapped in list() so the scores are not exhausted by the first loop
        # below when map() returns an iterator under Python 3.)
        mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace-tokenize the files.
            # Note: split() also strips the trailing newline.
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0 / i,) * i)
                # Check that the BLEU score difference is less than 0.005.
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005
            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0 / i,) * i,
                                        smoothing_function=chencherry.method3)
                assert abs(mteval_bleu - nltk_bleu) < 0.005
def nltk_download_corpus(resource_path): """ Download the specified NLTK corpus file unless it has already been downloaded. Returns True if the corpus needed to be downloaded. """ from nltk.data import find from nltk import download from os.path import split, sep from zipfile import BadZipfile # Download the NLTK data only if it is not already downloaded _, corpus_name = split(resource_path) if not resource_path.endswith(sep): resource_path = resource_path + sep downloaded = False try: find(resource_path) except LookupError: download(corpus_name) downloaded = True except BadZipfile: raise BadZipfile( 'The NLTK corpus file being opened is not a zipfile, ' 'or it has been corrupted and needs to be manually deleted.') return downloaded
def test_sentence_nist(self): ref_file = find("models/wmt15_eval/ref.ru") hyp_file = find("models/wmt15_eval/google.ru") mteval_output_file = find("models/wmt15_eval/mteval-13a.output") # Reads the NIST scores from the `mteval-13a.output` file. # The order of the list corresponds to the order of the ngrams. with open(mteval_output_file) as mteval_fin: # The numbers are located in the last 4th line of the file. # The first and 2nd item in the list are the score and system names. mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1]) with open(ref_file, encoding="utf8") as ref_fin: with open(hyp_file, encoding="utf8") as hyp_fin: # Whitespace tokenize the file. # Note: split() automatically strip(). hypotheses = list(map(lambda x: x.split(), hyp_fin)) # Note that the corpus_bleu input is list of list of references. references = list(map(lambda x: [x.split()], ref_fin)) # Without smoothing. for i, mteval_nist in zip(range(1, 10), mteval_nist_scores): nltk_nist = corpus_nist(references, hypotheses, i) # Check that the NIST scores difference is less than 0.5 assert abs(mteval_nist - nltk_nist) < 0.05
def build_model(fmt="binary"): print("Loading training data...") train_paths = [ find("corpora/ace_data/ace.dev"), find("corpora/ace_data/ace.heldout"), find("corpora/ace_data/bbn.dev"), find("corpora/ace_data/muc.dev"), ] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print("Training...") cp = NEChunkParser(train_data) del train_data print("Loading eval data...") eval_paths = [find("corpora/ace_data/ace.eval")] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print("Evaluating...") chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = f"/tmp/ne_chunker_{fmt}.pickle" print(f"Saving chunker to {outfilename}...") with open(outfilename, "wb") as outfile: pickle.dump(cp, outfile, -1) return cp
def build_model(fmt='binary'): print('Loading training data...') train_paths = [ find('corpora/ace_data/ace.dev'), find('corpora/ace_data/ace.heldout'), find('corpora/ace_data/bbn.dev'), find('corpora/ace_data/muc.dev') ] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print('Training...') cp = NEChunkParser(train_data) del train_data print('Loading eval data...') eval_paths = [find('corpora/ace_data/ace.eval')] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print('Evaluating...') chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt) print('Saving chunker to {0}...'.format(outfilename)) with open(outfilename, 'wb') as outfile: pickle.dump(cp, outfile, -1) return cp
def demo(): from itertools import islice # zip_path = find('corpora/toolbox.zip') # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse() file_path = find('corpora/toolbox/rotokas.dic') lexicon = ToolboxData(file_path).parse() print('first field in fourth record:') print(lexicon[3][0].tag) print(lexicon[3][0].text) print('\nfields in sequential order:') for field in islice(lexicon.find('record'), 10): print(field.tag, field.text) print('\nlx fields:') for field in islice(lexicon.findall('record/lx'), 10): print(field.text) settings = ToolboxSettings() file_path = find('corpora/toolbox/MDF/MDF_AltH.typ') settings.open(file_path) # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ')) tree = settings.parse(unwrap=False, encoding='cp1252') print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text) settings_tree = ElementTree(tree) print(to_settings_string(settings_tree).encode('utf8'))
def demo():
    from itertools import islice

    # zip_path = find('corpora/toolbox.zip')
    # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
def build_model(fmt="binary"): print("Loading training data...") train_paths = [ find("corpora/ace_data/ace.dev"), find("corpora/ace_data/ace.heldout"), find("corpora/ace_data/bbn.dev"), find("corpora/ace_data/muc.dev"), ] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print("Training...") cp = NEChunkParser(train_data) del train_data print("Loading eval data...") eval_paths = [find("corpora/ace_data/ace.eval")] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print("Evaluating...") chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = "/tmp/ne_chunker_%s.pickle" % fmt print("Saving chunker to %s..." % outfilename) with open(outfilename, "wb") as out: pickle.dump(cp, out, -1) return cp
def build_model(fmt='binary'): print('Loading training data...') train_paths = [find('corpora/ace_data/ace.dev'), find('corpora/ace_data/ace.heldout'), find('corpora/ace_data/bbn.dev'), find('corpora/ace_data/muc.dev')] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print('Training...') cp = NEChunkParser(train_data) del train_data print('Loading eval data...') eval_paths = [find('corpora/ace_data/ace.eval')] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print('Evaluating...') chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = '/tmp/ne_chunker_%s.pickle' % fmt print('Saving chunker to %s...' % outfilename) with open(outfilename, 'wb') as outfile: pickle.dump(cp, outfile, -1) return cp
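# A minimal usage sketch for the build_model() variants above. It assumes the
# ACE training files really are installed under corpora/ace_data (they are not
# part of the public NLTK data distribution) and relies on the fact, visible in
# the evaluation loop, that the trained chunker's parse() takes a POS-tagged
# sentence, i.e. a list of (word, tag) pairs. The helper name below is
# illustrative only.
import pickle

from nltk import pos_tag, word_tokenize

def chunk_sentence_with_saved_model(sentence, pickle_path='/tmp/ne_chunker_binary.pickle'):
    # Reload the chunker that build_model() pickled to /tmp.
    with open(pickle_path, 'rb') as infile:
        chunker = pickle.load(infile)
    # The chunker expects POS-tagged tokens, so tag the raw sentence first.
    return chunker.parse(pos_tag(word_tokenize(sentence)))

# Only meaningful after build_model('binary') has been run once:
# print(chunk_sentence_with_saved_model('John works for the United Nations.'))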
def nltk_download_corpus(resource_path): """ Download the specified NLTK corpus file unless it has already been downloaded. Returns True if the corpus needed to be downloaded. """ from nltk.data import find from nltk import download from os.path import split # Download the wordnet data only if it is not already downloaded _, corpus_name = split(resource_path) ## From http://www.nltk.org/api/nltk.html ## # When using find() to locate a directory contained in a zipfile, # the resource name must end with the forward slash character. # Otherwise, find() will not locate the directory. #### # Helps when resource_path=='sentiment/vader_lexicon'' if not resource_path.endswith('/'): resource_path = resource_path + '/' downloaded = False try: find(resource_path) except LookupError: download(corpus_name) downloaded = True return downloaded
def demo(): from itertools import islice # zip_path = find('corpora/toolbox.zip') # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse() file_path = find("corpora/toolbox/rotokas.dic") lexicon = ToolboxData(file_path).parse() print("first field in fourth record:") print(lexicon[3][0].tag) print(lexicon[3][0].text) print("\nfields in sequential order:") for field in islice(lexicon.find("record"), 10): print(field.tag, field.text) print("\nlx fields:") for field in islice(lexicon.findall("record/lx"), 10): print(field.text) settings = ToolboxSettings() file_path = find("corpora/toolbox/MDF/MDF_AltH.typ") settings.open(file_path) # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ')) tree = settings.parse(unwrap=False, encoding="cp1252") print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text) settings_tree = ElementTree(tree) print(to_settings_string(settings_tree).encode("utf8"))
def load_model(word2vec_modelfname=None, is_binary=False): if word2vec_modelfname == 'GoogleNews-vectors-negative300.bin.gz': try: sem_model = gensim.models.KeyedVectors.load_word2vec_format( word2vec_modelfname, binary=True) print( "Google-news model of %d words, each represented by %d-dimensional vectors, successfully loaded." % (len(sem_model.vocab), sem_model.vector_size)) return sem_model except FileNotFoundError: sys.stderr.write( "Model file with name %s not found in directory. Please download it from %s\n(direct link to google drive: %s)." % (word2vec_modelfname, "https://code.google.com/p/word2vec/", "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing" )) return None if word2vec_modelfname is None: try: word2vec_modelfname = str( find('models/word2vec_sample/pruned.word2vec.txt')) is_binary = False except LookupError: nltk.download('word2vec_sample') word2vec_modelfname = str( find('models/word2vec_sample/pruned.word2vec.txt')) sem_model = gensim.models.KeyedVectors.load_word2vec_format( word2vec_modelfname, binary=is_binary) print( "Semantic model of %d words, each represented by %d-dimensional vectors, successfully loaded." % (len(sem_model.vocab), sem_model.vector_size)) return sem_model
def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Read the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are on the second-to-last line of the file.
        # The first and last items on that line are the metric label and the
        # system name, so they are sliced off.
        # (Wrapped in list() so the scores are not exhausted by the first loop
        # below when map() returns an iterator under Python 3.)
        mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace-tokenize the files.
            # Note: split() also strips the trailing newline.
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0 / i,) * i)
                # Check that the BLEU score difference is less than 0.005.
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005
            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0 / i,) * i,
                                        smoothing_function=chencherry.method3)
                assert abs(mteval_bleu - nltk_bleu) < 0.005
def __init__(self): from nltk.data import find from nltk import download try: find('wordnet.zip') except LookupError: download('wordnet')
def __init__(self): from nltk.data import find from nltk import download # Download the punkt data only if it is not already downloaded try: find('punkt.zip') except LookupError: download('punkt')
def __init__(self): from nltk.data import find from nltk import download # Download the stopwords data only if it is not already downloaded try: find('stopwords.zip') except LookupError: download('stopwords')
def __init__(self): """ Contains various synset related functions. """ try: data.find(os.path.join("corpora", "wordnet")) except LookupError: download("wordnet") self.API = ImageNetAPI()
def require(corpora: list = []): """Download the required NLTK corpus if not found. Keyword Arguments: corpora {list} -- The identifier or name of NLTK corpus (default: {[]}) """ for corpus in corpora: try: find(corpus) except LookupError: download(corpus)
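# Usage sketch for require() above, assuming `find` and `download` are
# nltk.data.find and nltk.download imported at module level. The same string is
# passed to both calls, so downloader identifiers such as 'punkt' or
# 'stopwords' are the intended inputs; for those, find() only succeeds when the
# resource sits directly on an nltk.data path, so the downloader may be
# re-invoked (and simply report the package as up to date) on later runs.
require(['punkt', 'stopwords', 'wordnet'])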
def create_app(config, debug=False, testing=False, config_overrides=None): app = Flask(__name__) app.config.from_object(config) app.debug = debug app.testing = testing if config_overrides: app.config.update(config_overrides) # Configure logging if not app.testing: logging.basicConfig(level=logging.INFO) # Setup the data model. with app.app_context(): model = get_model() model.init_app(app) # Register the Bookshelf CRUD blueprint. from .crud import crud app.register_blueprint(crud, url_prefix='/wcloud') # Add a default root route. @app.route("/") def index(): return redirect(url_for('crud.wcloud')) # Add an error handler. This is useful for debugging the live application, # however, you should disable the output of the exception for production # applications. @app.errorhandler(500) def server_error(e): return """ An internal error occurred: <pre>{}</pre> See logs for full stacktrace. """.format(e), 500 #download nltk corpus first time ran, if needed try: data.find('tokenizers/tokenize') except LookupError: #hack due to SSL issue with downloading from wrong local location #obviously not the right thing to do, just to get working for now try: _create_unverified_https_context = ssl._create_unverified_context except AttributeError: pass else: ssl._create_default_https_context = _create_unverified_https_context download('popular') return app
def setup_nltk(self, **kw): import nltk from nltk.data import find tagger = "averaged_perceptron_tagger" try: find("taggers/%s" % tagger) except LookupError: click.echo("Downloading NTLK data (~2MB)...") nltk.download(tagger) return True return False
def __init__(self, filename='drt_glue.semtype'):
    try:
        f = open(data.find('grammars/%s' % filename))
    except LookupError:
        f = open(filename)
    lines = f.readlines()
    f.close()

    for line in lines:
        # example: 'verb : (\\x.(<word> x), ( subj -o f )) : [subj]'
        #                lambdacalc -^  linear logic -^
        line = line.strip()  # remove trailing newline
        if not len(line):
            continue  # skip empty lines
        if line[0] == '#':
            continue  # skip commented out lines

        parts = line.split(' : ')  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

        glue_formulas = []
        parenCount = 0
        tuple_start = 0
        tuple_comma = 0

        relationships = None

        for i in range(len(parts[1])):
            if parts[1][i] == '(':
                if parenCount == 0:  # if it's the first '(' of a tuple
                    tuple_start = i + 1  # then save the index
                parenCount += 1
            elif parts[1][i] == ')':
                parenCount -= 1
                if parenCount == 0:  # if it's the last ')' of a tuple
                    meaning_term = parts[1][tuple_start:tuple_comma]  # '\\x.(<word> x)'
                    glue_term = parts[1][tuple_comma + 1:i]  # '(v-r)'
                    glue_formulas.append([meaning_term, glue_term])  # add the GlueFormula to the list
                    if len(parts) > 2:
                        relationships = frozenset([
                            r.strip()
                            for r in parts[2][parts[2].index('[') + 1:parts[2].index(']')].split(',')
                        ])
            elif parts[1][i] == ',' or parts[1][i] == ':':
                if parenCount == 1:  # if it's a comma separating the parts of the tuple
                    tuple_comma = i  # then save the index
            elif parts[1][i] == '#':  # skip comments at the ends of lines
                if parenCount != 0:  # if the line hasn't parsed correctly so far
                    raise RuntimeError('Formula syntax is incorrect for entry %s' % line)
                break  # break to the next line

        if parts[0] in self:
            self[parts[0]][relationships] = glue_formulas
        else:
            self[parts[0]] = {relationships: glue_formulas}  # add the glue entry to the dictionary
def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus":
                corpus = value
            if key == "Id":
                corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')

        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
def construct_pos_list(word2vec_sample_path): nltk.download('word2vec_sample') word2vec_sample = str(find(word2vec_sample_path)) model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) position_list = [ 'left', 'right', 'above', 'below', 'inside', 'surrounding' ] x = [] y = [] posi_lists = [] for i in range(len(position_list)): tmp_lists = [] numberofsyn = len(wn.synsets(position_list[i])) for j in range(numberofsyn): for w in wn.synsets(position_list[i])[j].lemma_names(): if '_' not in w and w in model: tmp_lists.append(w) x.append(model[w]) y.append(i) posi_lists.append(list(set(tmp_lists))) pca = PCA(n_components=10) pca.fit(x) x = pca.transform(x) x = np.array(x) y = np.array(y) clf = svm.SVC() clf.fit(x, y) return posi_lists, x, y, pca, clf, model
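# Hedged usage sketch for construct_pos_list() above. The sample path is the
# one NLTK's 'word2vec_sample' package installs; the helper name and the query
# word below are illustrative only.
posi_lists, x, y, pca, clf, model = construct_pos_list(
    'models/word2vec_sample/pruned.word2vec.txt')

def classify_position_word(word):
    # Project the word vector with the fitted PCA, then let the SVM pick one of
    # the six position classes (left/right/above/below/inside/surrounding).
    if word not in model:
        return None
    return int(clf.predict(pca.transform([model[word]]))[0])

print(classify_position_word('beneath'))  # ideally lands near the 'below' class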
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    try:
        import sqlite3
        path = find(dbname)
        connection = sqlite3.connect(path)
        # return ASCII strings if possible
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        return cur.execute(query)
    except ImportError:
        import warnings
        warnings.warn(
            "To run this function, first install pysqlite, or else use Python 2.5 or later."
        )
        raise
    except ValueError:
        import warnings
        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
def _vocabulary(self): return ( data.find('stemmers/porter_test/porter_vocabulary.txt') .open(encoding='utf-8') .read() .splitlines() )
def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus":
                corpus = value
            if key == "Id":
                corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')

        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
def _vocabulary(self): with closing( data.find('stemmers/porter_test/porter_vocabulary.txt').open( encoding='utf-8' ) ) as fp: return fp.read().splitlines()
def __init__(self, papers, presentations): self.papers = papers self.presentations = presentations self.train_features, self.vectorizer = self.createVectorizer( papers, presentations) model_dir = find('models/bllip_wsj_no_aux').path self.parser = RerankingParser.from_unified_model_dir(model_dir)
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    # print(content)
    if Verbose:
        echo2("Incoming content is " + content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    # date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    # names.extend(date_time)
    result = {"result": "success", "names": names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?}, unit: {<CD><JJ>?<NN.*>} '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
def load_model(word2vec_modelfname=None, is_binary=False): if word2vec_modelfname is None: try: word2vec_modelfname = str( find('models/word2vec_sample/pruned.word2vec.txt')) is_binary = False except LookupError: nltk.download('word2vec_sample') word2vec_modelfname = str( find('models/word2vec_sample/pruned.word2vec.txt')) model = gensim.models.KeyedVectors.load_word2vec_format( word2vec_modelfname, binary=is_binary) print( "Semantic model of %d words, each represented by %d-dimensional vectors, successfully loaded." % (len(model.vocab), model.vector_size)) return model
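# Usage sketch for load_model() above. With no arguments it falls back to the
# pruned word2vec sample shipped via nltk.download('word2vec_sample'), so only
# gensim and that NLTK package are assumed. Note that the `.vocab` attribute
# used in the print statement belongs to the pre-4.0 gensim API; on gensim 4.x
# the equivalent is `.key_to_index`.
model = load_model()
if model is not None:
    print(model.most_similar('university', topn=5))
    print(model.similarity('car', 'truck'))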
def get_instance(cls): if not cls._instance: model_dir = find('models/bllip_wsj_no_aux').path bllipParser = BllipParser.from_unified_model_dir(model_dir) Parser._instance = Parser() Parser._instance._initialize(bllipParser) return Parser._instance
def test_vocabulary_nltk_mode(self): self._test_against_expected_output( PorterStemmer.NLTK_EXTENSIONS, data.find('stemmers/porter_test/porter_nltk_output.txt') .open(encoding='utf-8') .read() .splitlines() )
def __init__(self, load=True): self.model = AveragedPerceptron() self.tagdict = {} self.classes = set() if load: AP_MODEL_LOC = "file:" + str( find("taggers/averaged_perceptron_tagger/" + PICKLE)) self.load(AP_MODEL_LOC)
def _get_tagger(lang=None): if lang == "rus": tagger = PerceptronTagger(False) ap_russian_model_loc = "file:" + str(find(RUS_PICKLE)) tagger.load(ap_russian_model_loc) else: tagger = PerceptronTagger() return tagger
def _get_tagger(lang=None): if lang == 'rus': tagger = PerceptronTagger(False) ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE)) tagger.load(ap_russian_model_loc) else: tagger = PerceptronTagger() return tagger
def demo(corpus_root=None): """ The CHILDES corpus should be manually downloaded and saved to ``[NLTK_Data_Dir]/corpora/childes/`` """ if not corpus_root: from nltk.data import find corpus_root = find("corpora/childes/data-xml/Eng-USA/") try: childes = CHILDESCorpusReader(corpus_root, ".*.xml") # describe all corpus for file in childes.fileids()[:5]: corpus = "" corpus_id = "" for (key, value) in childes.corpus(file)[0].items(): if key == "Corpus": corpus = value if key == "Id": corpus_id = value print("Reading", corpus, corpus_id, " .....") print("words:", childes.words(file)[:7], "...") print( "words with replaced words:", childes.words(file, replace=True)[:7], " ...", ) print("words with pos tags:", childes.tagged_words(file)[:7], " ...") print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...") print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...") print("stemmed words:", childes.words(file, stem=True)[:7], " ...") print( "words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...", ) print("sentence:", childes.sents(file)[:2], " ...") for (participant, values) in childes.participants(file)[0].items(): for (key, value) in values.items(): print("\tparticipant", participant, key, ":", value) print("num of sent:", len(childes.sents(file))) print("num of morphemes:", len(childes.words(file, stem=True))) print("age:", childes.age(file)) print("age in month:", childes.age(file, month=True)) print("MLU:", childes.MLU(file)) print() except LookupError as e: print("""The CHILDES corpus, or the parts you need, should be manually downloaded from https://childes.talkbank.org/data-xml/ and saved at [NLTK_Data_Dir]/corpora/childes/ Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.: demo('/path/to/childes/data-xml/Eng-USA/") """)
def demo(corpus_root=None): """ The CHILDES corpus should be manually downloaded and saved to ``[NLTK_Data_Dir]/corpora/childes/`` """ if not corpus_root: from nltk.data import find corpus_root = find('corpora/childes/data-xml/Eng-USA/') try: childes = CHILDESCorpusReader(corpus_root, '.*.xml') # describe all corpus for file in childes.fileids()[:5]: corpus = '' corpus_id = '' for (key, value) in childes.corpus(file)[0].items(): if key == "Corpus": corpus = value if key == "Id": corpus_id = value print('Reading', corpus, corpus_id, ' .....') print("words:", childes.words(file)[:7], "...") print( "words with replaced words:", childes.words(file, replace=True)[:7], " ...", ) print("words with pos tags:", childes.tagged_words(file)[:7], " ...") print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...") print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...") print("stemmed words:", childes.words(file, stem=True)[:7], " ...") print( "words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...", ) print("sentence:", childes.sents(file)[:2], " ...") for (participant, values) in childes.participants(file)[0].items(): for (key, value) in values.items(): print("\tparticipant", participant, key, ":", value) print("num of sent:", len(childes.sents(file))) print("num of morphemes:", len(childes.words(file, stem=True))) print("age:", childes.age(file)) print("age in month:", childes.age(file, month=True)) print("MLU:", childes.MLU(file)) print() except LookupError as e: print( """The CHILDES corpus, or the parts you need, should be manually downloaded from https://childes.talkbank.org/data-xml/ and saved at [NLTK_Data_Dir]/corpora/childes/ Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.: demo('/path/to/childes/data-xml/Eng-USA/") """ )
def __init__(self): from nltk.data import find from nltk import download import os # Download the wordnet data only if it is not already downloaded wordnet_path = None if os.name == 'nt': wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data', 'corpora', 'wordnet.zip') else: wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data', 'corpora', 'wordnet.zip') try: if not os.path.isfile(wordnet_path): find('wordnet.zip') except LookupError: download('wordnet')
def test_vocabulary_original_mode(self): # The list of stems for this test was generated by taking the # Martin-blessed stemmer from # http://tartarus.org/martin/PorterStemmer/c.txt # and removing all the --DEPARTURE-- sections from it and # running it against Martin's test vocabulary. with closing( data.find('stemmers/porter_test/porter_original_output.txt'). open(encoding='utf-8')) as fp: self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()) self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, data.find('stemmers/porter_test/porter_original_output.txt').open( encoding='utf-8').read().splitlines())
def __init__(self): from nltk.data import find from nltk import download import os # Download the punkt data only if it is not already downloaded punkt_path = None if os.name == 'nt': punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data', 'tokenizers', 'punkt.zip') else: punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data', 'tokenizers', 'punkt.zip') try: if not os.path.isfile(punkt_path): find('punkt.zip') except LookupError: download('punkt')
def _get_tagger(lang=None): if lang == "rus": tagger = PerceptronTagger(False) ap_russian_model_loc = "file:" + str(find(RUS_PICKLE)) tagger.load(ap_russian_model_loc) elif lang == "eng": tagger = PerceptronTagger() else: tagger = PerceptronTagger() return tagger
def __init__(self, load=True): ''' :param load: Load the pickled model upon instantiation. ''' self.model = AveragedPerceptron() self.tagdict = {} self.classes = set() if load: AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE)) self.load(AP_MODEL_LOC)
def test_vocabulary_original_mode(self): # The list of stems for this test was generated by taking the # Martin-blessed stemmer from # http://tartarus.org/martin/PorterStemmer/c.txt # and removing all the --DEPARTURE-- sections from it and # running it against Martin's test vocabulary. with closing(data.find('stemmers/porter_test/porter_original_output.txt').open(encoding='utf-8')) as fp: self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines() ) self._test_against_expected_output( PorterStemmer.ORIGINAL_ALGORITHM, data.find('stemmers/porter_test/porter_original_output.txt') .open(encoding='utf-8') .read() .splitlines() )
def nltk_download_corpus(resource_path): """ Download the specified NLTK corpus file unless it has already been downloaded. Returns True if the corpus needed to be downloaded. """ from nltk.data import find from nltk import download from os.path import split, sep from zipfile import BadZipfile # Download the NLTK data only if it is not already downloaded _, corpus_name = split(resource_path) # From http://www.nltk.org/api/nltk.html # When using find() to locate a directory contained in a zipfile, # the resource name must end with the forward slash character. # Otherwise, find() will not locate the directory. # # Helps when resource_path=='sentiment/vader_lexicon'' if not resource_path.endswith(sep): resource_path = resource_path + sep downloaded = False try: find(resource_path) except LookupError: download(corpus_name) downloaded = True except BadZipfile: raise BadZipfile( 'The NLTK corpus file being opened is not a zipfile, ' 'or it has been corrupted and needs to be manually deleted.' ) return downloaded
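# Usage sketch for the nltk_download_corpus() variants above; the resource
# paths follow the usual NLTK data layout ('corpora/...', 'taggers/...'), and
# the boolean return value only reports whether a download was triggered.
for resource in ('corpora/stopwords', 'corpora/wordnet',
                 'taggers/averaged_perceptron_tagger'):
    if nltk_download_corpus(resource):
        print('downloaded %s' % resource)
    else:
        print('%s was already present' % resource)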
def _str2records(filename, rel): """ Read a file into memory and convert each relation clause into a list. """ recs = [] path = find("corpora/chat80/%s" % filename) for line in path.open(): if line.startswith(rel): line = re.sub(rel+r'\(', '', line) line = re.sub(r'\)\.$', '', line) line = line[:-1] record = line.split(',') recs.append(record) return recs
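# Hedged example for _str2records() above, assuming the chat80 corpus is
# installed and that its cities.pl file stores clauses of the form
# "city(athens,greece,1368)."; each such clause comes back as a list like
# ['athens', 'greece', '1368'].
for record in _str2records('cities.pl', 'city')[:3]:
    print(record)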
def syllable_pos_setup(self): """Sets up syllables and POS tagging""" en_list = ['en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS', 'en_US', 'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT', 'en_ZA', 'en_AU', 'en_GH', 'en_ZW', 'en_GB'] for lang in en_list: if not dictools.is_installed(lang): dictools.install(lang) self.cmu_dict = cmudict.dict() # sets up POS try: nltk.pos_tag(['test']) self.pos_tag = nltk.pos_tag except urllib2.URLError: PICKLE = "averaged_perceptron_tagger.pickle" AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE)) tagger = PerceptronTagger(load=False) tagger.load(AP_MODEL_LOC) self.pos_tag = tagger.tag self.tag_dict = {'NN': 'Noun', 'FW': 'Noun', 'JJ': 'Adjective', 'VB': 'Verb', 'IN': 'Preposition', 'CC': 'Conjunction', 'RP': 'Connector', 'TO': 'Connector', 'MD': 'Connector', 'RB': 'Adverb', 'WR': 'Wh-adverb', 'DT': 'DetPro', 'WD': 'DetPro', 'PD': 'DetPro', 'PR': 'DetPro', 'WP': 'DetPro', 'CD': 'Cardinal', 'EX': 'Existential there'} ## self.tag_dict={'NN':'Noun', 'JJ':'Adjective','RB':'Adverb','VB':'Verb', ## 'IN':'Preposition','PR':'Pronoun','CC':'Conjunction', ## 'RP':'Particle','WR':'Wh-adverb','DT':'Determiner', ## 'TO':'To','MD':'Modal Aux','CD':'Cardinal', 'PD':'Predeterminer', ## 'WD':'Wh-determiner', 'WP':'Wh-pronoun','EX':'Existential there'} # POS which are allowed to happen twice in a row self.pos_double = [] # ['Noun','Adjective'] # POS which can only occur sequentially # i.e. an Adverb must occur in fron of a verb self.pos_lead = {'Adverb': ['Verb'], 'Pronoun': ['Noun'], 'Adjective': ['Noun'], 'Preposition': ['Noun', 'Pronoun']} # POS which cannot occur sequentially # i.e. a preposition cannot come before a verb self.pos_restrict_lead = {'Preposition': 'Verb',} return
def __init__(self, filename='drt_glue.semtype'):
    try:
        f = open(data.find('grammars/%s' % filename))
    except LookupError:
        f = open(filename)
    lines = f.readlines()
    f.close()

    for line in lines:
        # example: 'verb : (\\x.(<word> x), ( subj -o f )) : [subj]'
        #                lambdacalc -^  linear logic -^
        line = line.strip()  # remove trailing newline
        if not len(line):
            continue  # skip empty lines
        if line[0] == '#':
            continue  # skip commented out lines

        parts = line.split(' : ')  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

        glue_formulas = []
        parenCount = 0
        tuple_start = 0
        tuple_comma = 0

        relationships = None

        for i in range(len(parts[1])):
            if parts[1][i] == '(':
                if parenCount == 0:  # if it's the first '(' of a tuple
                    tuple_start = i + 1  # then save the index
                parenCount += 1
            elif parts[1][i] == ')':
                parenCount -= 1
                if parenCount == 0:  # if it's the last ')' of a tuple
                    meaning_term = parts[1][tuple_start:tuple_comma]  # '\\x.(<word> x)'
                    glue_term = parts[1][tuple_comma + 1:i]  # '(v-r)'
                    glue_formulas.append([meaning_term, glue_term])  # add the GlueFormula to the list
                    if len(parts) > 2:
                        relationships = frozenset([
                            r.strip()
                            for r in parts[2][parts[2].index('[') + 1:parts[2].index(']')].split(',')
                        ])
            elif parts[1][i] == ',' or parts[1][i] == ':':
                if parenCount == 1:  # if it's a comma separating the parts of the tuple
                    tuple_comma = i  # then save the index
            elif parts[1][i] == '#':  # skip comments at the ends of lines
                if parenCount != 0:  # if the line hasn't parsed correctly so far
                    raise RuntimeError('Formula syntax is incorrect for entry %s' % line)
                break  # break to the next line

        if parts[0] in self:
            self[parts[0]][relationships] = glue_formulas
        else:
            self[parts[0]] = {relationships: glue_formulas}  # add the glue entry to the dictionary
def test_sentence_nist(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Read the NIST scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are on the fourth-to-last line of the file.
        # The first and last items on that line are the metric label and the
        # system name, so they are sliced off.
        mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace-tokenize the files.
            # Note: split() also strips the trailing newline.
            hypotheses = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_nist input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                nltk_nist = corpus_nist(references, hypotheses, i)
                # Check that the NIST score difference is less than 0.05.
                assert abs(mteval_nist - nltk_nist) < 0.05
def test_vocabulary_martin_mode(self): """Tests all words from the test vocabulary provided by M Porter The sample vocabulary and output were sourced from: http://tartarus.org/martin/PorterStemmer/voc.txt http://tartarus.org/martin/PorterStemmer/output.txt and are linked to from the Porter Stemmer algorithm's homepage at http://tartarus.org/martin/PorterStemmer/ """ with closing(data.find('stemmers/porter_test/porter_martin_output.txt').open(encoding='utf-8')) as fp: self._test_against_expected_output( PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines() )
def demo(): """This assumes the Python module bllipparser is installed.""" # download and install a basic unified parsing model (Wall Street Journal) # sudo python -m nltk.downloader bllip_wsj_no_aux from nltk.data import find model_dir = find('models/bllip_wsj_no_aux').path print('Loading BLLIP Parsing models...') # the easiest way to get started is to use a unified model bllip = BllipParser.from_unified_model_dir(model_dir) print('Done.') sentence1 = 'British left waffles on Falklands .'.split() sentence2 = 'I saw the man with the telescope .'.split() # this sentence is known to fail under the WSJ parsing model fail1 = '# ! ? : -'.split() for sentence in (sentence1, sentence2, fail1): print('Sentence: %r' % ' '.join(sentence)) try: tree = next(bllip.parse(sentence)) print(tree) except StopIteration: print("(parse failed)") # n-best parsing demo for i, parse in enumerate(bllip.parse(sentence1)): print('parse %d:\n%s' % (i, parse)) # using external POS tag constraints print( "forcing 'tree' to be 'NN':", next(bllip.tagged_parse([('A', None), ('tree', 'NN')])), ) print( "forcing 'A' to be 'DT' and 'tree' to be 'NNP':", next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])), ) # constraints don't have to make sense... (though on more complicated # sentences, they may cause the parse to fail) print( "forcing 'A' to be 'NNP':", next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])), )
def read_file(self, empty_first=True):
    if empty_first:
        self.clear()

    try:
        f = open(data.find("grammars/glue.semtype"))
    except LookupError:
        f = open("glue.semtype")
    lines = f.readlines()
    f.close()

    for line in lines:
        # example: 'n : (\\x.(<word> x), (v-r))'
        #              lambdacalc -^  linear logic -^
        line = line.strip()  # remove trailing newline
        if not len(line):
            continue  # skip empty lines
        if line[0] == "#":
            continue  # skip commented out lines

        parts = line.split(" : ", 1)  # ['n', '(\\x.(<word> x), (v-r))']

        glue_formulas = []
        parenCount = 0
        tuple_start = 0
        tuple_comma = 0

        for i in range(len(parts[1])):
            if parts[1][i] == "(":
                if parenCount == 0:  # if it's the first '(' of a tuple
                    tuple_start = i + 1  # then save the index
                parenCount += 1
            elif parts[1][i] == ")":
                parenCount -= 1
                if parenCount == 0:  # if it's the last ')' of a tuple
                    meaning_term = parts[1][tuple_start:tuple_comma]  # '\\x.(<word> x)'
                    glue_term = parts[1][tuple_comma + 1 : i]  # '(v-r)'
                    glue_formulas.append([meaning_term, glue_term])  # add the GlueFormula to the list
            elif parts[1][i] == "," or parts[1][i] == ":":
                if parenCount == 1:  # if it's a comma separating the parts of the tuple
                    tuple_comma = i  # then save the index
            elif parts[1][i] == "#":  # skip comments at the ends of lines
                if parenCount != 0:  # if the line hasn't parsed correctly so far
                    raise RuntimeError("Formula syntax is incorrect for entry %s" % line)
                break  # break to the next line

        self[parts[0]] = glue_formulas  # add the glue entry to the dictionary
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    try:
        import sqlite3
        path = find(dbname)
        connection = sqlite3.connect(path)
        # return ASCII strings if possible
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        return cur.execute(query)
    except ImportError:
        import warnings
        warnings.warn("To run this function, first install pysqlite, or else use Python 2.5 or later.")
        raise
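# Usage sketch for sql_query() above, against the city database installed by
# NLTK's 'city_database' package. The table and column names follow the
# nltk.sem.chat80 examples and should be treated as assumptions here.
rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Population FROM city_table WHERE Country = 'china'")
for row in rows:
    print(row)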
def second_lexicon(positive_seeds, negative_seeds):
    word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
    # Word2Vec.load_word2vec_format was removed in gensim 1.0; KeyedVectors
    # provides the same loader.
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
    positive_list = []
    negative_list = []
    for aword in model.vocab:
        score = 0
        for pseed in positive_seeds:
            score += model.similarity(aword, pseed)
        for nseed in negative_seeds:
            score -= model.similarity(aword, nseed)
        # The hard-coded 16.0 divisor assumes sixteen seed words in total.
        score = score / 16.0
        if score > 0.03:
            positive_list.append(aword)
        elif score < -0.03:
            negative_list.append(aword)
    return positive_list, negative_list
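# Hedged usage sketch for second_lexicon() above. The 16.0 divisor suggests
# sixteen seed words in total; the eight-plus-eight Turney-style seeds below
# are illustrative, not taken from the original source.
positive_seeds = ['good', 'nice', 'excellent', 'positive',
                  'fortunate', 'correct', 'superior', 'amazing']
negative_seeds = ['bad', 'nasty', 'poor', 'negative',
                  'unfortunate', 'wrong', 'inferior', 'awful']
positive_list, negative_list = second_lexicon(positive_seeds, negative_seeds)
print(len(positive_list), 'positive-leaning words')
print(len(negative_list), 'negative-leaning words')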