Example No. 1
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [
        find('corpora/ace_data/ace.dev'),
        find('corpora/ace_data/ace.heldout'),
        find('corpora/ace_data/bbn.dev'),
        find('corpora/ace_data/muc.dev')
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    out = open(outfilename, 'wb')
    pickle.dump(cp, out, -1)
    out.close()

    return cp
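A minimal follow-up sketch of how the pickled chunker written above might be loaded and reused. The path matches build_model('binary'), and the assumption that the parser accepts a list of (word, tag) pairs is taken from the cp.parse(correct.leaves()) call in the snippet itself; treat the rest as illustrative.

# Hedged sketch: reload the chunker saved by build_model('binary') and apply it
# to a freshly POS-tagged sentence (requires the 'punkt' and
# 'averaged_perceptron_tagger' NLTK data packages).
import pickle
import nltk

with open('/tmp/ne_chunker_binary.pickle', 'rb') as infile:
    chunker = pickle.load(infile)

tokens = nltk.word_tokenize('Alice moved to Boston last year .')
print(chunker.parse(nltk.pos_tag(tokens)))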
Example No. 2
    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are in the second-from-last line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() also strips surrounding whitespace.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the BLEU scores difference is less than 0.005 .
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
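For reference, the corpus_bleu interface exercised above can be tried on toy data without the WMT15 files; a minimal sketch:

# Minimal corpus_bleu call with the same nesting as above: one hypothesis per
# segment and, for each segment, a list of tokenized reference translations.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
chencherry = SmoothingFunction()
print(corpus_bleu(references, hypotheses, weights=(0.5, 0.5),
                  smoothing_function=chencherry.method3))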
Example No. 3
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split, sep
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    if not resource_path.endswith(sep):
        resource_path = resource_path + sep

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.')

    return downloaded
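Illustrative calls for the helper above; the resource paths are examples, not a fixed list, and assume the standard NLTK package names.

# Returns True only when a download was actually triggered.
nltk_download_corpus('corpora/stopwords')
nltk_download_corpus('sentiment/vader_lexicon')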
Example No. 4
    def test_sentence_nist(self):
        ref_file = find("models/wmt15_eval/ref.ru")
        hyp_file = find("models/wmt15_eval/google.ru")
        mteval_output_file = find("models/wmt15_eval/mteval-13a.output")

        # Reads the NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file) as mteval_fin:
            # The numbers are in the fourth-from-last line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_nist_scores = map(float,
                                     mteval_fin.readlines()[-4].split()[1:-1])

        with open(ref_file, encoding="utf8") as ref_fin:
            with open(hyp_file, encoding="utf8") as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() also strips surrounding whitespace.
                hypotheses = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                    nltk_nist = corpus_nist(references, hypotheses, i)
                    # Check that the NIST scores difference is less than 0.05.
                    assert abs(mteval_nist - nltk_nist) < 0.05
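The corpus_nist call used above follows the same reference/hypothesis nesting as corpus_bleu; a toy sketch, independent of the WMT15 files:

# Minimal corpus_nist call on toy data (n is the maximum ngram order).
from nltk.translate.nist_score import corpus_nist

references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_nist(references, hypotheses, n=2))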
Example No. 5
def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
    print(f"Saving chunker to {outfilename}...")

    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example No. 6
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [
        find('corpora/ace_data/ace.dev'),
        find('corpora/ace_data/ace.heldout'),
        find('corpora/ace_data/bbn.dev'),
        find('corpora/ace_data/muc.dev')
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
    print('Saving chunker to {0}...'.format(outfilename))

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example No. 7
def demo():
    from itertools import islice

#    zip_path = find('corpora/toolbox.zip')
#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
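If only the lexicon entries are needed, the Toolbox corpus reader bundled with NLTK offers a higher-level route than parsing with ToolboxData directly; a hedged sketch, assuming the 'toolbox' data package is installed and that the first entry carries a 'ps' (part-of-speech) field:

# Read rotokas.dic through the corpus reader; each entry is a headword plus
# a list of (marker, value) fields.
from nltk.corpus import toolbox

entries = toolbox.entries('rotokas.dic')
headword, fields = entries[0]
print(headword, dict(fields).get('ps'))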
Example No. 8
File: toolbox.py Project: sp00/nltk
def demo():
    from itertools import islice

    #    zip_path = find('corpora/toolbox.zip')
    #    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    #    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
Example No. 9
def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)

    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp
Example No. 10
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3: cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)

    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example No. 11
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
Example No. 12
def demo():
    from itertools import islice

    #    zip_path = find('corpora/toolbox.zip')
    #    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find("corpora/toolbox/rotokas.dic")
    lexicon = ToolboxData(file_path).parse()
    print("first field in fourth record:")
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print("\nfields in sequential order:")
    for field in islice(lexicon.find("record"), 10):
        print(field.tag, field.text)

    print("\nlx fields:")
    for field in islice(lexicon.findall("record/lx"), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find("corpora/toolbox/MDF/MDF_AltH.typ")
    settings.open(file_path)
    #    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding="cp1252")
    print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode("utf8"))
Example No. 13
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
Example No. 14
def load_model(word2vec_modelfname=None, is_binary=False):
    if word2vec_modelfname == 'GoogleNews-vectors-negative300.bin.gz':
        try:
            sem_model = gensim.models.KeyedVectors.load_word2vec_format(
                word2vec_modelfname, binary=True)
            print(
                "Google-news model of %d words, each represented by %d-dimensional vectors,    successfully loaded."
                % (len(sem_model.vocab), sem_model.vector_size))
            return sem_model
        except FileNotFoundError:
            sys.stderr.write(
                "Model file with name %s not found in directory. Please download it from %s\n(direct link to google drive: %s)."
                %
                (word2vec_modelfname, "https://code.google.com/p/word2vec/",
                 "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing"
                 ))
            return None

    if word2vec_modelfname is None:
        try:
            word2vec_modelfname = str(
                find('models/word2vec_sample/pruned.word2vec.txt'))
            is_binary = False
        except LookupError:
            nltk.download('word2vec_sample')
            word2vec_modelfname = str(
                find('models/word2vec_sample/pruned.word2vec.txt'))

    sem_model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_modelfname, binary=is_binary)
    print(
        "Semantic model of %d words, each represented by %d-dimensional vectors, successfully loaded."
        % (len(sem_model.vocab), sem_model.vector_size))
    return sem_model
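A short usage sketch for the loader above. Note that sem_model.vocab only exists on gensim 3.x KeyedVectors (gensim 4 renamed it to key_to_index), so the print statements in load_model assume an older gensim; the query below works on both:

# Hedged usage: fall back to NLTK's pruned word2vec sample and query it.
model = load_model()
print(model.most_similar(positive=['university'], topn=3))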
Example No. 15
    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are in the second-from-last line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() also strips surrounding whitespace.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()],ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the BLEU scores difference is less than 0.005 .
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
Example No. 16
    def __init__(self):
        from nltk.data import find
        from nltk import download

        try:
            find('wordnet.zip')
        except LookupError:
            download('wordnet')
Example No. 17
    def __init__(self):
        from nltk.data import find
        from nltk import download

        # Download the punkt data only if it is not already downloaded
        try:
            find('punkt.zip')
        except LookupError:
            download('punkt')
Example No. 18
    def __init__(self):
        from nltk.data import find
        from nltk import download

        # Download the punkt data only if it is not already downloaded
        try:
            find('punkt.zip')
        except LookupError:
            download('punkt')
Example No. 19
    def __init__(self):
        from nltk.data import find
        from nltk import download

        # Download the stopwords data only if it is not already downloaded
        try:
            find('stopwords.zip')
        except LookupError:
            download('stopwords')
Example No. 20
    def __init__(self):
        """
        Contains various synset related functions.
        """
        try:
            data.find(os.path.join("corpora", "wordnet"))
        except LookupError:
            download("wordnet")

        self.API = ImageNetAPI()
Example No. 21
def require(corpora: list = []):
    """Download the required NLTK corpus if not found.

    Keyword Arguments:
        corpora {list} -- The identifier or name of NLTK corpus (default: {[]})
    """
    for corpus in corpora:
        try:
            find(corpus)
        except LookupError:
            download(corpus)
Example No. 22
def create_app(config, debug=False, testing=False, config_overrides=None):
    app = Flask(__name__)
    app.config.from_object(config)

    app.debug = debug
    app.testing = testing

    if config_overrides:
        app.config.update(config_overrides)

    # Configure logging
    if not app.testing:
        logging.basicConfig(level=logging.INFO)

    # Setup the data model.
    with app.app_context():
        model = get_model()
        model.init_app(app)

    # Register the Bookshelf CRUD blueprint.
    from .crud import crud
    app.register_blueprint(crud, url_prefix='/wcloud')

    # Add a default root route.
    @app.route("/")
    def index():
        return redirect(url_for('crud.wcloud'))

    # Add an error handler. This is useful for debugging the live application,
    # however, you should disable the output of the exception for production
    # applications.
    @app.errorhandler(500)
    def server_error(e):
        return """
        An internal error occurred: <pre>{}</pre>
        See logs for full stacktrace.
        """.format(e), 500

    # Download the NLTK corpus data the first time the app runs, if needed
    try:
        data.find('tokenizers/tokenize')
    except LookupError:
        # Hack to work around an SSL certificate issue when downloading;
        # obviously not the right thing to do, just to get things working for now
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context
        
        download('popular')

    return app
Example No. 23
    def setup_nltk(self, **kw):
        import nltk
        from nltk.data import find

        tagger = "averaged_perceptron_tagger"

        try:
            find("taggers/%s" % tagger)
        except LookupError:
            click.echo("Downloading NTLK data (~2MB)...")
            nltk.download(tagger)
            return True

        return False
Example No. 24
    def __init__(self, filename='drt_glue.semtype'):
        try:
            f = open(data.find('grammars/%s' % filename))
        except LookupError:
            f = open(filename)
        lines = f.readlines()
        f.close()

        for line in lines:  # example: 'verb : (\\x.(<word> x), ( subj -o f )) : [subj]'
            #             lambdacalc -^  linear logic -^
            line = line.strip()  # remove trailing newline
            if not len(line): continue  # skip empty lines
            if line[0] == '#': continue  # skip commented out lines

            parts = line.split(
                ' : ')  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            parenCount = 0
            tuple_start = 0
            tuple_comma = 0

            relationships = None

            for i in range(len(parts[1])):
                if parts[1][i] == '(':
                    if parenCount == 0:  # if it's the first '(' of a tuple
                        tuple_start = i + 1  # then save the index
                    parenCount += 1
                elif parts[1][i] == ')':
                    parenCount -= 1
                    if parenCount == 0:  # if it's the last ')' of a tuple
                        meaning_term = parts[1][
                            tuple_start:tuple_comma]  # '\\x.(<word> x)'
                        glue_term = parts[1][tuple_comma + 1:i]  # '(v-r)'
                        glue_formulas.append(
                            [meaning_term,
                             glue_term])  # add the GlueFormula to the list

                        if len(parts) > 2:
                            relationships = frozenset([
                                r.strip() for r in parts[2]
                                [parts[2].index('[') +
                                 1:parts[2].index(']')].split(',')
                            ])
                elif parts[1][i] == ',' or parts[1][i] == ':':
                    if parenCount == 1:  # if it's a comma separating the parts of the tuple
                        tuple_comma = i  # then save the index
                elif parts[1][i] == '#':  # skip comments at the ends of lines
                    if parenCount != 0:  # if the line hasn't parsed correctly so far
                        raise RuntimeError(
                            'Formula syntax is incorrect for entry %s' % line)
                    break  # break to the next line

            if parts[0] in self:
                self[parts[0]][relationships] = glue_formulas
            else:
                self[parts[0]] = {
                    relationships: glue_formulas
                }  # add the glue entry to the dictionary
Example No. 25
def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key,value) in childes.corpus(file)[0].items():
            if key == "Corpus": corpus = value
            if key == "Id": corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')
        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
Example No. 26
def construct_pos_list(word2vec_sample_path):
    nltk.download('word2vec_sample')
    word2vec_sample = str(find(word2vec_sample_path))
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample,
                                                            binary=False)
    position_list = [
        'left', 'right', 'above', 'below', 'inside', 'surrounding'
    ]
    x = []
    y = []
    posi_lists = []
    for i in range(len(position_list)):
        tmp_lists = []
        numberofsyn = len(wn.synsets(position_list[i]))
        for j in range(numberofsyn):
            for w in wn.synsets(position_list[i])[j].lemma_names():
                if '_' not in w and w in model:
                    tmp_lists.append(w)
                    x.append(model[w])
                    y.append(i)
        posi_lists.append(list(set(tmp_lists)))

    pca = PCA(n_components=10)
    pca.fit(x)
    x = pca.transform(x)
    x = np.array(x)
    y = np.array(y)
    clf = svm.SVC()
    clf.fit(x, y)
    return posi_lists, x, y, pca, clf, model
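A one-line usage sketch; the sample path is the same one passed to find() in the other word2vec examples on this page.

# Hypothetical call: builds the position-word lists and fits the PCA + SVM.
posi_lists, x, y, pca, clf, model = construct_pos_list(
    'models/word2vec_sample/pruned.word2vec.txt')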
Example No. 27
File: chat80.py Project: sp00/nltk
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    try:
        import sqlite3
        path = find(dbname)
        connection = sqlite3.connect(path)
        # return ASCII strings if possible
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        return cur.execute(query)
    except ImportError:
        import warnings
        warnings.warn(
            "To run this function, first install pysqlite, or else use Python 2.5 or later."
        )
        raise
    except ValueError:
        import warnings
        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." %
            dbname)
        raise
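A hedged usage sketch for sql_query(), assuming the NLTK 'city_database' package (the Chat-80 city table) is installed under nltk_data:

# Query the Chat-80 city database shipped as 'city_database' in nltk_data.
rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Country FROM city_table WHERE Country = 'china'")
for row in rows:
    print(row)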
Example No. 28
    def _vocabulary(self):
        return (
            data.find('stemmers/porter_test/porter_vocabulary.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        )
Example No. 29
def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus": corpus = value
            if key == "Id": corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')
        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
Example No. 30
    def _vocabulary(self):
        with closing(
            data.find('stemmers/porter_test/porter_vocabulary.txt').open(
                encoding='utf-8'
            )
        ) as fp:
            return fp.read().splitlines()
Example No. 31
    def __init__(self, papers, presentations):
        self.papers = papers
        self.presentations = presentations
        self.train_features, self.vectorizer = self.createVectorizer(
            papers, presentations)
        model_dir = find('models/bllip_wsj_no_aux').path
        self.parser = RerankingParser.from_unified_model_dir(model_dir)
Example No. 32
def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    #print content

    if Verbose:
        echo2("Incoming content is "+content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    #date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    #names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
Example No. 33
    def _vocabulary(self):
        with closing(
            data.find('stemmers/porter_test/porter_vocabulary.txt').open(
                encoding='utf-8'
            )
        ) as fp:
            return fp.read().splitlines()
Example No. 34
def load_model(word2vec_modelfname=None, is_binary=False):
    if word2vec_modelfname is None:
        try:
            word2vec_modelfname = str(
                find('models/word2vec_sample/pruned.word2vec.txt'))
            is_binary = False
        except LookupError:
            nltk.download('word2vec_sample')
            word2vec_modelfname = str(
                find('models/word2vec_sample/pruned.word2vec.txt'))

    model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_modelfname, binary=is_binary)
    print(
        "Semantic model of %d words, each represented by %d-dimensional vectors, successfully loaded."
        % (len(model.vocab), model.vector_size))
    return model
Example No. 35
    def get_instance(cls):
        if not cls._instance:
            model_dir = find('models/bllip_wsj_no_aux').path
            bllipParser = BllipParser.from_unified_model_dir(model_dir)
            Parser._instance = Parser()
            Parser._instance._initialize(bllipParser)

        return Parser._instance
Example No. 36
    def test_vocabulary_nltk_mode(self):
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            data.find('stemmers/porter_test/porter_nltk_output.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        )
Example No. 37
    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            AP_MODEL_LOC = "file:" + str(
                find("taggers/averaged_perceptron_tagger/" + PICKLE))
            self.load(AP_MODEL_LOC)
Example No. 38
def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
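A brief usage sketch for the helper above; the default branch loads NLTK's bundled averaged perceptron model, while the 'rus' branch depends on the module-level RUS_PICKLE path.

# Requires the 'averaged_perceptron_tagger' data package.
tagger = _get_tagger()
print(tagger.tag('This is a test .'.split()))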
Example No. 39
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
Example No. 40
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ""
            corpus_id = ""
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:",
                  childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):",
                  childes.words(file, speaker="MOT")[:7], "...")
            print("words (only CHI):",
                  childes.words(file, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print("""The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """)
Example No. 41
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
Example No. 42
    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the wordnet data only if it is not already downloaded
        wordnet_path = None
        if os.name == 'nt':
            wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        else:
            wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        try:
            if not os.path.isfile(wordnet_path):
                find('wordnet.zip')
        except LookupError:
            download('wordnet')
Example No. 43
    def test_vocabulary_original_mode(self):
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.

        with closing(
                data.find('stemmers/porter_test/porter_original_output.txt').
                open(encoding='utf-8')) as fp:
            self._test_against_expected_output(
                PorterStemmer.ORIGINAL_ALGORITHM,
                fp.read().splitlines())

        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            data.find('stemmers/porter_test/porter_original_output.txt').open(
                encoding='utf-8').read().splitlines())
Example No. 44
    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the punkt data only if it is not already downloaded
        punkt_path = None
        if os.name == 'nt':
            punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                      'tokenizers', 'punkt.zip')
        else:
            punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                      'tokenizers', 'punkt.zip')
        try:
            if not os.path.isfile(punkt_path):
                find('punkt.zip')
        except LookupError:
            download('punkt')
Example No. 45
    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the punkt data only if it is not already downloaded
        punkt_path = None
        if os.name == 'nt':
            punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        else:
            punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                                'tokenizers', 'punkt.zip')
        try:
            if not os.path.isfile(punkt_path):
                find('punkt.zip')
        except LookupError:
            download('punkt')
Example No. 46
    def __init__(self):
        from nltk.data import find
        from nltk import download
        import os

        # Download the wordnet data only if it is not already downloaded
        wordnet_path = None
        if os.name == 'nt':
            wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        else:
            wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                        'corpora', 'wordnet.zip')
        try:
            if not os.path.isfile(wordnet_path):
                find('wordnet.zip')
        except LookupError:
            download('wordnet')
Example No. 47
def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == "eng":
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
Example No. 48
    def __init__(self, load=True):
        '''
        :param load: Load the pickled model upon instantiation.
        '''
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
            self.load(AP_MODEL_LOC)
Example No. 49
    def test_vocabulary_original_mode(self):
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.

        with closing(data.find('stemmers/porter_test/porter_original_output.txt').open(encoding='utf-8')) as fp:
            self._test_against_expected_output(
                PorterStemmer.ORIGINAL_ALGORITHM,
                fp.read().splitlines()
            )

        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            data.find('stemmers/porter_test/porter_original_output.txt')
                .open(encoding='utf-8')
                .read()
                .splitlines()
        )
Example No. 50
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split, sep
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    # From http://www.nltk.org/api/nltk.html
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    #
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith(sep):
        resource_path = resource_path + sep

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.'
        )

    return downloaded
Example No. 51
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """ 
    recs = []
    path = find("corpora/chat80/%s" % filename)
    for line in path.open():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            line = line[:-1]
            record = line.split(',')
            recs.append(record)
    return recs
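A hedged usage sketch; the filename and relation name below follow the Chat-80 data layout (cities.pl holding city(...) clauses), which should be treated as an assumption.

# Hypothetical call: turn city/3 clauses into lists of field strings.
records = _str2records('cities.pl', 'city')
print(records[:3])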
Example No. 52
    def syllable_pos_setup(self):
        """Sets up syllables and POS tagging"""
        en_list = ['en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS', 'en_US',
                   'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT', 'en_ZA', 'en_AU',
                   'en_GH', 'en_ZW', 'en_GB']

        for lang in en_list:
            if not dictools.is_installed(lang): dictools.install(lang)

        self.cmu_dict = cmudict.dict()

        # sets up POS
        try:
            nltk.pos_tag(['test'])
            self.pos_tag = nltk.pos_tag
        except urllib2.URLError:
            PICKLE = "averaged_perceptron_tagger.pickle"
            AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
            tagger = PerceptronTagger(load=False)
            tagger.load(AP_MODEL_LOC)
            self.pos_tag = tagger.tag

        self.tag_dict = {'NN': 'Noun', 'FW': 'Noun', 'JJ': 'Adjective', 'VB': 'Verb',
                         'IN': 'Preposition', 'CC': 'Conjunction',
                         'RP': 'Connector', 'TO': 'Connector', 'MD': 'Connector',
                         'RB': 'Adverb', 'WR': 'Wh-adverb',
                         'DT': 'DetPro', 'WD': 'DetPro', 'PD': 'DetPro', 'PR': 'DetPro', 'WP': 'DetPro',
                         'CD': 'Cardinal',
                         'EX': 'Existential there'}

        ##        self.tag_dict={'NN':'Noun', 'JJ':'Adjective','RB':'Adverb','VB':'Verb',
        ##          'IN':'Preposition','PR':'Pronoun','CC':'Conjunction',
        ##          'RP':'Particle','WR':'Wh-adverb','DT':'Determiner',
        ##          'TO':'To','MD':'Modal Aux','CD':'Cardinal', 'PD':'Predeterminer',
        ##          'WD':'Wh-determiner', 'WP':'Wh-pronoun','EX':'Existential there'}

        # POS which are allowed to happen twice in a row
        self.pos_double = []  # ['Noun','Adjective']

        # POS which can only occur sequentially
        # i.e. an Adverb must occur in fron of a verb
        self.pos_lead = {'Adverb': ['Verb'], 'Pronoun': ['Noun'], 'Adjective': ['Noun'],
                         'Preposition': ['Noun', 'Pronoun']}

        # POS which cannot occur sequentially
        # i.e. a preposition cannot come before a verb
        self.pos_restrict_lead = {'Preposition': 'Verb',}

        return
Example No. 53
    def __init__(self, filename='drt_glue.semtype'):
        try:
            f = open(data.find('grammars/%s' % filename))
        except LookupError:
            f = open(filename)
        lines = f.readlines()
        f.close()

        for line in lines:                          # example: 'verb : (\\x.(<word> x), ( subj -o f )) : [subj]'
                                                    #             lambdacalc -^  linear logic -^
            line = line.strip()                     # remove trailing newline
            if not len(line): continue              # skip empty lines
            if line[0] == '#': continue             # skip commented out lines

            parts = line.split(' : ')               # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            parenCount = 0
            tuple_start = 0
            tuple_comma = 0
            
            relationships = None
            
            for i in range(len(parts[1])):
                if parts[1][i] == '(':
                    if parenCount == 0:             # if it's the first '(' of a tuple
                        tuple_start = i+1           # then save the index
                    parenCount += 1
                elif parts[1][i] == ')':
                    parenCount -= 1
                    if parenCount == 0:             # if it's the last ')' of a tuple
                        meaning_term =  parts[1][tuple_start:tuple_comma]   # '\\x.(<word> x)'
                        glue_term =     parts[1][tuple_comma+1:i]           # '(v-r)'
                        glue_formulas.append([meaning_term, glue_term])     # add the GlueFormula to the list
                        
                        if len(parts) > 2:
                            relationships  = frozenset([r.strip() for r in parts[2][parts[2].index('[')+1:parts[2].index(']')].split(',')])
                elif parts[1][i] == ',' or parts[1][i] == ':':
                    if parenCount == 1:             # if it's a comma separating the parts of the tuple
                        tuple_comma = i             # then save the index
                elif parts[1][i] == '#':            # skip comments at the ends of lines
                    if parenCount != 0:             # if the line hasn't parsed correctly so far
                        raise RuntimeError('Formula syntax is incorrect for entry %s' % line)
                    break                           # break to the next line
            
            if parts[0] in self:
                self[parts[0]][relationships] = glue_formulas
            else:
                self[parts[0]] = {relationships: glue_formulas} # add the glue entry to the dictionary
Example No. 54
    def test_sentence_nist(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are in the fourth-from-last line of the file.
            # The first and 2nd item in the list are the score and system names.
            mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() also strips surrounding whitespace.
                hypotheses = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                    nltk_nist = corpus_nist(references, hypotheses, i)
                    # Check that the NIST scores difference is less than 0.05.
                    assert abs(mteval_nist - nltk_nist) < 0.05
Example No. 55
    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from:
            http://tartarus.org/martin/PorterStemmer/voc.txt
            http://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at
            http://tartarus.org/martin/PorterStemmer/
        """
        with closing(data.find('stemmers/porter_test/porter_martin_output.txt').open(encoding='utf-8')) as fp:
            self._test_against_expected_output(
                PorterStemmer.MARTIN_EXTENSIONS,
                fp.read().splitlines()
            )
Example No. 56
def demo():
    """This assumes the Python module bllipparser is installed."""

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    from nltk.data import find

    model_dir = find('models/bllip_wsj_no_aux').path

    print('Loading BLLIP Parsing models...')
    # the easiest way to get started is to use a unified model
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')

    sentence1 = 'British left waffles on Falklands .'.split()
    sentence2 = 'I saw the man with the telescope .'.split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = '# ! ? : -'.split()
    for sentence in (sentence1, sentence2, fail1):
        print('Sentence: %r' % ' '.join(sentence))
        try:
            tree = next(bllip.parse(sentence))
            print(tree)
        except StopIteration:
            print("(parse failed)")

    # n-best parsing demo
    for i, parse in enumerate(bllip.parse(sentence1)):
        print('parse %d:\n%s' % (i, parse))

    # using external POS tag constraints
    print(
        "forcing 'tree' to be 'NN':",
        next(bllip.tagged_parse([('A', None), ('tree', 'NN')])),
    )
    print(
        "forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
        next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])),
    )
    # constraints don't have to make sense... (though on more complicated
    # sentences, they may cause the parse to fail)
    print(
        "forcing 'A' to be 'NNP':",
        next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])),
    )
Example No. 57
    def read_file(self, empty_first=True):
        if empty_first:
            self.clear()

        try:
            f = open(data.find("grammars/glue.semtype"))
        except LookupError:
            f = open("glue.semtype")
        lines = f.readlines()
        f.close()

        for line in lines:  # example: 'n : (\\x.(<word> x), (v-r))'
            #     lambdacalc -^  linear logic -^
            line = line.strip()  # remove trailing newline
            if not len(line):
                continue  # skip empty lines
            if line[0] == "#":
                continue  # skip commented out lines

            parts = line.split(" : ", 1)  # ['n', '(\\x.(<word> x), (v-r))']

            glue_formulas = []
            parenCount = 0
            tuple_start = 0
            tuple_comma = 0
            for i in range(len(parts[1])):
                if parts[1][i] == "(":
                    if parenCount == 0:  # if it's the first '(' of a tuple
                        tuple_start = i + 1  # then save the index
                    parenCount += 1
                elif parts[1][i] == ")":
                    parenCount -= 1
                    if parenCount == 0:  # if it's the last ')' of a tuple
                        meaning_term = parts[1][tuple_start:tuple_comma]  # '\\x.(<word> x)'
                        glue_term = parts[1][tuple_comma + 1 : i]  # '(v-r)'
                        glue_formulas.append([meaning_term, glue_term])  # add the GlueFormula to the list
                elif parts[1][i] == "," or parts[1][i] == ":":
                    if parenCount == 1:  # if it's a comma separating the parts of the tuple
                        tuple_comma = i  # then save the index
                elif parts[1][i] == "#":  # skip comments at the ends of lines
                    if parenCount != 0:  # if the line hasn't parsed correctly so far
                        raise RuntimeError("Formula syntax is incorrect for entry %s" % line)
                    break  # break to the next line
            self[parts[0]] = glue_formulas  # add the glue entry to the dictionary
Example No. 58
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    try:
        import sqlite3
        path = find(dbname)
        connection =  sqlite3.connect(path)
        # return ASCII strings if possible
        connection.text_factory = sqlite3.OptimizedUnicode
        cur = connection.cursor()
        return cur.execute(query)
    except ImportError:
        import warnings
        warnings.warn("To run this function, first install pysqlite, or else use Python 2.5 or later.")
        raise
Example No. 59
def second_lexicon(positive_seeds,negative_seeds):

    word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
    model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)
    positive_list=[]
    negative_list=[]

    for aword in model.vocab:
        score=0
        for pseed in positive_seeds:
            score+=model.similarity(aword, pseed)
        for nseed in negative_seeds:
            score-=model.similarity(aword,nseed)

        score=score/16.0
        if score>0.03:
            positive_list.append(aword)
        elif score<-0.03:
            negative_list.append(aword)

    return positive_list,negative_list
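A hedged usage sketch; the seed lists are purely illustrative, and the hard-coded division by 16.0 suggests the function expects sixteen seed words in total.

# Hypothetical seed lists (8 positive + 8 negative to match the /16.0 above).
positive_seeds = ['good', 'great', 'excellent', 'happy',
                  'positive', 'fortunate', 'correct', 'superior']
negative_seeds = ['bad', 'awful', 'poor', 'sad',
                  'negative', 'unfortunate', 'wrong', 'inferior']
pos_words, neg_words = second_lexicon(positive_seeds, negative_seeds)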